From 105a644defa6e868ddac3c7f47c8935e9c561e15 Mon Sep 17 00:00:00 2001 From: VinPPP Date: Thu, 11 Apr 2024 05:25:19 +0000 Subject: [PATCH] deploy: 7b7f942af1ae56f816bee4f5e57e1f0c5f193e87 --- .nojekyll | 0 cache.json | 1 + favicon.ico | Bin 0 -> 15086 bytes index.css | 355 + index.html | 43523 ++++++++++++++++++++++++++++++++++++++++++++++++++ index.js | 39 + 6 files changed, 43918 insertions(+) create mode 100644 .nojekyll create mode 100644 cache.json create mode 100644 favicon.ico create mode 100644 index.css create mode 100644 index.html create mode 100644 index.js diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 0000000..e69de29 diff --git a/cache.json b/cache.json new file mode 100644 index 0000000..401e90d --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2024-04-03T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2404.02903v1","updated":"2024-04-03T17:59:28Z","published":"2024-04-03T17:59:28Z","title":"LidarDM: Generative LiDAR Simulation in a Generated World","summary":" We present LidarDM, a novel LiDAR generative model capable of producing\nrealistic, layout-aware, physically plausible, and temporally coherent LiDAR\nvideos. LidarDM stands out with two unprecedented capabilities in LiDAR\ngenerative modeling: (i) LiDAR generation guided by driving scenarios, offering\nsignificant potential for autonomous driving simulations, and (ii) 4D LiDAR\npoint cloud generation, enabling the creation of realistic and temporally\ncoherent sequences. At the heart of our model is a novel integrated 4D world\ngeneration framework. Specifically, we employ latent diffusion models to\ngenerate the 3D scene, combine it with dynamic actors to form the underlying 4D\nworld, and subsequently produce realistic sensory observations within this\nvirtual environment. Our experiments indicate that our approach outperforms\ncompeting algorithms in realism, temporal coherency, and layout consistency. We\nadditionally show that LidarDM can be used as a generative world model\nsimulator for training and testing perception models.\n","authors":["Vlas Zyrianov","Henry Che","Zhijian Liu","Shenlong Wang"],"pdf_url":"https://arxiv.org/pdf/2404.02903v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02887v1","updated":"2024-04-03T17:42:22Z","published":"2024-04-03T17:42:22Z","title":"Learning Quadrupedal Locomotion via Differentiable Simulation","summary":" The emergence of differentiable simulators enabling analytic gradient\ncomputation has motivated a new wave of learning algorithms that hold the\npotential to significantly increase sample efficiency over traditional\nReinforcement Learning (RL) methods. While recent research has demonstrated\nperformance gains in scenarios with comparatively smooth dynamics and, thus,\nsmooth optimization landscapes, research on leveraging differentiable\nsimulators for contact-rich scenarios, such as legged locomotion, is scarce.\nThis may be attributed to the discontinuous nature of contact, which introduces\nseveral challenges to optimizing with analytic gradients. The purpose of this\npaper is to determine if analytic gradients can be beneficial even in the face\nof contact. Our investigation focuses on the effects of different soft and hard\ncontact models on the learning process, examining optimization challenges\nthrough the lens of contact simulation. 
We demonstrate the viability of\nemploying analytic gradients to learn physically plausible locomotion skills\nwith a quadrupedal robot using Short-Horizon Actor-Critic (SHAC), a learning\nalgorithm leveraging analytic gradients, and draw a comparison to a\nstate-of-the-art RL algorithm, Proximal Policy Optimization (PPO), to\nunderstand the benefits of analytic gradients.\n","authors":["Clemens Schwarke","Victor Klemm","Jesus Tordesillas","Jean-Pierre Sleiman","Marco Hutter"],"pdf_url":"https://arxiv.org/pdf/2404.02887v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.04999v2","updated":"2024-04-03T17:13:29Z","published":"2023-11-08T20:05:21Z","title":"Implicit Neural Representations for Breathing-compensated Volume\n Reconstruction in Robotic Ultrasound","summary":" Ultrasound (US) imaging is widely used in diagnosing and staging abdominal\ndiseases due to its lack of non-ionizing radiation and prevalent availability.\nHowever, significant inter-operator variability and inconsistent image\nacquisition hinder the widespread adoption of extensive screening programs.\nRobotic ultrasound systems have emerged as a promising solution, offering\nstandardized acquisition protocols and the possibility of automated\nacquisition. Additionally, these systems enable access to 3D data via robotic\ntracking, enhancing volumetric reconstruction for improved ultrasound\ninterpretation and precise disease diagnosis. However, the interpretability of\n3D US reconstruction of abdominal images can be affected by the patient's\nbreathing motion. This study introduces a method to compensate for breathing\nmotion in 3D US compounding by leveraging implicit neural representations. Our\napproach employs a robotic ultrasound system for automated screenings. To\ndemonstrate the method's effectiveness, we evaluate our proposed method for the\ndiagnosis and monitoring of abdominal aorta aneurysms as a representative use\ncase. Our experiments demonstrate that our proposed pipeline facilitates robust\nautomated robotic acquisition, mitigating artifacts from breathing motion, and\nyields smoother 3D reconstructions for enhanced screening and medical\ndiagnosis.\n","authors":["Yordanka Velikova","Mohammad Farid Azampour","Walter Simson","Marco Esposito","Nassir Navab"],"pdf_url":"https://arxiv.org/pdf/2311.04999v2.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2404.01781v2","updated":"2024-04-03T16:13:10Z","published":"2024-04-02T09:44:38Z","title":"An evaluation of CFEAR Radar Odometry","summary":" This article describes the method CFEAR Radar odometry, submitted to a\ncompetition at the Radar in Robotics workshop, ICRA 20241. CFEAR is an\nefficient and accurate method for spinning 2D radar odometry that generalizes\nwell across environments. This article presents an overview of the odometry\npipeline with new experiments on the public Boreas dataset. We show that a\nreal-time capable configuration of CFEAR - with its original parameter set -\nyields surprisingly low drift in the Boreas dataset. Additionally, we discuss\nan improved implementation and solving strategy that enables the most accurate\nconfiguration to run in real-time with improved robustness, reaching as low as\n0.61% translation drift at a frame rate of 68 Hz. 
A recent release of the\nsource code is available to the community\nhttps://github.com/dan11003/CFEAR_Radarodometry_code_public, and we publish the\nevaluation from this article on https://github.com/dan11003/cfear_2024_workshop\n","authors":["Daniel Adolfsson","Maximilian Hilger"],"pdf_url":"https://arxiv.org/pdf/2404.01781v2.pdf","comment":"Uppdated with results from test set in Boreas"},{"id":"http://arxiv.org/abs/2404.02817v1","updated":"2024-04-03T15:38:36Z","published":"2024-04-03T15:38:36Z","title":"A Survey of Optimization-based Task and Motion Planning: From Classical\n To Learning Approaches","summary":" Task and Motion Planning (TAMP) integrates high-level task planning and\nlow-level motion planning to equip robots with the autonomy to effectively\nreason over long-horizon, dynamic tasks. Optimization-based TAMP focuses on\nhybrid optimization approaches that define goal conditions via objective\nfunctions and are capable of handling open-ended goals, robotic dynamics, and\nphysical interaction between the robot and the environment. Therefore,\noptimization-based TAMP is particularly suited to solve highly complex,\ncontact-rich locomotion and manipulation problems. This survey provides a\ncomprehensive review on optimization-based TAMP, covering (i) planning domain\nrepresentations, including action description languages and temporal logic,\n(ii) individual solution strategies for components of TAMP, including AI\nplanning and trajectory optimization (TO), and (iii) the dynamic interplay\nbetween logic-based task planning and model-based TO. A particular focus of\nthis survey is to highlight the algorithm structures to efficiently solve TAMP,\nespecially hierarchical and distributed approaches. Additionally, the survey\nemphasizes the synergy between the classical methods and contemporary\nlearning-based innovations such as large language models. Furthermore, the\nfuture research directions for TAMP is discussed in this survey, highlighting\nboth algorithmic and application-specific challenges.\n","authors":["Zhigen Zhao","Shuo Chen","Yan Ding","Ziyi Zhou","Shiqi Zhang","Danfei Xu","Ye Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.02817v1.pdf","comment":"24 pages, 12 figures, submitted for review"},{"id":"http://arxiv.org/abs/2404.02795v1","updated":"2024-04-03T15:02:03Z","published":"2024-04-03T15:02:03Z","title":"Planning for Robust Open-loop Pushing: Exploiting Quasi-static Belief\n Dynamics and Contact-informed Optimization","summary":" Non-prehensile manipulation such as pushing is typically subject to\nuncertain, non-smooth dynamics. However, modeling the uncertainty of the\ndynamics typically results in intractable belief dynamics, making\ndata-efficient planning under uncertainty difficult. This article focuses on\nthe problem of efficiently generating robust open-loop pushing plans. First, we\ninvestigate how the belief over object configurations propagates through\nquasi-static contact dynamics. We exploit the simplified dynamics to predict\nthe variance of the object configuration without sampling from a perturbation\ndistribution. In a sampling-based trajectory optimization algorithm, the gain\nof the variance is constrained in order to enforce robustness of the plan.\nSecond, we propose an informed trajectory sampling mechanism for drawing robot\ntrajectories that are likely to make contact with the object. This sampling\nmechanism is shown to significantly improve chances of finding robust\nsolutions, especially when making-and-breaking contacts is required. 
We\ndemonstrate that the proposed approach is able to synthesize bi-manual pushing\ntrajectories, resulting in successful long-horizon pushing maneuvers without\nexteroceptive feedback such as vision or tactile feedback.\n","authors":["Julius Jankowski","Lara Brudermüller","Nick Hawes","Sylvain Calinon"],"pdf_url":"https://arxiv.org/pdf/2404.02795v1.pdf","comment":"submitted to the International Journal of Robotics Research (IJRR)"},{"id":"http://arxiv.org/abs/2404.02771v1","updated":"2024-04-03T14:37:00Z","published":"2024-04-03T14:37:00Z","title":"Forming Large Patterns with Local Robots in the OBLOT Model","summary":" In the arbitrary pattern formation problem, $n$ autonomous, mobile robots\nmust form an arbitrary pattern $P \\subseteq \\mathbb{R}^2$. The (deterministic)\nrobots are typically assumed to be indistinguishable, disoriented, and unable\nto communicate. An important distinction is whether robots have memory and/or a\nlimited viewing range. Previous work managed to form $P$ under a natural\nsymmetry condition if robots have no memory but an unlimited viewing range [22]\nor if robots have a limited viewing range but memory [25]. In the latter case,\n$P$ is only formed in a shrunk version that has constant diameter.\n Without memory and with limited viewing range, forming arbitrary patterns\nremains an open problem. We provide a partial solution by showing that $P$ can\nbe formed under the same symmetry condition if the robots' initial diameter is\n$\\leq 1$. Our protocol partitions $P$ into rotation-symmetric components and\nexploits the initial mutual visibility to form one cluster per component. Using\na careful placement of the clusters and their robots, we show that a cluster\ncan move in a coordinated way through its component while drawing $P$ by\ndropping one robot per pattern coordinate.\n","authors":["Christopher Hahn","Jonas Harbig","Peter Kling"],"pdf_url":"https://arxiv.org/pdf/2404.02771v1.pdf","comment":"24 pages, 3 figures, submitted for SAND 2024, version with extended\n appendix"},{"id":"http://arxiv.org/abs/2404.02728v1","updated":"2024-04-03T13:28:52Z","published":"2024-04-03T13:28:52Z","title":"Unsupervised Learning of Effective Actions in Robotics","summary":" Learning actions that are relevant to decision-making and can be executed\neffectively is a key problem in autonomous robotics. Current state-of-the-art\naction representations in robotics lack proper effect-driven learning of the\nrobot's actions. Although successful in solving manipulation tasks, deep\nlearning methods also lack this ability, in addition to their high cost in\nterms of memory or training data. In this paper, we propose an unsupervised\nalgorithm to discretize a continuous motion space and generate \"action\nprototypes\", each producing different effects in the environment. After an\nexploration phase, the algorithm automatically builds a representation of the\neffects and groups motions into action prototypes, where motions more likely to\nproduce an effect are represented more than those that lead to negligible\nchanges. 
We evaluate our method on a simulated stair-climbing reinforcement\nlearning task, and the preliminary results show that our effect driven\ndiscretization outperforms uniformly and randomly sampled discretizations in\nconvergence speed and maximum reward.\n","authors":["Marko Zaric","Jakob Hollenstein","Justus Piater","Erwan Renaudo"],"pdf_url":"https://arxiv.org/pdf/2404.02728v1.pdf","comment":"Accepted at The First Austrian Symposium on AI, Robotics, and Vision\n (AIROV24)"},{"id":"http://arxiv.org/abs/2312.09056v2","updated":"2024-04-03T13:09:27Z","published":"2023-12-14T15:53:07Z","title":"ReCoRe: Regularized Contrastive Representation Learning of World Model","summary":" While recent model-free Reinforcement Learning (RL) methods have demonstrated\nhuman-level effectiveness in gaming environments, their success in everyday\ntasks like visual navigation has been limited, particularly under significant\nappearance variations. This limitation arises from (i) poor sample efficiency\nand (ii) over-fitting to training scenarios. To address these challenges, we\npresent a world model that learns invariant features using (i) contrastive\nunsupervised learning and (ii) an intervention-invariant regularizer. Learning\nan explicit representation of the world dynamics i.e. a world model, improves\nsample efficiency while contrastive learning implicitly enforces learning of\ninvariant features, which improves generalization. However, the na\\\"ive\nintegration of contrastive loss to world models is not good enough, as\nworld-model-based RL methods independently optimize representation learning and\nagent policy. To overcome this issue, we propose an intervention-invariant\nregularizer in the form of an auxiliary task such as depth prediction, image\ndenoising, image segmentation, etc., that explicitly enforces invariance to\nstyle interventions. Our method outperforms current state-of-the-art\nmodel-based and model-free RL methods and significantly improves on\nout-of-distribution point navigation tasks evaluated on the iGibson benchmark.\nWith only visual observations, we further demonstrate that our approach\noutperforms recent language-guided foundation models for point navigation,\nwhich is essential for deployment on robots with limited computation\ncapabilities. Finally, we demonstrate that our proposed model excels at the\nsim-to-real transfer of its perception module on the Gibson benchmark.\n","authors":["Rudra P. K. Poudel","Harit Pandya","Stephan Liwicki","Roberto Cipolla"],"pdf_url":"https://arxiv.org/pdf/2312.09056v2.pdf","comment":"Accepted at CVPR 2024. arXiv admin note: text overlap with\n arXiv:2209.14932"},{"id":"http://arxiv.org/abs/2309.12685v2","updated":"2024-04-03T12:47:15Z","published":"2023-09-22T07:51:17Z","title":"eWand: A calibration framework for wide baseline frame-based and\n event-based camera systems","summary":" Accurate calibration is crucial for using multiple cameras to triangulate the\nposition of objects precisely. However, it is also a time-consuming process\nthat needs to be repeated for every displacement of the cameras. The standard\napproach is to use a printed pattern with known geometry to estimate the\nintrinsic and extrinsic parameters of the cameras. The same idea can be applied\nto event-based cameras, though it requires extra work. By using frame\nreconstruction from events, a printed pattern can be detected. A blinking\npattern can also be displayed on a screen. Then, the pattern can be directly\ndetected from the events. 
Such calibration methods can provide accurate\nintrinsic calibration for both frame- and event-based cameras. However, using\n2D patterns has several limitations for multi-camera extrinsic calibration,\nwith cameras possessing highly different points of view and a wide baseline.\nThe 2D pattern can only be detected from one direction and needs to be of\nsignificant size to compensate for its distance to the camera. This makes the\nextrinsic calibration time-consuming and cumbersome. To overcome these\nlimitations, we propose eWand, a new method that uses blinking LEDs inside\nopaque spheres instead of a printed or displayed pattern. Our method provides a\nfaster, easier-to-use extrinsic calibration approach that maintains high\naccuracy for both event- and frame-based cameras.\n","authors":["Thomas Gossard","Andreas Ziegler","Levin Kolmar","Jonas Tebbe","Andreas Zell"],"pdf_url":"https://arxiv.org/pdf/2309.12685v2.pdf","comment":"Accepted for 2024 IEEE International Conference on Robotics and\n Automation (ICRA 2024). Project web page:\n https://cogsys-tuebingen.github.io/ewand/"},{"id":"http://arxiv.org/abs/2404.02645v1","updated":"2024-04-03T11:15:56Z","published":"2024-04-03T11:15:56Z","title":"One Stack to Rule them All: To Drive Automated Vehicles, and Reach for\n the 4th level","summary":" Most automated driving functions are designed for a specific task or vehicle.\nMost often, the underlying architecture is fixed to specific algorithms to\nincrease performance. Therefore, it is not possible to deploy new modules and\nalgorithms easily. In this paper, we present our automated driving stack which\ncombines both scalability and adaptability. Due to the modular design, our\nstack allows for a fast integration and testing of novel and state-of-the-art\nresearch approaches. Furthermore, it is flexible to be used for our different\ntesting vehicles, including modified EasyMile EZ10 shuttles and different\npassenger cars. These vehicles differ in multiple ways, e.g. sensor setups,\ncontrol systems, maximum speed, or steering angle limitations. Finally, our\nstack is deployed in real world environments, including passenger transport in\nurban areas. Our stack includes all components needed for operating an\nautonomous vehicle, including localization, perception, planning, controller,\nand additional safety modules. Our stack is developed, tested, and evaluated in\nreal world traffic in multiple test sites, including the Test Area Autonomous\nDriving Baden-W\\\"urttemberg.\n","authors":["Sven Ochs","Jens Doll","Daniel Grimm","Tobias Fleck","Marc Heinrich","Stefan Orf","Albert Schotschneider","Helen Gremmelmaier","Rupert Polley","Svetlana Pavlitska","Maximilian Zipfl","Helen Schneider","Ferdinand Mütsch","Daniel Bogdoll","Florian Kuhnt","Philip Schörner","Marc René Zofka","J. Marius Zöllner"],"pdf_url":"https://arxiv.org/pdf/2404.02645v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02644v1","updated":"2024-04-03T11:14:37Z","published":"2024-04-03T11:14:37Z","title":"Leveraging Swarm Intelligence to Drive Autonomously: A Particle Swarm\n Optimization based Approach to Motion Planning","summary":" Motion planning is an essential part of autonomous mobile platforms. A good\npipeline should be modular enough to handle different vehicles, environments,\nand perception modules. The planning process has to cope with all the different\nmodalities and has to have a modular and flexible design. But most importantly,\nit has to be safe and robust. 
In this paper, we want to present our motion\nplanning pipeline with particle swarm optimization (PSO) at its core. This\nsolution is independent of the vehicle type and has a clear and\nsimple-to-implement interface for perception modules. Moreover, the approach\nstands out for being easily adaptable to new scenarios. Parallel calculation\nallows for fast planning cycles. Following the principles of PSO, the\ntrajectory planer first generates a swarm of initial trajectories that are\noptimized afterward. We present the underlying control space and inner\nworkings. Finally, the application to real-world automated driving is shown in\nthe evaluation with a deeper look at the modeling of the cost function. The\napproach is used in our automated shuttles that have already driven more than\n3.500 km safely and entirely autonomously in sub-urban everyday traffic.\n","authors":["Sven Ochs","Jens Doll","Marc Heinrich","Philip Schörner","Sebastian Klemm","Marc René Zofka","J. Marius Zöllner"],"pdf_url":"https://arxiv.org/pdf/2404.02644v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07739v2","updated":"2024-04-03T09:50:54Z","published":"2024-02-12T15:57:31Z","title":"Task-conditioned adaptation of visual features in multi-task policy\n learning","summary":" Successfully addressing a wide variety of tasks is a core ability of\nautonomous agents, requiring flexibly adapting the underlying decision-making\nstrategies and, as we argue in this work, also adapting the perception modules.\nAn analogical argument would be the human visual system, which uses top-down\nsignals to focus attention determined by the current task. Similarly, we adapt\npre-trained large vision models conditioned on specific downstream tasks in the\ncontext of multi-task policy learning. We introduce task-conditioned adapters\nthat do not require finetuning any pre-trained weights, combined with a single\npolicy trained with behavior cloning and capable of addressing multiple tasks.\nWe condition the visual adapters on task embeddings, which can be selected at\ninference if the task is known, or alternatively inferred from a set of example\ndemonstrations. To this end, we propose a new optimization-based estimator. We\nevaluate the method on a wide variety of tasks from the CortexBench benchmark\nand show that, compared to existing work, it can be addressed with a single\npolicy. In particular, we demonstrate that adapting visual features is a key\ndesign choice and that the method generalizes to unseen tasks given a few\ndemonstrations.\n","authors":["Pierre Marza","Laetitia Matignon","Olivier Simonin","Christian Wolf"],"pdf_url":"https://arxiv.org/pdf/2402.07739v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02599v1","updated":"2024-04-03T09:33:47Z","published":"2024-04-03T09:33:47Z","title":"Determining the Tactical Challenge of Scenarios to Efficiently Test\n Automated Driving Systems","summary":" The selection of relevant test scenarios for the scenario-based testing and\nsafety validation of automated driving systems (ADSs) remains challenging. An\nimportant aspect of the relevance of a scenario is the challenge it poses for\nan ADS. Existing methods for calculating the challenge of a scenario aim to\nexpress the challenge in terms of a metric value. Metric values are useful to\nselect the least or most challenging scenario. However, they fail to provide\nhuman-interpretable information on the cause of the challenge which is critical\ninformation for the efficient selection of relevant test scenarios. 
Therefore,\nthis paper presents the Challenge Description Method that mitigates this issue\nby analyzing scenarios and providing a description of their challenge in terms\nof the minimum required lane changes and their difficulty. Applying the method\nto different highway scenarios showed that it is capable of analyzing complex\nscenarios and providing easy-to-understand descriptions that can be used to\nselect relevant test scenarios.\n","authors":["Lennart Vater","Sven Tarlowski","Lutz Eckstein"],"pdf_url":"https://arxiv.org/pdf/2404.02599v1.pdf","comment":"6 pages, 3 figures, 2 tables; Accepted to be published as part of the\n 35th IEEE Intelligent Vehicles Symposium (IV), Jeju Shinhwa World, Jeju\n Island, Korea, June 2-5, 2024"},{"id":"http://arxiv.org/abs/2404.01900v2","updated":"2024-04-03T08:42:49Z","published":"2024-04-02T12:39:44Z","title":"Automatic Derivation of an Optimal Task Frame for Learning and\n Controlling Contact-Rich Tasks","summary":" This study investigates learning from demonstration (LfD) for contact-rich\ntasks. The procedure for choosing a task frame to express the learned signals\nfor the motion and interaction wrench is often omitted or using expert insight.\nThis article presents a procedure to derive the optimal task frame from motion\nand wrench data recorded during the demonstration. The procedure is based on\ntwo principles that are hypothesized to underpin the control configuration\ntargeted by an expert, and assumes task frame origins and orientations that are\nfixed to either the world or the robot tool. It is rooted in screw theory, is\nentirely probabilistic and does not involve any hyperparameters. The procedure\nwas validated by demonstrating several tasks, including surface following and\nmanipulation of articulated objects, showing good agreement between the\nobtained and the assumed expert task frames. To validate the performance of the\nlearned tasks by a UR10e robot, a constraint-based controller was designed\nbased on the derived task frames and the learned data expressed therein. These\nexperiments showed the effectiveness and versatility of the proposed approach.\nThe task frame derivation approach fills a gap in the state of the art of LfD,\nbringing LfD for contact-rich tasks closer to practical application.\n","authors":["Ali Mousavi Mohammadi","Maxim Vochten","Erwin Aertbeliën","Joris De Schutter"],"pdf_url":"https://arxiv.org/pdf/2404.01900v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02569v1","updated":"2024-04-03T08:42:36Z","published":"2024-04-03T08:42:36Z","title":"SliceIt! -- A Dual Simulator Framework for Learning Robot Food Slicing","summary":" Cooking robots can enhance the home experience by reducing the burden of\ndaily chores. However, these robots must perform their tasks dexterously and\nsafely in shared human environments, especially when handling dangerous tools\nsuch as kitchen knives. This study focuses on enabling a robot to autonomously\nand safely learn food-cutting tasks. More specifically, our goal is to enable a\ncollaborative robot or industrial robot arm to perform food-slicing tasks by\nadapting to varying material properties using compliance control. Our approach\ninvolves using Reinforcement Learning (RL) to train a robot to compliantly\nmanipulate a knife, by reducing the contact forces exerted by the food items\nand by the cutting board. However, training the robot in the real world can be\ninefficient, and dangerous, and result in a lot of food waste. 
Therefore, we\nproposed SliceIt!, a framework for safely and efficiently learning robot\nfood-slicing tasks in simulation. Following a real2sim2real approach, our\nframework consists of collecting a few real food slicing data, calibrating our\ndual simulation environment (a high-fidelity cutting simulator and a robotic\nsimulator), learning compliant control policies on the calibrated simulation\nenvironment, and finally, deploying the policies on the real robot.\n","authors":["Cristian C. Beltran-Hernandez","Nicolas Erbetti","Masashi Hamaya"],"pdf_url":"https://arxiv.org/pdf/2404.02569v1.pdf","comment":"Accepted to ICRA 2024"},{"id":"http://arxiv.org/abs/2404.02567v1","updated":"2024-04-03T08:40:33Z","published":"2024-04-03T08:40:33Z","title":"Fusing Multi-sensor Input with State Information on TinyML Brains for\n Autonomous Nano-drones","summary":" Autonomous nano-drones (~10 cm in diameter), thanks to their ultra-low power\nTinyML-based brains, are capable of coping with real-world environments.\nHowever, due to their simplified sensors and compute units, they are still far\nfrom the sense-and-act capabilities shown in their bigger counterparts. This\nsystem paper presents a novel deep learning-based pipeline that fuses\nmulti-sensorial input (i.e., low-resolution images and 8x8 depth map) with the\nrobot's state information to tackle a human pose estimation task. Thanks to our\ndesign, the proposed system -- trained in simulation and tested on a real-world\ndataset -- improves a state-unaware State-of-the-Art baseline by increasing the\nR^2 regression metric up to 0.10 on the distance's prediction.\n","authors":["Luca Crupi","Elia Cereda","Daniele Palossi"],"pdf_url":"https://arxiv.org/pdf/2404.02567v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02524v1","updated":"2024-04-03T07:26:15Z","published":"2024-04-03T07:26:15Z","title":"Versatile Scene-Consistent Traffic Scenario Generation as Optimization\n with Diffusion","summary":" Generating realistic and controllable agent behaviors in traffic simulation\nis crucial for the development of autonomous vehicles. This problem is often\nformulated as imitation learning (IL) from real-world driving data by either\ndirectly predicting future trajectories or inferring cost functions with\ninverse optimal control. In this paper, we draw a conceptual connection between\nIL and diffusion-based generative modeling and introduce a novel framework\nVersatile Behavior Diffusion (VBD) to simulate interactive scenarios with\nmultiple traffic participants. Our model not only generates scene-consistent\nmulti-agent interactions but also enables scenario editing through multi-step\nguidance and refinement. Experimental evaluations show that VBD achieves\nstate-of-the-art performance on the Waymo Sim Agents benchmark. 
In addition, we\nillustrate the versatility of our model by adapting it to various applications.\nVBD is capable of producing scenarios conditioning on priors, integrating with\nmodel-based optimization, sampling multi-modal scene-consistent scenarios by\nfusing marginal predictions, and generating safety-critical scenarios when\ncombined with a game-theoretic solver.\n","authors":["Zhiyu Huang","Zixu Zhang","Ameya Vaidya","Yuxiao Chen","Chen Lv","Jaime Fernández Fisac"],"pdf_url":"https://arxiv.org/pdf/2404.02524v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02516v1","updated":"2024-04-03T07:09:24Z","published":"2024-04-03T07:09:24Z","title":"On-the-Go Tree Detection and Geometric Traits Estimation with Ground\n Mobile Robots in Fruit Tree Groves","summary":" By-tree information gathering is an essential task in precision agriculture\nachieved by ground mobile sensors, but it can be time- and labor-intensive. In\nthis paper we present an algorithmic framework to perform real-time and\non-the-go detection of trees and key geometric characteristics (namely, width\nand height) with wheeled mobile robots in the field. Our method is based on the\nfusion of 2D domain-specific data (normalized difference vegetation index\n[NDVI] acquired via a red-green-near-infrared [RGN] camera) and 3D LiDAR point\nclouds, via a customized tree landmark association and parameter estimation\nalgorithm. The proposed system features a multi-modal and entropy-based\nlandmark correspondences approach, integrated into an underlying Kalman filter\nsystem to recognize the surrounding trees and jointly estimate their spatial\nand vegetation-based characteristics. Realistic simulated tests are used to\nevaluate our proposed algorithm's behavior in a variety of settings. Physical\nexperiments in agricultural fields help validate our method's efficacy in\nacquiring accurate by-tree information on-the-go and in real-time by employing\nonly onboard computational and sensing resources.\n","authors":["Dimitrios Chatziparaschis","Hanzhe Teng","Yipeng Wang","Pamodya Peiris","Elia Scudiero","Konstantinos Karydis"],"pdf_url":"https://arxiv.org/pdf/2404.02516v1.pdf","comment":"7 pages, 6 figures"},{"id":"http://arxiv.org/abs/2404.02515v1","updated":"2024-04-03T07:07:29Z","published":"2024-04-03T07:07:29Z","title":"Tightly-Coupled LiDAR-IMU-Wheel Odometry with Online Calibration of a\n Kinematic Model for Skid-Steering Robots","summary":" Tunnels and long corridors are challenging environments for mobile robots\nbecause a LiDAR point cloud should degenerate in these environments. To tackle\npoint cloud degeneration, this study presents a tightly-coupled LiDAR-IMU-wheel\nodometry algorithm with an online calibration for skid-steering robots. We\npropose a full linear wheel odometry factor, which not only serves as a motion\nconstraint but also performs the online calibration of kinematic models for\nskid-steering robots. Despite the dynamically changing kinematic model (e.g.,\nwheel radii changes caused by tire pressures) and terrain conditions, our\nmethod can address the model error via online calibration. Moreover, our method\nenables an accurate localization in cases of degenerated environments, such as\nlong and straight corridors, by calibration while the LiDAR-IMU fusion\nsufficiently operates. Furthermore, we estimate the uncertainty (i.e.,\ncovariance matrix) of the wheel odometry online for creating a reasonable\nconstraint. The proposed method is validated through three experiments. 
The\nfirst indoor experiment shows that the proposed method is robust in severe\ndegeneracy cases (long corridors) and changes in the wheel radii. The second\noutdoor experiment demonstrates that our method accurately estimates the sensor\ntrajectory despite being in rough outdoor terrain owing to online uncertainty\nestimation of wheel odometry. The third experiment shows the proposed online\ncalibration enables robust odometry estimation in changing terrains.\n","authors":["Taku Okawara","Kenji Koide","Shuji Oishi","Masashi Yokozuka","Atsuhiko Banno","Kentaro Uno","Kazuya Yoshida"],"pdf_url":"https://arxiv.org/pdf/2404.02515v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02472v1","updated":"2024-04-03T05:28:44Z","published":"2024-04-03T05:28:44Z","title":"Safe Returning FaSTrack with Robust Control Lyapunov-Value Functions","summary":" Real-time navigation in a priori unknown environment remains a challenging\ntask, especially when an unexpected (unmodeled) disturbance occurs. In this\npaper, we propose the framework Safe Returning Fast and Safe Tracking (SR-F)\nthat merges concepts from 1) Robust Control Lyapunov-Value Functions (R-CLVF),\nand 2) the Fast and Safe Tracking (FaSTrack) framework. The SR-F computes an\nR-CLVF offline between a model of the true system and a simplified planning\nmodel. Online, a planning algorithm is used to generate a trajectory in the\nsimplified planning space, and the R-CLVF is used to provide a tracking\ncontroller that exponentially stabilizes to the planning model. When an\nunexpected disturbance occurs, the proposed SR-F algorithm provides a means for\nthe true system to recover to the planning model. We take advantage of this\nmechanism to induce an artificial disturbance by ``jumping'' the planning model\nin open environments, forcing faster navigation. Therefore, this algorithm can\nboth reject unexpected true disturbances and accelerate navigation speed. We\nvalidate our framework using a 10D quadrotor system and show that SR-F is\nempirically 20\\% faster than the original FaSTrack while maintaining safety.\n","authors":["Zheng Gong","Boyang Li","Sylvia Herbert"],"pdf_url":"https://arxiv.org/pdf/2404.02472v1.pdf","comment":"6 pages, 4 figures, 1 table, 2 algorithms. Submitted to LCSS on 03/06"},{"id":"http://arxiv.org/abs/2301.02075v3","updated":"2024-04-03T04:10:50Z","published":"2023-01-05T14:18:55Z","title":"Beyond Inverted Pendulums: Task-optimal Simple Models of Legged\n Locomotion","summary":" Reduced-order models (ROM) are popular in online motion planning due to their\nsimplicity. A good ROM for control captures critical task-relevant aspects of\nthe full dynamics while remaining low dimensional. However, planning within the\nreduced-order space unavoidably constrains the full model, and hence we\nsacrifice the full potential of the robot. In the community of legged\nlocomotion, this has lead to a search for better model extensions, but many of\nthese extensions require human intuition, and there has not existed a\nprincipled way of evaluating the model performance and discovering new models.\nIn this work, we propose a model optimization algorithm that automatically\nsynthesizes reduced-order models, optimal with respect to a user-specified\ndistribution of tasks and corresponding cost functions. To demonstrate our\nwork, we optimized models for a bipedal robot Cassie. We show in simulation\nthat the optimal ROM reduces the cost of Cassie's joint torques by up to 23%\nand increases its walking speed by up to 54%. 
We also show hardware result that\nthe real robot walks on flat ground with 10% lower torque cost. All videos and\ncode can be found at https://sites.google.com/view/ymchen/research/optimal-rom.\n","authors":["Yu-Ming Chen","Jianshu Hu","Michael Posa"],"pdf_url":"https://arxiv.org/pdf/2301.02075v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02407v1","updated":"2024-04-03T02:17:34Z","published":"2024-04-03T02:17:34Z","title":"Decision Transformer as a Foundation Model for Partially Observable\n Continuous Control","summary":" Closed-loop control of nonlinear dynamical systems with partial-state\nobservability demands expert knowledge of a diverse, less standardized set of\ntheoretical tools. Moreover, it requires a delicate integration of controller\nand estimator designs to achieve the desired system behavior. To establish a\ngeneral controller synthesis framework, we explore the Decision Transformer\n(DT) architecture. Specifically, we first frame the control task as predicting\nthe current optimal action based on past observations, actions, and rewards,\neliminating the need for a separate estimator design. Then, we leverage the\npre-trained language models, i.e., the Generative Pre-trained Transformer (GPT)\nseries, to initialize DT and subsequently train it for control tasks using\nlow-rank adaptation (LoRA). Our comprehensive experiments across five distinct\ncontrol tasks, ranging from maneuvering aerospace systems to controlling\npartial differential equations (PDEs), demonstrate DT's capability to capture\nthe parameter-agnostic structures intrinsic to control tasks. DT exhibits\nremarkable zero-shot generalization abilities for completely new tasks and\nrapidly surpasses expert performance levels with a minimal amount of\ndemonstration data. These findings highlight the potential of DT as a\nfoundational controller for general control applications.\n","authors":["Xiangyuan Zhang","Weichao Mao","Haoran Qiu","Tamer Başar"],"pdf_url":"https://arxiv.org/pdf/2404.02407v1.pdf","comment":"Submitted to CDC 2024"},{"id":"http://arxiv.org/abs/2402.03678v3","updated":"2024-04-03T00:45:12Z","published":"2024-02-06T04:00:21Z","title":"Logical Specifications-guided Dynamic Task Sampling for Reinforcement\n Learning Agents","summary":" Reinforcement Learning (RL) has made significant strides in enabling\nartificial agents to learn diverse behaviors. However, learning an effective\npolicy often requires a large number of environment interactions. To mitigate\nsample complexity issues, recent approaches have used high-level task\nspecifications, such as Linear Temporal Logic (LTL$_f$) formulas or Reward\nMachines (RM), to guide the learning progress of the agent. In this work, we\npropose a novel approach, called Logical Specifications-guided Dynamic Task\nSampling (LSTS), that learns a set of RL policies to guide an agent from an\ninitial state to a goal state based on a high-level task specification, while\nminimizing the number of environmental interactions. Unlike previous work, LSTS\ndoes not assume information about the environment dynamics or the Reward\nMachine, and dynamically samples promising tasks that lead to successful goal\npolicies. We evaluate LSTS on a gridworld and show that it achieves improved\ntime-to-threshold performance on complex sequential decision-making problems\ncompared to state-of-the-art RM and Automaton-guided RL baselines, such as\nQ-Learning for Reward Machines and Compositional RL from logical Specifications\n(DIRL). 
Moreover, we demonstrate that our method outperforms RM and\nAutomaton-guided RL baselines in terms of sample-efficiency, both in a\npartially observable robotic task and in a continuous control robotic\nmanipulation task.\n","authors":["Yash Shukla","Tanushree Burman","Abhishek Kulkarni","Robert Wright","Alvaro Velasquez","Jivko Sinapov"],"pdf_url":"https://arxiv.org/pdf/2402.03678v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03103v1","updated":"2024-04-03T23:03:53Z","published":"2024-04-03T23:03:53Z","title":"Multi-Robot Planning for Filming Groups of Moving Actors Leveraging\n Submodularity and Pixel Density","summary":" Observing and filming a group of moving actors with a team of aerial robots\nis a challenging problem that combines elements of multi-robot coordination,\ncoverage, and view planning. A single camera may observe multiple actors at\nonce, and the robot team may observe individual actors from multiple views. As\nactors move about, groups may split, merge, and reform, and robots filming\nthese actors should be able to adapt smoothly to such changes in actor\nformations. Rather than adopt an approach based on explicit formations or\nassignments, we propose an approach based on optimizing views directly. We\nmodel actors as moving polyhedra and compute approximate pixel densities for\neach face and camera view. Then, we propose an objective that exhibits\ndiminishing returns as pixel densities increase from repeated observation. This\ngives rise to a multi-robot perception planning problem which we solve via a\ncombination of value iteration and greedy submodular maximization. %using a\ncombination of value iteration to optimize views for individual robots and\nsequential submodular maximization methods to coordinate the team. We evaluate\nour approach on challenging scenarios modeled after various kinds of social\nbehaviors and featuring different numbers of robots and actors and observe that\nrobot assignments and formations arise implicitly based on the movements of\ngroups of actors. Simulation results demonstrate that our approach consistently\noutperforms baselines, and in addition to performing well with the planner's\napproximation of pixel densities our approach also performs comparably for\nevaluation based on rendered views. Overall, the multi-round variant of the\nsequential planner we propose meets (within 1%) or exceeds the formation and\nassignment baselines in all scenarios we consider.\n","authors":["Skyler Hughes","Rebecca Martin","Micah Corah","Sebastian Scherer"],"pdf_url":"https://arxiv.org/pdf/2404.03103v1.pdf","comment":"10 pages, 5 figures, submitted to CDC 2024"},{"id":"http://arxiv.org/abs/2404.03094v1","updated":"2024-04-03T22:16:49Z","published":"2024-04-03T22:16:49Z","title":"Low Frequency Sampling in Model Predictive Path Integral Control","summary":" Sampling-based model-predictive controllers have become a powerful\noptimization tool for planning and control problems in various challenging\nenvironments. In this paper, we show how the default choice of uncorrelated\nGaussian distributions can be improved upon with the use of a colored noise\ndistribution. Our choice of distribution allows for the emphasis on low\nfrequency control signals, which can result in smoother and more exploratory\nsamples. 
We use this frequency-based sampling distribution with Model\nPredictive Path Integral (MPPI) in both hardware and simulation experiments to\nshow better or equal performance on systems with various speeds of input\nresponse.\n","authors":["Bogdan Vlahov","Jason Gibson","David D. Fan","Patrick Spieler","Ali-akbar Agha-mohammadi","Evangelos A. Theodorou"],"pdf_url":"https://arxiv.org/pdf/2404.03094v1.pdf","comment":"Accepted to RA-L"},{"id":"http://arxiv.org/abs/2404.03092v1","updated":"2024-04-03T22:13:04Z","published":"2024-04-03T22:13:04Z","title":"Unsupervised, Bottom-up Category Discovery for Symbol Grounding with a\n Curious Robot","summary":" Towards addressing the Symbol Grounding Problem and motivated by early\nchildhood language development, we leverage a robot which has been equipped\nwith an approximate model of curiosity with particular focus on bottom-up\nbuilding of unsupervised categories grounded in the physical world. That is,\nrather than starting with a top-down symbol (e.g., a word referring to an\nobject) and providing meaning through the application of predetermined samples,\nthe robot autonomously and gradually breaks up its exploration space into a\nseries of increasingly specific unlabeled categories at which point an external\nexpert may optionally provide a symbol association. We extend prior work by\nusing a robot that can observe the visual world, introducing a higher\ndimensional sensory space, and using a more generalizable method of category\nbuilding. Our experiments show that the robot learns categories based on\nactions and what it visually observes, and that those categories can be\nsymbolically grounded into.\n","authors":["Catherine Henry","Casey Kennington"],"pdf_url":"https://arxiv.org/pdf/2404.03092v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2404.03067v1","updated":"2024-04-03T21:16:19Z","published":"2024-04-03T21:16:19Z","title":"Self-supervised 6-DoF Robot Grasping by Demonstration via Augmented\n Reality Teleoperation System","summary":" Most existing 6-DoF robot grasping solutions depend on strong supervision on\ngrasp pose to ensure satisfactory performance, which could be laborious and\nimpractical when the robot works in some restricted area. To this end, we\npropose a self-supervised 6-DoF grasp pose detection framework via an Augmented\nReality (AR) teleoperation system that can efficiently learn human\ndemonstrations and provide 6-DoF grasp poses without grasp pose annotations.\nSpecifically, the system collects the human demonstration from the AR\nenvironment and contrastively learns the grasping strategy from the\ndemonstration. For the real-world experiment, the proposed system leads to\nsatisfactory grasping abilities and learning to grasp unknown objects within\nthree demonstrations.\n","authors":["Xiwen Dengxiong","Xueting Wang","Shi Bai","Yunbo Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.03067v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04077v4","updated":"2024-04-03T20:53:45Z","published":"2023-09-08T02:24:37Z","title":"SayNav: Grounding Large Language Models for Dynamic Planning to\n Navigation in New Environments","summary":" Semantic reasoning and dynamic planning capabilities are crucial for an\nautonomous agent to perform complex navigation tasks in unknown environments.\nIt requires a large amount of common-sense knowledge, that humans possess, to\nsucceed in these tasks. 
We present SayNav, a new approach that leverages human\nknowledge from Large Language Models (LLMs) for efficient generalization to\ncomplex navigation tasks in unknown large-scale environments. SayNav uses a\nnovel grounding mechanism, that incrementally builds a 3D scene graph of the\nexplored environment as inputs to LLMs, for generating feasible and\ncontextually appropriate high-level plans for navigation. The LLM-generated\nplan is then executed by a pre-trained low-level planner, that treats each\nplanned step as a short-distance point-goal navigation sub-task. SayNav\ndynamically generates step-by-step instructions during navigation and\ncontinuously refines future steps based on newly perceived information. We\nevaluate SayNav on multi-object navigation (MultiON) task, that requires the\nagent to utilize a massive amount of human knowledge to efficiently search\nmultiple different objects in an unknown environment. We also introduce a\nbenchmark dataset for MultiON task employing ProcTHOR framework that provides\nlarge photo-realistic indoor environments with variety of objects. SayNav\nachieves state-of-the-art results and even outperforms an oracle based baseline\nwith strong ground-truth assumptions by more than 8% in terms of success rate,\nhighlighting its ability to generate dynamic plans for successfully locating\nobjects in large-scale new environments. The code, benchmark dataset and\ndemonstration videos are accessible at\nhttps://www.sri.com/ics/computer-vision/saynav.\n","authors":["Abhinav Rajvanshi","Karan Sikka","Xiao Lin","Bhoram Lee","Han-Pang Chiu","Alvaro Velasquez"],"pdf_url":"https://arxiv.org/pdf/2309.04077v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03017v1","updated":"2024-04-03T18:57:54Z","published":"2024-04-03T18:57:54Z","title":"Distributionally Robust Policy and Lyapunov-Certificate Learning","summary":" This article presents novel methods for synthesizing distributionally robust\nstabilizing neural controllers and certificates for control systems under model\nuncertainty. A key challenge in designing controllers with stability guarantees\nfor uncertain systems is the accurate determination of and adaptation to shifts\nin model parametric uncertainty during online deployment. We tackle this with a\nnovel distributionally robust formulation of the Lyapunov derivative chance\nconstraint ensuring a monotonic decrease of the Lyapunov certificate. To avoid\nthe computational complexity involved in dealing with the space of probability\nmeasures, we identify a sufficient condition in the form of deterministic\nconvex constraints that ensures the Lyapunov derivative constraint is\nsatisfied. 
We integrate this condition into a loss function for training a\nneural network-based controller and show that, for the resulting closed-loop\nsystem, the global asymptotic stability of its equilibrium can be certified\nwith high confidence, even with Out-of-Distribution (OoD) model uncertainties.\nTo demonstrate the efficacy and efficiency of the proposed methodology, we\ncompare it with an uncertainty-agnostic baseline approach and several\nreinforcement learning approaches in two control problems in simulation.\n","authors":["Kehan Long","Jorge Cortes","Nikolay Atanasov"],"pdf_url":"https://arxiv.org/pdf/2404.03017v1.pdf","comment":"Submitted to IEEE Open Journal of Control Systems"},{"id":"http://arxiv.org/abs/1911.12905v4","updated":"2024-04-03T14:35:49Z","published":"2019-11-29T00:08:58Z","title":"Simulation-based reinforcement learning for real-world autonomous\n driving","summary":" We use reinforcement learning in simulation to obtain a driving system\ncontrolling a full-size real-world vehicle. The driving policy takes RGB images\nfrom a single camera and their semantic segmentation as input. We use mostly\nsynthetic data, with labelled real-world data appearing only in the training of\nthe segmentation network.\n Using reinforcement learning in simulation and synthetic data is motivated by\nlowering costs and engineering effort.\n In real-world experiments we confirm that we achieved successful sim-to-real\npolicy transfer. Based on the extensive evaluation, we analyze how design\ndecisions about perception, control, and training impact the real-world\nperformance.\n","authors":["Błażej Osiński","Adam Jakubowski","Piotr Miłoś","Paweł Zięcina","Christopher Galias","Silviu Homoceanu","Henryk Michalewski"],"pdf_url":"https://arxiv.org/pdf/1911.12905v4.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.02905v1","updated":"2024-04-03T17:59:53Z","published":"2024-04-03T17:59:53Z","title":"Visual Autoregressive Modeling: Scalable Image Generation via Next-Scale\n Prediction","summary":" We present Visual AutoRegressive modeling (VAR), a new generation paradigm\nthat redefines the autoregressive learning on images as coarse-to-fine\n\"next-scale prediction\" or \"next-resolution prediction\", diverging from the\nstandard raster-scan \"next-token prediction\". This simple, intuitive\nmethodology allows autoregressive (AR) transformers to learn visual\ndistributions fast and generalize well: VAR, for the first time, makes AR\nmodels surpass diffusion transformers in image generation. On ImageNet 256x256\nbenchmark, VAR significantly improve AR baseline by improving Frechet inception\ndistance (FID) from 18.65 to 1.80, inception score (IS) from 80.4 to 356.4,\nwith around 20x faster inference speed. It is also empirically verified that\nVAR outperforms the Diffusion Transformer (DiT) in multiple dimensions\nincluding image quality, inference speed, data efficiency, and scalability.\nScaling up VAR models exhibits clear power-law scaling laws similar to those\nobserved in LLMs, with linear correlation coefficients near -0.998 as solid\nevidence. VAR further showcases zero-shot generalization ability in downstream\ntasks including image in-painting, out-painting, and editing. These results\nsuggest VAR has initially emulated the two important properties of LLMs:\nScaling Laws and zero-shot task generalization. 
We have released all models and\ncodes to promote the exploration of AR/VAR models for visual generation and\nunified learning.\n","authors":["Keyu Tian","Yi Jiang","Zehuan Yuan","Bingyue Peng","Liwei Wang"],"pdf_url":"https://arxiv.org/pdf/2404.02905v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02904v1","updated":"2024-04-03T17:59:36Z","published":"2024-04-03T17:59:36Z","title":"ALOHa: A New Measure for Hallucination in Captioning Models","summary":" Despite recent advances in multimodal pre-training for visual description,\nstate-of-the-art models still produce captions containing errors, such as\nhallucinating objects not present in a scene. The existing prominent metric for\nobject hallucination, CHAIR, is limited to a fixed set of MS COCO objects and\nsynonyms. In this work, we propose a modernized open-vocabulary metric, ALOHa,\nwhich leverages large language models (LLMs) to measure object hallucinations.\nSpecifically, we use an LLM to extract groundable objects from a candidate\ncaption, measure their semantic similarity to reference objects from captions\nand object detections, and use Hungarian matching to produce a final\nhallucination score. We show that ALOHa correctly identifies 13.6% more\nhallucinated objects than CHAIR on HAT, a new gold-standard subset of MS COCO\nCaptions annotated for hallucinations, and 30.8% more on nocaps, where objects\nextend beyond MS COCO categories. Our code is available at\nhttps://davidmchan.github.io/aloha/.\n","authors":["Suzanne Petryk","David M. Chan","Anish Kachinthaya","Haodi Zou","John Canny","Joseph E. Gonzalez","Trevor Darrell"],"pdf_url":"https://arxiv.org/pdf/2404.02904v1.pdf","comment":"To appear at NAACL 2024"},{"id":"http://arxiv.org/abs/2404.02903v1","updated":"2024-04-03T17:59:28Z","published":"2024-04-03T17:59:28Z","title":"LidarDM: Generative LiDAR Simulation in a Generated World","summary":" We present LidarDM, a novel LiDAR generative model capable of producing\nrealistic, layout-aware, physically plausible, and temporally coherent LiDAR\nvideos. LidarDM stands out with two unprecedented capabilities in LiDAR\ngenerative modeling: (i) LiDAR generation guided by driving scenarios, offering\nsignificant potential for autonomous driving simulations, and (ii) 4D LiDAR\npoint cloud generation, enabling the creation of realistic and temporally\ncoherent sequences. At the heart of our model is a novel integrated 4D world\ngeneration framework. Specifically, we employ latent diffusion models to\ngenerate the 3D scene, combine it with dynamic actors to form the underlying 4D\nworld, and subsequently produce realistic sensory observations within this\nvirtual environment. Our experiments indicate that our approach outperforms\ncompeting algorithms in realism, temporal coherency, and layout consistency. We\nadditionally show that LidarDM can be used as a generative world model\nsimulator for training and testing perception models.\n","authors":["Vlas Zyrianov","Henry Che","Zhijian Liu","Shenlong Wang"],"pdf_url":"https://arxiv.org/pdf/2404.02903v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02900v1","updated":"2024-04-03T17:58:21Z","published":"2024-04-03T17:58:21Z","title":"DeiT-LT Distillation Strikes Back for Vision Transformer Training on\n Long-Tailed Datasets","summary":" Vision Transformer (ViT) has emerged as a prominent architecture for various\ncomputer vision tasks. In ViT, we divide the input image into patch tokens and\nprocess them through a stack of self attention blocks. 
However, unlike\nConvolutional Neural Networks (CNN), ViT's simple architecture has no\ninformative inductive bias (e.g., locality, etc.). Due to this, ViT requires a\nlarge amount of data for pre-training. Various data-efficient approaches (DeiT)\nhave been proposed to train ViT on balanced datasets effectively. However,\nlimited literature discusses the use of ViT for datasets with long-tailed\nimbalances. In this work, we introduce DeiT-LT to tackle the problem of\ntraining ViTs from scratch on long-tailed datasets. In DeiT-LT, we introduce an\nefficient and effective way of distillation from CNN via distillation DIST\ntoken by using out-of-distribution images and re-weighting the distillation\nloss to enhance focus on tail classes. This leads to the learning of local\nCNN-like features in early ViT blocks, improving generalization for tail\nclasses. Further, to mitigate overfitting, we propose distilling from a flat\nCNN teacher, which leads to learning low-rank generalizable features for DIST\ntokens across all ViT blocks. With the proposed DeiT-LT scheme, the\ndistillation DIST token becomes an expert on the tail classes, and the\nclassifier CLS token becomes an expert on the head classes. The experts help to\neffectively learn features corresponding to both the majority and minority\nclasses using a distinct set of tokens within the same ViT architecture. We\nshow the effectiveness of DeiT-LT for training ViT from scratch on datasets\nranging from small-scale CIFAR-10 LT to large-scale iNaturalist-2018.\n","authors":["Harsh Rangwani","Pradipto Mondal","Mayank Mishra","Ashish Ramayee Asokan","R. Venkatesh Babu"],"pdf_url":"https://arxiv.org/pdf/2404.02900v1.pdf","comment":"CVPR 2024. Project Page: https://rangwani-harsh.github.io/DeiT-LT"},{"id":"http://arxiv.org/abs/2312.00947v2","updated":"2024-04-03T17:58:13Z","published":"2023-12-01T22:00:14Z","title":"FreeZe: Training-free zero-shot 6D pose estimation with geometric and\n vision foundation models","summary":" Estimating the 6D pose of objects unseen during training is highly desirable\nyet challenging. Zero-shot object 6D pose estimation methods address this\nchallenge by leveraging additional task-specific supervision provided by\nlarge-scale, photo-realistic synthetic datasets. However, their performance\nheavily depends on the quality and diversity of rendered data and they require\nextensive training. In this work, we show how to tackle the same task but\nwithout training on specific data. We propose FreeZe, a novel solution that\nharnesses the capabilities of pre-trained geometric and vision foundation\nmodels. FreeZe leverages 3D geometric descriptors learned from unrelated 3D\npoint clouds and 2D visual features learned from web-scale 2D images to\ngenerate discriminative 3D point-level descriptors. We then estimate the 6D\npose of unseen objects by 3D registration based on RANSAC. We also introduce a\nnovel algorithm to solve ambiguous cases due to geometrically symmetric objects\nthat is based on visual features. We comprehensively evaluate FreeZe across the\nseven core datasets of the BOP Benchmark, which include over a hundred 3D\nobjects and 20,000 images captured in various scenarios. FreeZe consistently\noutperforms all state-of-the-art approaches, including competitors extensively\ntrained on synthetic 6D pose estimation data.
Code will be publicly available\nat https://andreacaraffa.github.io/freeze.\n","authors":["Andrea Caraffa","Davide Boscaini","Amir Hamza","Fabio Poiesi"],"pdf_url":"https://arxiv.org/pdf/2312.00947v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02899v1","updated":"2024-04-03T17:57:15Z","published":"2024-04-03T17:57:15Z","title":"MatAtlas: Text-driven Consistent Geometry Texturing and Material\n Assignment","summary":" We present MatAtlas, a method for consistent text-guided 3D model texturing.\nFollowing recent progress, we leverage a large scale text-to-image generation\nmodel (e.g., Stable Diffusion) as a prior to texture a 3D model. We carefully\ndesign an RGB texturing pipeline that leverages a grid pattern diffusion,\ndriven by depth and edges. By proposing a multi-step texture refinement\nprocess, we significantly improve the quality and 3D consistency of the\ntexturing output. To further address the problem of baked-in lighting, we move\nbeyond RGB colors and pursue assigning parametric materials to the assets.\nGiven the high-quality initial RGB texture, we propose a novel material\nretrieval method capitalizing on Large Language Models (LLM), enabling\neditability and relightability. We evaluate our method on a wide variety of\ngeometries and show that our method significantly outperforms prior art. We\nalso analyze the role of each component through a detailed ablation study.\n","authors":["Duygu Ceylan","Valentin Deschaintre","Thibault Groueix","Rosalie Martin","Chun-Hao Huang","Romain Rouffet","Vladimir Kim","Gaëtan Lassagne"],"pdf_url":"https://arxiv.org/pdf/2404.02899v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02897v1","updated":"2024-04-03T17:54:37Z","published":"2024-04-03T17:54:37Z","title":"Deep Image Composition Meets Image Forgery","summary":" Image forgery is a topic that has been studied for many years. Before the\nbreakthrough of deep learning, forged images were detected using handcrafted\nfeatures that did not require training. These traditional methods failed to\nperform satisfactorily even on datasets much worse in quality than real-life\nimage manipulations. Advances in deep learning have impacted image forgery\ndetection as much as they have impacted other areas of computer vision and have\nimproved the state of the art. Deep learning models require large amounts of\nlabeled data for training. In the case of image forgery, labeled data at the\npixel level is a very important factor for the models to learn. None of the\nexisting datasets have sufficient size, realism and pixel-level labeling at the\nsame time. This is due to the high cost of producing and labeling quality\nimages. It can take hours for an image editing expert to manipulate just one\nimage. To bridge this gap, we automate data generation using image composition\ntechniques that are closely related to image forgery. Unlike other automated data\ngeneration frameworks, we use state of the art image composition deep learning\nmodels to generate spliced images close to the quality of real-life\nmanipulations. Finally, we test the generated dataset on the SOTA image\nmanipulation detection model and show that its prediction performance is lower\ncompared to existing datasets, i.e. we produce realistic images that are more\ndifficult to detect.
Dataset will be available at\nhttps://github.com/99eren99/DIS25k .\n","authors":["Eren Tahir","Mert Bal"],"pdf_url":"https://arxiv.org/pdf/2404.02897v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02889v1","updated":"2024-04-03T17:44:02Z","published":"2024-04-03T17:44:02Z","title":"Steganographic Passport: An Owner and User Verifiable Credential for\n Deep Model IP Protection Without Retraining","summary":" Ensuring the legal usage of deep models is crucial to promoting trustable,\naccountable, and responsible artificial intelligence innovation. Current\npassport-based methods that obfuscate model functionality for license-to-use\nand ownership verifications suffer from capacity and quality constraints, as\nthey require retraining the owner model for new users. They are also vulnerable\nto advanced Expanded Residual Block ambiguity attacks. We propose\nSteganographic Passport, which uses an invertible steganographic network to\ndecouple license-to-use from ownership verification by hiding the user's\nidentity images into the owner-side passport and recovering them from their\nrespective user-side passports. An irreversible and collision-resistant hash\nfunction is used to avoid exposing the owner-side passport from the derived\nuser-side passports and increase the uniqueness of the model signature. To\nsafeguard both the passport and model's weights against advanced ambiguity\nattacks, an activation-level obfuscation is proposed for the verification\nbranch of the owner's model. By jointly training the verification and\ndeployment branches, their weights become tightly coupled. The proposed method\nsupports agile licensing of deep models by providing a strong ownership proof\nand license accountability without requiring a separate model retraining for\nthe admission of every new user. Experiment results show that our\nSteganographic Passport outperforms other passport-based deep model protection\nmethods in robustness against various known attacks.\n","authors":["Qi Cui","Ruohan Meng","Chaohui Xu","Chip-Hong Chang"],"pdf_url":"https://arxiv.org/pdf/2404.02889v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14740v2","updated":"2024-04-03T17:42:44Z","published":"2023-08-28T17:41:14Z","title":"Total Selfie: Generating Full-Body Selfies","summary":" We present a method to generate full-body selfies from photographs originally\ntaken at arms length. Because self-captured photos are typically taken close\nup, they have limited field of view and exaggerated perspective that distorts\nfacial shapes. We instead seek to generate the photo some one else would take\nof you from a few feet away. Our approach takes as input four selfies of your\nface and body, a background image, and generates a full-body selfie in a\ndesired target pose. We introduce a novel diffusion-based approach to combine\nall of this information into high-quality, well-composed photos of you with the\ndesired pose and background.\n","authors":["Bowei Chen","Brian Curless","Ira Kemelmacher-Shlizerman","Steven M. 
Seitz"],"pdf_url":"https://arxiv.org/pdf/2308.14740v2.pdf","comment":"Project page:\n https://homes.cs.washington.edu/~boweiche/project_page/totalselfie/"},{"id":"http://arxiv.org/abs/2403.00939v3","updated":"2024-04-03T17:42:11Z","published":"2024-03-01T19:36:11Z","title":"G3DR: Generative 3D Reconstruction in ImageNet","summary":" We introduce a novel 3D generative method, Generative 3D Reconstruction\n(G3DR) in ImageNet, capable of generating diverse and high-quality 3D objects\nfrom single images, addressing the limitations of existing methods. At the\nheart of our framework is a novel depth regularization technique that enables\nthe generation of scenes with high-geometric fidelity. G3DR also leverages a\npretrained language-vision model, such as CLIP, to enable reconstruction in\nnovel views and improve the visual realism of generations. Additionally, G3DR\ndesigns a simple but effective sampling procedure to further improve the\nquality of generations. G3DR offers diverse and efficient 3D asset generation\nbased on class or text conditioning. Despite its simplicity, G3DR is able to\nbeat state-of-theart methods, improving over them by up to 22% in perceptual\nmetrics and 90% in geometry scores, while needing only half of the training\ntime. Code is available at https://github.com/preddy5/G3DR\n","authors":["Pradyumna Reddy","Ismail Elezi","Jiankang Deng"],"pdf_url":"https://arxiv.org/pdf/2403.00939v3.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.02885v1","updated":"2024-04-03T17:38:15Z","published":"2024-04-03T17:38:15Z","title":"PoCo: Point Context Cluster for RGBD Indoor Place Recognition","summary":" We present a novel end-to-end algorithm (PoCo) for the indoor RGB-D place\nrecognition task, aimed at identifying the most likely match for a given query\nframe within a reference database. The task presents inherent challenges\nattributed to the constrained field of view and limited range of perception\nsensors. We propose a new network architecture, which generalizes the recent\nContext of Clusters (CoCs) to extract global descriptors directly from the\nnoisy point clouds through end-to-end learning. Moreover, we develop the\narchitecture by integrating both color and geometric modalities into the point\nfeatures to enhance the global descriptor representation. We conducted\nevaluations on public datasets ScanNet-PR and ARKit with 807 and 5047\nscenarios, respectively. PoCo achieves SOTA performance: on ScanNet-PR, we\nachieve R@1 of 64.63%, a 5.7% improvement from the best-published result CGis\n(61.12%); on Arkit, we achieve R@1 of 45.12%, a 13.3% improvement from the\nbest-published result CGis (39.82%). In addition, PoCo shows higher efficiency\nthan CGis in inference time (1.75X-faster), and we demonstrate the\neffectiveness of PoCo in recognizing places within a real-world laboratory\nenvironment.\n","authors":["Jing Liang","Zhuo Deng","Zheming Zhou","Omid Ghasemalizadeh","Dinesh Manocha","Min Sun","Cheng-Hao Kuo","Arnie Sen"],"pdf_url":"https://arxiv.org/pdf/2404.02885v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02883v1","updated":"2024-04-03T17:34:28Z","published":"2024-04-03T17:34:28Z","title":"On the Scalability of Diffusion-based Text-to-Image Generation","summary":" Scaling up model and data size has been quite successful for the evolution of\nLLMs. However, the scaling law for the diffusion based text-to-image (T2I)\nmodels is not fully explored. 
It is also unclear how to efficiently scale the\nmodel for better performance at reduced cost. The different training settings\nand expensive training cost make a fair model comparison extremely difficult.\nIn this work, we empirically study the scaling properties of diffusion based\nT2I models by performing extensive and rigorous ablations on scaling both\ndenoising backbones and training set, including training scaled UNet and\nTransformer variants ranging from 0.4B to 4B parameters on datasets up to 600M\nimages. For model scaling, we find the location and amount of cross attention\ndistinguishes the performance of existing UNet designs. And increasing the\ntransformer blocks is more parameter-efficient for improving text-image\nalignment than increasing channel numbers. We then identify an efficient UNet\nvariant, which is 45% smaller and 28% faster than SDXL's UNet. On the data\nscaling side, we show the quality and diversity of the training set matters\nmore than simply dataset size. Increasing caption density and diversity\nimproves text-image alignment performance and the learning efficiency. Finally,\nwe provide scaling functions to predict the text-image alignment performance as\nfunctions of the scale of model size, compute and dataset size.\n","authors":["Hao Li","Yang Zou","Ying Wang","Orchid Majumder","Yusheng Xie","R. Manmatha","Ashwin Swaminathan","Zhuowen Tu","Stefano Ermon","Stefano Soatto"],"pdf_url":"https://arxiv.org/pdf/2404.02883v1.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2404.02877v1","updated":"2024-04-03T17:24:27Z","published":"2024-04-03T17:24:27Z","title":"FlightScope: A Deep Comprehensive Assessment of Aircraft Detection\n Algorithms in Satellite Imagery","summary":" Object detection in remotely sensed satellite pictures is fundamental in many\nfields such as biophysical and environmental monitoring. While deep learning\nalgorithms are constantly evolving, they have been mostly implemented and\ntested on popular ground-based photos. This paper critically evaluates\nand compares a suite of advanced object detection algorithms customized for the\ntask of identifying aircraft within satellite imagery. Using the large\nHRPlanesV2 dataset, together with a rigorous validation with the GDIT dataset,\nthis research encompasses an array of methodologies including YOLO versions 5\nand 8, Faster RCNN, CenterNet, RetinaNet, RTMDet, and DETR, all trained from\nscratch. This exhaustive training and validation study reveals YOLOv5 as the\npreeminent model for the specific case of identifying airplanes from remote\nsensing data, showcasing high precision and adaptability across diverse imaging\nconditions. This research highlights the nuanced performance landscapes of these\nalgorithms, with YOLOv5 emerging as a robust solution for aerial object\ndetection, underlining its importance through superior mean average precision,\nRecall, and Intersection over Union scores. The findings described here\nunderscore the fundamental role of algorithm selection aligned with the\nspecific demands of satellite imagery analysis and extend a comprehensive\nframework to evaluate model efficacy.
The benchmark toolkit and codes,\navailable via https://github.com/toelt-llc/FlightScope_Bench, aims to further\nexploration and innovation in the realm of remote sensing object detection,\npaving the way for improved analytical methodologies in satellite imagery\napplications.\n","authors":["Safouane El Ghazouali","Arnaud Gucciardi","Nicola Venturi","Michael Rueegsegger","Umberto Michelucci"],"pdf_url":"https://arxiv.org/pdf/2404.02877v1.pdf","comment":"15 figures, 4 tables, comprehensive survey, comparative study"},{"id":"http://arxiv.org/abs/2403.18346v3","updated":"2024-04-03T17:18:51Z","published":"2024-03-27T08:38:49Z","title":"Quantifying and Mitigating Unimodal Biases in Multimodal Large Language\n Models: A Causal Perspective","summary":" Recent advancements in Large Language Models (LLMs) have facilitated the\ndevelopment of Multimodal LLMs (MLLMs). Despite their impressive capabilities,\nMLLMs often suffer from an over-reliance on unimodal biases (e.g., language\nbias and vision bias), leading to incorrect answers in complex multimodal\ntasks. To investigate this issue, we propose a causal framework to interpret\nthe biases in Visual Question Answering (VQA) problems. Within our framework,\nwe devise a causal graph to elucidate the predictions of MLLMs on VQA problems,\nand assess the causal effect of biases through an in-depth causal analysis.\nMotivated by the causal graph, we introduce a novel MORE dataset, consisting of\n12,000 VQA instances. This dataset is designed to challenge MLLMs' abilities,\nnecessitating multi-hop reasoning and the surmounting of unimodal biases.\nFurthermore, we propose two strategies to mitigate unimodal biases and enhance\nMLLMs' reasoning capabilities, including a Decompose-Verify-Answer (DeVA)\nframework for limited-access MLLMs and the refinement of open-source MLLMs\nthrough fine-tuning. Extensive quantitative and qualitative experiments offer\nvaluable insights for future research. Our project page is at\nhttps://opencausalab.github.io/MORE.\n","authors":["Meiqi Chen","Yixin Cao","Yan Zhang","Chaochao Lu"],"pdf_url":"https://arxiv.org/pdf/2403.18346v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11782v2","updated":"2024-04-03T16:57:35Z","published":"2023-12-19T01:33:46Z","title":"Learning Object State Changes in Videos: An Open-World Perspective","summary":" Object State Changes (OSCs) are pivotal for video understanding. While humans\ncan effortlessly generalize OSC understanding from familiar to unknown objects,\ncurrent approaches are confined to a closed vocabulary. Addressing this gap, we\nintroduce a novel open-world formulation for the video OSC problem. The goal is\nto temporally localize the three stages of an OSC -- the object's initial\nstate, its transitioning state, and its end state -- whether or not the object\nhas been observed during training. Towards this end, we develop VidOSC, a\nholistic learning approach that: (1) leverages text and vision-language models\nfor supervisory signals to obviate manually labeling OSC training data, and (2)\nabstracts fine-grained shared state representations from objects to enhance\ngeneralization. Furthermore, we present HowToChange, the first open-world\nbenchmark for video OSC localization, which offers an order of magnitude\nincrease in the label space and annotation volume compared to the best existing\nbenchmark. 
Experimental results demonstrate the efficacy of our approach, in\nboth traditional closed-world and open-world scenarios.\n","authors":["Zihui Xue","Kumar Ashutosh","Kristen Grauman"],"pdf_url":"https://arxiv.org/pdf/2312.11782v2.pdf","comment":"Accepted by CVPR 2024, Project website:\n https://vision.cs.utexas.edu/projects/VidOSC/"},{"id":"http://arxiv.org/abs/2404.01717v2","updated":"2024-04-03T16:46:27Z","published":"2024-04-02T08:07:38Z","title":"AddSR: Accelerating Diffusion-based Blind Super-Resolution with\n Adversarial Diffusion Distillation","summary":" Blind super-resolution methods based on stable diffusion showcase formidable\ngenerative capabilities in reconstructing clear high-resolution images with\nintricate details from low-resolution inputs. However, their practical\napplicability is often hampered by poor efficiency, stemming from the\nrequirement of thousands or hundreds of sampling steps. Inspired by the\nefficient text-to-image approach adversarial diffusion distillation (ADD), we\ndesign AddSR to address this issue by incorporating the ideas of both\ndistillation and ControlNet. Specifically, we first propose a prediction-based\nself-refinement strategy to provide high-frequency information in the student\nmodel output with marginal additional time cost. Furthermore, we refine the\ntraining process by employing HR images, rather than LR images, to regulate the\nteacher model, providing a more robust constraint for distillation. Second, we\nintroduce a timestep-adapting loss to address the perception-distortion\nimbalance problem introduced by ADD. Extensive experiments demonstrate our\nAddSR generates better restoration results, while achieving faster speed than\nprevious SD-based state-of-the-art models (e.g., 7x faster than SeeSR).\n","authors":["Rui Xie","Ying Tai","Kai Zhang","Zhenyu Zhang","Jun Zhou","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2404.01717v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02845v1","updated":"2024-04-03T16:23:37Z","published":"2024-04-03T16:23:37Z","title":"Cross-Modal Conditioned Reconstruction for Language-guided Medical Image\n Segmentation","summary":" Recent developments underscore the potential of textual information in\nenhancing learning models for a deeper understanding of medical visual\nsemantics. However, language-guided medical image segmentation still faces a\nchallenging issue. Previous works employ implicit and ambiguous architectures\nto embed textual information. This leads to segmentation results that are\ninconsistent with the semantics represented by the language, sometimes even\ndiverging significantly. To this end, we propose a novel cross-modal\nconditioned Reconstruction for Language-guided Medical Image Segmentation\n(RecLMIS) to explicitly capture cross-modal interactions, which assumes that\nwell-aligned medical visual features and medical notes can effectively\nreconstruct each other. We introduce conditioned interaction to adaptively\npredict patches and words of interest. Subsequently, they are utilized as\nconditioning factors for mutual reconstruction to align with regions described\nin the medical notes. Extensive experiments demonstrate the superiority of our\nRecLMIS, surpassing LViT by 3.74% mIoU on the publicly available MosMedData+\ndataset and achieving an average increase of 1.89% mIoU for cross-domain tests\non our QATA-CoV19 dataset. Simultaneously, we achieve a relative reduction of\n20.2% in parameter count and a 55.5% decrease in computational load. 
The code\nwill be available at https://github.com/ShashankHuang/RecLMIS.\n","authors":["Xiaoshuang Huang","Hongxiang Li","Meng Cao","Long Chen","Chenyu You","Dong An"],"pdf_url":"https://arxiv.org/pdf/2404.02845v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02830v1","updated":"2024-04-03T16:04:59Z","published":"2024-04-03T16:04:59Z","title":"Enhancing Interpretability of Vertebrae Fracture Grading using\n Human-interpretable Prototypes","summary":" Vertebral fracture grading classifies the severity of vertebral fractures,\nwhich is a challenging task in medical imaging and has recently attracted Deep\nLearning (DL) models. Only a few works attempted to make such models\nhuman-interpretable despite the need for transparency and trustworthiness in\ncritical use cases like DL-assisted medical diagnosis. Moreover, such models\neither rely on post-hoc methods or additional annotations. In this work, we\npropose a novel interpretable-by-design method, ProtoVerse, to find relevant\nsub-parts of vertebral fractures (prototypes) that reliably explain the model's\ndecision in a human-understandable way. Specifically, we introduce a novel\ndiversity-promoting loss to mitigate prototype repetitions in small datasets\nwith intricate semantics. We have experimented with the VerSe'19 dataset and\noutperformed the existing prototype-based method. Further, our model provides\nsuperior interpretability against the post-hoc method. Importantly, expert\nradiologists validated the visual interpretability of our results, showing\nclinical applicability.\n","authors":["Poulami Sinhamahapatra","Suprosanna Shit","Anjany Sekuboyina","Malek Husseini","David Schinz","Nicolas Lenhart","Joern Menze","Jan Kirschke","Karsten Roscher","Stephan Guennemann"],"pdf_url":"https://arxiv.org/pdf/2404.02830v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10835v3","updated":"2024-04-03T16:00:18Z","published":"2023-12-17T22:40:38Z","title":"Your Student is Better Than Expected: Adaptive Teacher-Student\n Collaboration for Text-Conditional Diffusion Models","summary":" Knowledge distillation methods have recently shown to be a promising\ndirection to speedup the synthesis of large-scale diffusion models by requiring\nonly a few inference steps. While several powerful distillation methods were\nrecently proposed, the overall quality of student samples is typically lower\ncompared to the teacher ones, which hinders their practical usage. In this\nwork, we investigate the relative quality of samples produced by the teacher\ntext-to-image diffusion model and its distilled student version. As our main\nempirical finding, we discover that a noticeable portion of student samples\nexhibit superior fidelity compared to the teacher ones, despite the\n\"approximate\" nature of the student. Based on this finding, we propose an\nadaptive collaboration between student and teacher diffusion models for\neffective text-to-image synthesis. Specifically, the distilled model produces\nthe initial sample, and then an oracle decides whether it needs further\nimprovements with a slow teacher model. Extensive experiments demonstrate that\nthe designed pipeline surpasses state-of-the-art text-to-image alternatives for\nvarious inference budgets in terms of human preference. 
Furthermore, the\nproposed approach can be naturally used in popular applications such as\ntext-guided image editing and controllable generation.\n","authors":["Nikita Starodubcev","Artem Fedorov","Artem Babenko","Dmitry Baranchuk"],"pdf_url":"https://arxiv.org/pdf/2312.10835v3.pdf","comment":"CVPR2024 camera ready"},{"id":"http://arxiv.org/abs/2312.10389v2","updated":"2024-04-03T15:54:15Z","published":"2023-12-16T09:04:44Z","title":"ElasticLaneNet: An Efficient Geometry-Flexible Approach for Lane\n Detection","summary":" The task of lane detection involves identifying the boundaries of driving\nareas in real-time. Recognizing lanes with variable and complex geometric\nstructures remains a challenge. In this paper, we explore a novel and flexible\nway of implicit lanes representation named \\textit{Elastic Lane map (ELM)}, and\nintroduce an efficient physics-informed end-to-end lane detection framework,\nnamely, ElasticLaneNet (Elastic interaction energy-informed Lane detection\nNetwork). The approach considers predicted lanes as moving zero-contours on the\nflexibly shaped \\textit{ELM} that are attracted to the ground truth guided by\nan elastic interaction energy-loss function (EIE loss). Our framework well\nintegrates the global information and low-level features. The method performs\nwell in complex lane scenarios, including those with large curvature, weak\ngeometry features at intersections, complicated cross lanes, Y-shapes lanes,\ndense lanes, etc. We apply our approach on three datasets: SDLane, CULane, and\nTuSimple. The results demonstrate exceptional performance of our method, with\nthe state-of-the-art results on the structurally diverse SDLane, achieving\nF1-score of 89.51, Recall rate of 87.50, and Precision of 91.61 with fast\ninference speed.\n","authors":["Yaxin Feng","Yuan Lan","Luchan Zhang","Yang Xiang"],"pdf_url":"https://arxiv.org/pdf/2312.10389v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.05247v2","updated":"2024-04-03T15:40:00Z","published":"2023-12-08T18:55:24Z","title":"Dynamic LiDAR Re-simulation using Compositional Neural Fields","summary":" We introduce DyNFL, a novel neural field-based approach for high-fidelity\nre-simulation of LiDAR scans in dynamic driving scenes. DyNFL processes LiDAR\nmeasurements from dynamic environments, accompanied by bounding boxes of moving\nobjects, to construct an editable neural field. This field, comprising\nseparately reconstructed static background and dynamic objects, allows users to\nmodify viewpoints, adjust object positions, and seamlessly add or remove\nobjects in the re-simulated scene. A key innovation of our method is the neural\nfield composition technique, which effectively integrates reconstructed neural\nassets from various scenes through a ray drop test, accounting for occlusions\nand transparent surfaces. 
Our evaluation with both synthetic and real-world\nenvironments demonstrates that DyNFL substantially improves dynamic scene LiDAR\nsimulation, offering a combination of physical fidelity and flexible editing\ncapabilities.\n","authors":["Hanfeng Wu","Xingxing Zuo","Stefan Leutenegger","Or Litany","Konrad Schindler","Shengyu Huang"],"pdf_url":"https://arxiv.org/pdf/2312.05247v2.pdf","comment":"Project page: https://shengyuh.github.io/dynfl"},{"id":"http://arxiv.org/abs/2312.15702v2","updated":"2024-04-03T15:38:12Z","published":"2023-12-25T11:54:07Z","title":"Three Heads Are Better Than One: Complementary Experts for Long-Tailed\n Semi-supervised Learning","summary":" We address the challenging problem of Long-Tailed Semi-Supervised Learning\n(LTSSL) where labeled data exhibit imbalanced class distribution and unlabeled\ndata follow an unknown distribution. Unlike in balanced SSL, the generated\npseudo-labels are skewed towards head classes, intensifying the training bias.\nSuch a phenomenon is even amplified as more unlabeled data will be mislabeled\nas head classes when the class distribution of labeled and unlabeled datasets\nare mismatched. To solve this problem, we propose a novel method named\nComPlementary Experts (CPE). Specifically, we train multiple experts to model\nvarious class distributions, each of them yielding high-quality pseudo-labels\nwithin one form of class distribution. Besides, we introduce Classwise Batch\nNormalization for CPE to avoid performance degradation caused by feature\ndistribution mismatch between head and non-head classes. CPE achieves\nstate-of-the-art performances on CIFAR-10-LT, CIFAR-100-LT, and STL-10-LT\ndataset benchmarks. For instance, on CIFAR-10-LT, CPE improves test accuracy by\nover 2.22% compared to baselines. Code is available at\nhttps://github.com/machengcheng2016/CPE-LTSSL.\n","authors":["Chengcheng Ma","Ismail Elezi","Jiankang Deng","Weiming Dong","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2312.15702v2.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2404.02813v1","updated":"2024-04-03T15:37:02Z","published":"2024-04-03T15:37:02Z","title":"GPU-Accelerated RSF Level Set Evolution for Large-Scale Microvascular\n Segmentation","summary":" Microvascular networks are challenging to model because these structures are\ncurrently near the diffraction limit for most advanced three-dimensional\nimaging modalities, including confocal and light sheet microscopy. This makes\nsemantic segmentation difficult, because individual components of these\nnetworks fluctuate within the confines of individual pixels. Level set methods\nare ideally suited to solve this problem by providing surface and topological\nconstraints on the resulting model, however these active contour techniques are\nextremely time intensive and impractical for terabyte-scale images. We propose\na reformulation and implementation of the region-scalable fitting (RSF) level\nset model that makes it amenable to three-dimensional evaluation using both\nsingle-instruction multiple data (SIMD) and single-program multiple-data (SPMD)\nparallel processing. 
This enables evaluation of the level set equation on\nindependent regions of the data set using graphics processing units (GPUs),\nmaking large-scale segmentation of high-resolution networks practical and\ninexpensive.\n We tested this 3D parallel RSF approach on multiple data sets acquired using\nstate-of-the-art imaging techniques to acquire microvascular data, including\nmicro-CT, light sheet fluorescence microscopy (LSFM) and milling microscopy. To\nassess the performance and accuracy of the RSF model, we conducted a\nMonte-Carlo-based validation technique to compare results to other segmentation\nmethods. We also provide a rigorous profiling to show the gains in processing\nspeed leveraging parallel hardware. This study showcases the practical\napplication of the RSF model, emphasizing its utility in the challenging domain\nof segmenting large-scale high-topology network structures with a particular\nfocus on building microvascular models.\n","authors":["Meher Niger","Helya Goharbavang","Taeyong Ahn","Emily K. Alley","Joshua D. Wythe","Guoning Chen","David Mayerich"],"pdf_url":"https://arxiv.org/pdf/2404.02813v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01449v2","updated":"2024-04-03T15:32:17Z","published":"2023-10-02T01:30:42Z","title":"Elastic Interaction Energy-Informed Real-Time Traffic Scene Perception","summary":" Urban segmentation and lane detection are two important tasks for traffic\nscene perception. Accuracy and fast inference speed of visual perception are\ncrucial for autonomous driving safety. Fine and complex geometric objects are\nthe most challenging but important recognition targets in traffic scene, such\nas pedestrians, traffic signs and lanes. In this paper, a simple and efficient\ntopology-aware energy loss function-based network training strategy named\nEIEGSeg is proposed. EIEGSeg is designed for multi-class segmentation on\nreal-time traffic scene perception. To be specific, the convolutional neural\nnetwork (CNN) extracts image features and produces multiple outputs, and the\nelastic interaction energy loss function (EIEL) drives the predictions moving\ntoward the ground truth until they are completely overlapped. Our strategy\nperforms well especially on fine-scale structure, \\textit{i.e.} small or\nirregularly shaped objects can be identified more accurately, and discontinuity\nissues on slender objects can be improved. We quantitatively and qualitatively\nanalyze our method on three traffic datasets, including urban scene\nsegmentation data Cityscapes and lane detection data TuSimple and CULane. Our\nresults demonstrate that EIEGSeg consistently improves the performance,\nespecially on real-time, lightweight networks that are better suited for\nautonomous driving.\n","authors":["Yaxin Feng","Yuan Lan","Luchan Zhang","Guoqing Liu","Yang Xiang"],"pdf_url":"https://arxiv.org/pdf/2310.01449v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08730v2","updated":"2024-04-03T15:22:23Z","published":"2024-03-13T17:29:45Z","title":"Strengthening Multimodal Large Language Model with Bootstrapped\n Preference Optimization","summary":" Multimodal Large Language Models (MLLMs) excel in generating responses based\non visual inputs. However, they often suffer from a bias towards generating\nresponses similar to their pretraining corpus, overshadowing the importance of\nvisual information. We treat this bias as a \"preference\" for pretraining\nstatistics, which hinders the model's grounding in visual input. 
To mitigate\nthis issue, we propose Bootstrapped Preference Optimization (BPO), which\nconducts preference learning with datasets containing negative responses\nbootstrapped from the model itself. Specifically, we propose the following two\nstrategies: 1) using distorted image inputs to the MLLM for eliciting responses\nthat contain signified pretraining bias; 2) leveraging text-based LLM to\nexplicitly inject erroneous but common elements into the original response.\nThose undesirable responses are paired with original annotated responses from\nthe datasets to construct the preference dataset, which is subsequently\nutilized to perform preference learning. Our approach effectively suppresses\npretrained LLM bias, enabling enhanced grounding in visual inputs. Extensive\nexperimentation demonstrates significant performance improvements across\nmultiple benchmarks, advancing the state-of-the-art in multimodal\nconversational systems.\n","authors":["Renjie Pi","Tianyang Han","Wei Xiong","Jipeng Zhang","Runtao Liu","Rui Pan","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.08730v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2012.02689v2","updated":"2024-04-03T15:18:00Z","published":"2020-12-04T15:58:34Z","title":"Isometric Multi-Shape Matching","summary":" Finding correspondences between shapes is a fundamental problem in computer\nvision and graphics, which is relevant for many applications, including 3D\nreconstruction, object tracking, and style transfer. The vast majority of\ncorrespondence methods aim to find a solution between pairs of shapes, even if\nmultiple instances of the same class are available. While isometries are often\nstudied in shape correspondence problems, they have not been considered\nexplicitly in the multi-matching setting. This paper closes this gap by\nproposing a novel optimisation formulation for isometric multi-shape matching.\nWe present a suitable optimisation algorithm for solving our formulation and\nprovide a convergence and complexity analysis. Our algorithm obtains\nmulti-matchings that are by construction provably cycle-consistent. We\ndemonstrate the superior performance of our method on various datasets and set\nthe new state-of-the-art in isometric multi-shape matching.\n","authors":["Maolin Gao","Zorah Lähner","Johan Thunberg","Daniel Cremers","Florian Bernard"],"pdf_url":"https://arxiv.org/pdf/2012.02689v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07169v3","updated":"2024-04-03T15:11:33Z","published":"2023-12-12T11:13:17Z","title":"Semi-supervised Active Learning for Video Action Detection","summary":" In this work, we focus on label efficient learning for video action\ndetection. We develop a novel semi-supervised active learning approach which\nutilizes both labeled as well as unlabeled data along with informative sample\nselection for action detection. Video action detection requires spatio-temporal\nlocalization along with classification, which poses several challenges for both\nactive learning informative sample selection as well as semi-supervised\nlearning pseudo label generation. First, we propose NoiseAug, a simple\naugmentation strategy which effectively selects informative samples for video\naction detection. Next, we propose fft-attention, a novel technique based on\nhigh-pass filtering which enables effective utilization of pseudo label for SSL\nin video action detection by emphasizing on relevant activity region within a\nvideo. 
We evaluate the proposed approach on three different benchmark datasets,\nUCF-101-24, JHMDB-21, and Youtube-VOS. First, we demonstrate its effectiveness\non video action detection where the proposed approach outperforms prior works\nin semi-supervised and weakly-supervised learning along with several baseline\napproaches in both UCF101-24 and JHMDB-21. Next, we also show its effectiveness\non Youtube-VOS for video object segmentation demonstrating its generalization\ncapability for other dense prediction tasks in videos. The code and models is\npublicly available at:\n\\url{https://github.com/AKASH2907/semi-sup-active-learning}.\n","authors":["Ayush Singh","Aayush J Rana","Akash Kumar","Shruti Vyas","Yogesh Singh Rawat"],"pdf_url":"https://arxiv.org/pdf/2312.07169v3.pdf","comment":"AAAI Conference on Artificial Intelligence, Main Technical Track\n (AAAI), 2024, Code: https://github.com/AKASH2907/semi-sup-active-learning"},{"id":"http://arxiv.org/abs/2311.16432v2","updated":"2024-04-03T15:05:28Z","published":"2023-11-28T02:27:31Z","title":"Text-Driven Image Editing via Learnable Regions","summary":" Language has emerged as a natural interface for image editing. In this paper,\nwe introduce a method for region-based image editing driven by textual prompts,\nwithout the need for user-provided masks or sketches. Specifically, our\napproach leverages an existing pre-trained text-to-image model and introduces a\nbounding box generator to identify the editing regions that are aligned with\nthe textual prompts. We show that this simple approach enables flexible editing\nthat is compatible with current image generation models, and is able to handle\ncomplex prompts featuring multiple objects, complex sentences, or lengthy\nparagraphs. We conduct an extensive user study to compare our method against\nstate-of-the-art methods. The experiments demonstrate the competitive\nperformance of our method in manipulating images with high fidelity and realism\nthat correspond to the provided language descriptions. Our project webpage can\nbe found at: https://yuanze-lin.me/LearnableRegions_page.\n","authors":["Yuanze Lin","Yi-Wen Chen","Yi-Hsuan Tsai","Lu Jiang","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2311.16432v2.pdf","comment":"Accepted to CVPR 2024 Project webpage:\n https://yuanze-lin.me/LearnableRegions_page"},{"id":"http://arxiv.org/abs/2308.08393v2","updated":"2024-04-03T15:04:03Z","published":"2023-08-16T14:25:30Z","title":"SIGMA: Scale-Invariant Global Sparse Shape Matching","summary":" We propose a novel mixed-integer programming (MIP) formulation for generating\nprecise sparse correspondences for highly non-rigid shapes. To this end, we\nintroduce a projected Laplace-Beltrami operator (PLBO) which combines intrinsic\nand extrinsic geometric information to measure the deformation quality induced\nby predicted correspondences. We integrate the PLBO, together with an\norientation-aware regulariser, into a novel MIP formulation that can be solved\nto global optimality for many practical problems. In contrast to previous\nmethods, our approach is provably invariant to rigid transformations and global\nscaling, initialisation-free, has optimality guarantees, and scales to high\nresolution meshes with (empirically observed) linear time. 
We show\nstate-of-the-art results for sparse non-rigid matching on several challenging\n3D datasets, including data with inconsistent meshing, as well as applications\nin mesh-to-point-cloud matching.\n","authors":["Maolin Gao","Paul Roetzer","Marvin Eisenberger","Zorah Lähner","Michael Moeller","Daniel Cremers","Florian Bernard"],"pdf_url":"https://arxiv.org/pdf/2308.08393v2.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2210.01708v3","updated":"2024-04-03T15:01:26Z","published":"2022-10-04T16:08:54Z","title":"Conquering the Communication Constraints to Enable Large Pre-Trained\n Models in Federated Learning","summary":" Federated learning (FL) has emerged as a promising paradigm for enabling the\ncollaborative training of models without centralized access to the raw data on\nlocal devices. In the typical FL paradigm (e.g., FedAvg), model weights are\nsent to and from the server each round to participating clients. Recently, the\nuse of small pre-trained models has been shown effective in federated learning\noptimization and improving convergence. However, recent state-of-the-art\npre-trained models are getting more capable but also have more parameters. In\nconventional FL, sharing the enormous model weights can quickly put a massive\ncommunication burden on the system, especially if more capable models are\nemployed. Can we find a solution to enable those strong and readily-available\npre-trained models in FL to achieve excellent performance while simultaneously\nreducing the communication burden? To this end, we investigate the use of\nparameter-efficient fine-tuning in federated learning and thus introduce a new\nframework: FedPEFT. Specifically, we systemically evaluate the performance of\nFedPEFT across a variety of client stability, data distribution, and\ndifferential privacy settings. By only locally tuning and globally sharing a\nsmall portion of the model weights, significant reductions in the total\ncommunication overhead can be achieved while maintaining competitive or even\nbetter performance in a wide range of federated learning scenarios, providing\ninsight into a new paradigm for practical and effective federated systems.\n","authors":["Guangyu Sun","Umar Khalid","Matias Mendieta","Taojiannan Yang","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2210.01708v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.14505v3","updated":"2024-04-03T14:59:08Z","published":"2024-02-22T12:55:01Z","title":"Towards Seamless Adaptation of Pre-trained Models for Visual Place\n Recognition","summary":" Recent studies show that vision models pre-trained in generic visual learning\ntasks with large-scale data can provide useful feature representations for a\nwide range of visual perception problems. However, few attempts have been made\nto exploit pre-trained foundation models in visual place recognition (VPR). Due\nto the inherent difference in training objectives and data between the tasks of\nmodel pre-training and VPR, how to bridge the gap and fully unleash the\ncapability of pre-trained models for VPR is still a key issue to address. To\nthis end, we propose a novel method to realize seamless adaptation of\npre-trained models for VPR. Specifically, to obtain both global and local\nfeatures that focus on salient landmarks for discriminating places, we design a\nhybrid adaptation method to achieve both global and local adaptation\nefficiently, in which only lightweight adapters are tuned without adjusting the\npre-trained model. 
Besides, to guide effective adaptation, we propose a mutual\nnearest neighbor local feature loss, which ensures proper dense local features\nare produced for local matching and avoids time-consuming spatial verification\nin re-ranking. Experimental results show that our method outperforms the\nstate-of-the-art methods with less training data and training time, and uses\nonly about 3% of the retrieval runtime of the two-stage VPR methods with RANSAC-based\nspatial verification. It ranks 1st on the MSLS challenge leaderboard (at the\ntime of submission). The code is released at\nhttps://github.com/Lu-Feng/SelaVPR.\n","authors":["Feng Lu","Lijun Zhang","Xiangyuan Lan","Shuting Dong","Yaowei Wang","Chun Yuan"],"pdf_url":"https://arxiv.org/pdf/2402.14505v3.pdf","comment":"ICLR2024"},{"id":"http://arxiv.org/abs/2404.02790v1","updated":"2024-04-03T14:58:00Z","published":"2024-04-03T14:58:00Z","title":"MULAN: A Multi Layer Annotated Dataset for Controllable Text-to-Image\n Generation","summary":" Text-to-image generation has achieved astonishing results, yet precise\nspatial controllability and prompt fidelity remain highly challenging. This\nlimitation is typically addressed through cumbersome prompt engineering, scene\nlayout conditioning, or image editing techniques which often require hand-drawn\nmasks. Nonetheless, pre-existing works struggle to take advantage of the\nnatural instance-level compositionality of scenes due to the typically flat\nnature of rasterized RGB output images. Towards addressing this challenge, we\nintroduce MuLAn: a novel dataset comprising over 44K MUlti-Layer ANnotations of\nRGB images as multilayer, instance-wise RGBA decompositions, and over 100K\ninstance images. To build MuLAn, we developed a training-free pipeline which\ndecomposes a monocular RGB image into a stack of RGBA layers comprising\nbackground and isolated instances. We achieve this through the use of\npretrained general-purpose models, and by developing three modules: image\ndecomposition for instance discovery and extraction, instance completion to\nreconstruct occluded areas, and image re-assembly. We use our pipeline to\ncreate MuLAn-COCO and MuLAn-LAION datasets, which contain a variety of image\ndecompositions in terms of style, composition and complexity. With MuLAn, we\nprovide the first photorealistic resource providing instance decomposition and\nocclusion information for high quality images, opening up new avenues for\ntext-to-image generative AI research. With this, we aim to encourage the\ndevelopment of novel generation and editing technology, in particular\nlayer-wise solutions. MuLAn data resources are available at\nhttps://MuLAn-dataset.github.io/.\n","authors":["Petru-Daniel Tudosiu","Yongxin Yang","Shifeng Zhang","Fei Chen","Steven McDonagh","Gerasimos Lampouras","Ignacio Iacobacci","Sarah Parisot"],"pdf_url":"https://arxiv.org/pdf/2404.02790v1.pdf","comment":"CVPR 2024 - Project page: https://MuLAn-dataset.github.io/"},{"id":"http://arxiv.org/abs/2404.02788v1","updated":"2024-04-03T14:56:06Z","published":"2024-04-03T14:56:06Z","title":"GenN2N: Generative NeRF2NeRF Translation","summary":" We present GenN2N, a unified NeRF-to-NeRF translation framework for various\nNeRF translation tasks such as text-driven NeRF editing, colorization,\nsuper-resolution, inpainting, etc.
Unlike previous methods designed for\nindividual translation tasks with task-specific schemes, GenN2N achieves all\nthese NeRF editing tasks by employing a plug-and-play image-to-image translator\nto perform editing in the 2D domain and lifting 2D edits into the 3D NeRF\nspace. Since the 3D consistency of 2D edits may not be assured, we propose to\nmodel the distribution of the underlying 3D edits through a generative model\nthat can cover all possible edited NeRFs. To model the distribution of 3D\nedited NeRFs from 2D edited images, we carefully design a VAE-GAN that encodes\nimages while decoding NeRFs. The latent space is trained to align with a\nGaussian distribution and the NeRFs are supervised through an adversarial loss\non its renderings. To ensure the latent code does not depend on 2D viewpoints\nbut truly reflects the 3D edits, we also regularize the latent code through a\ncontrastive learning scheme. Extensive experiments on various editing tasks\nshow GenN2N, as a universal framework, performs as well or better than\ntask-specific specialists while possessing flexible generative power. More\nresults on our project page: https://xiangyueliu.github.io/GenN2N/\n","authors":["Xiangyue Liu","Han Xue","Kunming Luo","Ping Tan","Li Yi"],"pdf_url":"https://arxiv.org/pdf/2404.02788v1.pdf","comment":"Accepted to CVPR 2024. Project page:\n https://xiangyueliu.github.io/GenN2N/"},{"id":"http://arxiv.org/abs/2404.02785v1","updated":"2024-04-03T14:55:17Z","published":"2024-04-03T14:55:17Z","title":"Domain Generalization through Meta-Learning: A Survey","summary":" Deep neural networks (DNNs) have revolutionized artificial intelligence but\noften lack performance when faced with out-of-distribution (OOD) data, a common\nscenario due to the inevitable domain shifts in real-world applications. This\nlimitation stems from the common assumption that training and testing data\nshare the same distribution-an assumption frequently violated in practice.\nDespite their effectiveness with large amounts of data and computational power,\nDNNs struggle with distributional shifts and limited labeled data, leading to\noverfitting and poor generalization across various tasks and domains.\nMeta-learning presents a promising approach by employing algorithms that\nacquire transferable knowledge across various tasks for fast adaptation,\neliminating the need to learn each task from scratch. This survey paper delves\ninto the realm of meta-learning with a focus on its contribution to domain\ngeneralization. We first clarify the concept of meta-learning for domain\ngeneralization and introduce a novel taxonomy based on the feature extraction\nstrategy and the classifier learning methodology, offering a granular view of\nmethodologies. Through an exhaustive review of existing methods and underlying\ntheories, we map out the fundamentals of the field. 
Our survey provides\npractical insights and an informed discussion on promising research directions,\npaving the way for future innovation in meta-learning for domain\ngeneralization.\n","authors":["Arsham Gholamzadeh Khoee","Yinan Yu","Robert Feldt"],"pdf_url":"https://arxiv.org/pdf/2404.02785v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10206v2","updated":"2024-04-03T14:45:52Z","published":"2023-07-14T07:25:47Z","title":"NEAT: Distilling 3D Wireframes from Neural Attraction Fields","summary":" This paper studies the problem of structured 3D reconstruction using\nwireframes that consist of line segments and junctions, focusing on the\ncomputation of structured boundary geometries of scenes. Instead of leveraging\nmatching-based solutions from 2D wireframes (or line segments) for 3D wireframe\nreconstruction as done in prior arts, we present NEAT, a rendering-distilling\nformulation using neural fields to represent 3D line segments with 2D\nobservations, and bipartite matching for perceiving and distilling of a sparse\nset of 3D global junctions. The proposed {NEAT} enjoys the joint optimization\nof the neural fields and the global junctions from scratch, using\nview-dependent 2D observations without precomputed cross-view feature matching.\nComprehensive experiments on the DTU and BlendedMVS datasets demonstrate our\nNEAT's superiority over state-of-the-art alternatives for 3D wireframe\nreconstruction. Moreover, the distilled 3D global junctions by NEAT, are a\nbetter initialization than SfM points, for the recently-emerged 3D Gaussian\nSplatting for high-fidelity novel view synthesis using about 20 times fewer\ninitial 3D points. Project page: \\url{https://xuenan.net/neat}.\n","authors":["Nan Xue","Bin Tan","Yuxi Xiao","Liang Dong","Gui-Song Xia","Tianfu Wu","Yujun Shen"],"pdf_url":"https://arxiv.org/pdf/2307.10206v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2312.12865v3","updated":"2024-04-03T14:39:32Z","published":"2023-12-20T09:27:41Z","title":"RadEdit: stress-testing biomedical vision models via diffusion image\n editing","summary":" Biomedical imaging datasets are often small and biased, meaning that\nreal-world performance of predictive models can be substantially lower than\nexpected from internal testing. This work proposes using generative image\nediting to simulate dataset shifts and diagnose failure modes of biomedical\nvision models; this can be used in advance of deployment to assess readiness,\npotentially reducing cost and patient harm. Existing editing methods can\nproduce undesirable changes, with spurious correlations learned due to the\nco-occurrence of disease and treatment interventions, limiting practical\napplicability. To address this, we train a text-to-image diffusion model on\nmultiple chest X-ray datasets and introduce a new editing method RadEdit that\nuses multiple masks, if present, to constrain changes and ensure consistency in\nthe edited images. We consider three types of dataset shifts: acquisition\nshift, manifestation shift, and population shift, and demonstrate that our\napproach can diagnose failures and quantify model robustness without additional\ndata collection, complementing more qualitative tools for explainable AI.\n","authors":["Fernando Pérez-García","Sam Bond-Taylor","Pedro P. Sanchez","Boris van Breugel","Daniel C. Castro","Harshita Sharma","Valentina Salvatelli","Maria T. A. Wetscherek","Hannah Richardson","Matthew P. 
Lungren","Aditya Nori","Javier Alvarez-Valle","Ozan Oktay","Maximilian Ilse"],"pdf_url":"https://arxiv.org/pdf/2312.12865v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.06757v3","updated":"2024-04-03T14:31:43Z","published":"2022-11-12T22:29:42Z","title":"DriftRec: Adapting diffusion models to blind JPEG restoration","summary":" In this work, we utilize the high-fidelity generation abilities of diffusion\nmodels to solve blind JPEG restoration at high compression levels. We propose\nan elegant modification of the forward stochastic differential equation of\ndiffusion models to adapt them to this restoration task and name our method\nDriftRec. Comparing DriftRec against an $L_2$ regression baseline with the same\nnetwork architecture and state-of-the-art techniques for JPEG restoration, we\nshow that our approach can escape the tendency of other methods to generate\nblurry images, and recovers the distribution of clean images significantly more\nfaithfully. For this, only a dataset of clean/corrupted image pairs and no\nknowledge about the corruption operation is required, enabling wider\napplicability to other restoration tasks. In contrast to other conditional and\nunconditional diffusion models, we utilize the idea that the distributions of\nclean and corrupted images are much closer to each other than each is to the\nusual Gaussian prior of the reverse process in diffusion models. Our approach\ntherefore requires only low levels of added noise and needs comparatively few\nsampling steps even without further optimizations. We show that DriftRec\nnaturally generalizes to realistic and difficult scenarios such as unaligned\ndouble JPEG compression and blind restoration of JPEGs found online, without\nhaving encountered such examples during training.\n","authors":["Simon Welker","Henry N. Chapman","Timo Gerkmann"],"pdf_url":"https://arxiv.org/pdf/2211.06757v3.pdf","comment":"(C) 2024 IEEE. Personal use of this material is permitted. Permission\n from IEEE must be obtained for all other uses, in any current or future\n media, including reprinting/republishing this material for advertising or\n promotional purposes, creating new collective works, for resale or\n redistribution to servers or lists, or reuse of any copyrighted component of\n this work in other works"},{"id":"http://arxiv.org/abs/2312.02145v2","updated":"2024-04-03T14:14:18Z","published":"2023-12-04T18:59:13Z","title":"Repurposing Diffusion-Based Image Generators for Monocular Depth\n Estimation","summary":" Monocular depth estimation is a fundamental computer vision task. Recovering\n3D depth from a single image is geometrically ill-posed and requires scene\nunderstanding, so it is not surprising that the rise of deep learning has led\nto a breakthrough. The impressive progress of monocular depth estimators has\nmirrored the growth in model capacity, from relatively modest CNNs to large\nTransformer architectures. Still, monocular depth estimators tend to struggle\nwhen presented with images with unfamiliar content and layout, since their\nknowledge of the visual world is restricted by the data seen during training,\nand challenged by zero-shot generalization to new domains. This motivates us to\nexplore whether the extensive priors captured in recent generative diffusion\nmodels can enable better, more generalizable depth estimation. We introduce\nMarigold, a method for affine-invariant monocular depth estimation that is\nderived from Stable Diffusion and retains its rich prior knowledge. 
The\nestimator can be fine-tuned in a couple of days on a single GPU using only\nsynthetic training data. It delivers state-of-the-art performance across a wide\nrange of datasets, including over 20% performance gains in specific cases.\nProject page: https://marigoldmonodepth.github.io.\n","authors":["Bingxin Ke","Anton Obukhov","Shengyu Huang","Nando Metzger","Rodrigo Caye Daudt","Konrad Schindler"],"pdf_url":"https://arxiv.org/pdf/2312.02145v2.pdf","comment":"CVPR 2024 camera ready"},{"id":"http://arxiv.org/abs/2306.09320v4","updated":"2024-04-03T14:09:58Z","published":"2023-06-15T17:55:05Z","title":"Learnable Weight Initialization for Volumetric Medical Image\n Segmentation","summary":" Hybrid volumetric medical image segmentation models, combining the advantages\nof local convolution and global attention, have recently received considerable\nattention. While mainly focusing on architectural modifications, most existing\nhybrid approaches still use conventional data-independent weight initialization\nschemes which restrict their performance due to ignoring the inherent\nvolumetric nature of the medical data. To address this issue, we propose a\nlearnable weight initialization approach that utilizes the available medical\ntraining data to effectively learn the contextual and structural cues via the\nproposed self-supervised objectives. Our approach is easy to integrate into any\nhybrid model and requires no external training data. Experiments on multi-organ\nand lung cancer segmentation tasks demonstrate the effectiveness of our\napproach, leading to state-of-the-art segmentation performance. Our proposed\ndata-dependent initialization approach performs favorably as compared to the\nSwin-UNETR model pretrained using large-scale datasets on multi-organ\nsegmentation task. Our source code and models are available at:\nhttps://github.com/ShahinaKK/LWI-VMS.\n","authors":["Shahina Kunhimon","Abdelrahman Shaker","Muzammal Naseer","Salman Khan","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2306.09320v4.pdf","comment":"Accepted at Elsevier AI in Medicine Journal"},{"id":"http://arxiv.org/abs/2404.02759v1","updated":"2024-04-03T14:05:39Z","published":"2024-04-03T14:05:39Z","title":"Unsupervised Occupancy Learning from Sparse Point Cloud","summary":" Implicit Neural Representations have gained prominence as a powerful\nframework for capturing complex data modalities, encompassing a wide range from\n3D shapes to images and audio. Within the realm of 3D shape representation,\nNeural Signed Distance Functions (SDF) have demonstrated remarkable potential\nin faithfully encoding intricate shape geometry. However, learning SDFs from 3D\npoint clouds in the absence of ground truth supervision remains a very\nchallenging task. In this paper, we propose a method to infer occupancy fields\ninstead of SDFs as they are easier to learn from sparse inputs. We leverage a\nmargin-based uncertainty measure to differentially sample from the decision\nboundary of the occupancy function and supervise the sampled boundary points\nusing the input point cloud. 
We further stabilize the optimization process at\nthe early stages of the training by biasing the occupancy function towards\nminimal entropy fields while maximizing its entropy at the input point cloud.\nThrough extensive experiments and evaluations, we illustrate the efficacy of\nour proposed method, highlighting its capacity to improve implicit shape\ninference with respect to baselines and the state-of-the-art using synthetic\nand real data.\n","authors":["Amine Ouasfi","Adnane Boukhayma"],"pdf_url":"https://arxiv.org/pdf/2404.02759v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.02755v1","updated":"2024-04-03T13:57:08Z","published":"2024-04-03T13:57:08Z","title":"DIBS: Enhancing Dense Video Captioning with Unlabeled Videos via Pseudo\n Boundary Enrichment and Online Refinement","summary":" We present Dive Into the BoundarieS (DIBS), a novel pretraining framework for\ndense video captioning (DVC), that elaborates on improving the quality of the\ngenerated event captions and their associated pseudo event boundaries from\nunlabeled videos. By leveraging the capabilities of diverse large language\nmodels (LLMs), we generate rich DVC-oriented caption candidates and optimize\nthe corresponding pseudo boundaries under several meticulously designed\nobjectives, considering diversity, event-centricity, temporal ordering, and\ncoherence. Moreover, we further introduce a novel online boundary refinement\nstrategy that iteratively improves the quality of pseudo boundaries during\ntraining. Comprehensive experiments have been conducted to examine the\neffectiveness of the proposed technique components. By leveraging a substantial\namount of unlabeled video data, such as HowTo100M, we achieve a remarkable\nadvancement on standard DVC datasets like YouCook2 and ActivityNet. We\noutperform the previous state-of-the-art Vid2Seq across a majority of metrics,\nachieving this with just 0.4% of the unlabeled video data used for pre-training\nby Vid2Seq.\n","authors":["Hao Wu","Huabin Liu","Yu Qiao","Xiao Sun"],"pdf_url":"https://arxiv.org/pdf/2404.02755v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.02747v1","updated":"2024-04-03T13:44:41Z","published":"2024-04-03T13:44:41Z","title":"Cross-Attention Makes Inference Cumbersome in Text-to-Image Diffusion\n Models","summary":" This study explores the role of cross-attention during inference in\ntext-conditional diffusion models. We find that cross-attention outputs\nconverge to a fixed point after few inference steps. Accordingly, the time\npoint of convergence naturally divides the entire inference process into two\nstages: an initial semantics-planning stage, during which, the model relies on\ncross-attention to plan text-oriented visual semantics, and a subsequent\nfidelity-improving stage, during which the model tries to generate images from\npreviously planned semantics. Surprisingly, ignoring text conditions in the\nfidelity-improving stage not only reduces computation complexity, but also\nmaintains model performance. This yields a simple and training-free method\ncalled TGATE for efficient generation, which caches the cross-attention output\nonce it converges and keeps it fixed during the remaining inference steps. Our\nempirical study on the MS-COCO validation set confirms its effectiveness. 
The\nsource code of TGATE is available at https://github.com/HaozheLiu-ST/T-GATE.\n","authors":["Wentian Zhang","Haozhe Liu","Jinheng Xie","Francesco Faccio","Mike Zheng Shou","Jürgen Schmidhuber"],"pdf_url":"https://arxiv.org/pdf/2404.02747v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15528v2","updated":"2024-04-03T13:40:14Z","published":"2024-03-22T17:27:18Z","title":"Evaluating GPT-4 with Vision on Detection of Radiological Findings on\n Chest Radiographs","summary":" The study examines the application of GPT-4V, a multi-modal large language\nmodel equipped with visual recognition, in detecting radiological findings from\na set of 100 chest radiographs and suggests that GPT-4V is currently not ready\nfor real-world diagnostic usage in interpreting chest radiographs.\n","authors":["Yiliang Zhou","Hanley Ong","Patrick Kennedy","Carol Wu","Jacob Kazam","Keith Hentel","Adam Flanders","George Shih","Yifan Peng"],"pdf_url":"https://arxiv.org/pdf/2403.15528v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02742v1","updated":"2024-04-03T13:39:29Z","published":"2024-04-03T13:39:29Z","title":"LiDAR4D: Dynamic Neural Fields for Novel Space-time View LiDAR Synthesis","summary":" Although neural radiance fields (NeRFs) have achieved triumphs in image novel\nview synthesis (NVS), LiDAR NVS remains largely unexplored. Previous LiDAR NVS\nmethods employ a simple shift from image NVS methods while ignoring the dynamic\nnature and the large-scale reconstruction problem of LiDAR point clouds. In\nlight of this, we propose LiDAR4D, a differentiable LiDAR-only framework for\nnovel space-time LiDAR view synthesis. In consideration of the sparsity and\nlarge-scale characteristics, we design a 4D hybrid representation combined with\nmulti-planar and grid features to achieve effective reconstruction in a\ncoarse-to-fine manner. Furthermore, we introduce geometric constraints derived\nfrom point clouds to improve temporal consistency. For the realistic synthesis\nof LiDAR point clouds, we incorporate the global optimization of ray-drop\nprobability to preserve cross-region patterns. Extensive experiments on\nKITTI-360 and NuScenes datasets demonstrate the superiority of our method in\naccomplishing geometry-aware and time-consistent dynamic reconstruction. Codes\nare available at https://github.com/ispc-lab/LiDAR4D.\n","authors":["Zehan Zheng","Fan Lu","Weiyi Xue","Guang Chen","Changjun Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.02742v1.pdf","comment":"Accepted by CVPR 2024. Project Page:\n https://dyfcalid.github.io/LiDAR4D"},{"id":"http://arxiv.org/abs/2404.02135v2","updated":"2024-04-03T13:36:38Z","published":"2024-04-02T17:48:46Z","title":"ResNet with Integrated Convolutional Block Attention Module for Ship\n Classification Using Transfer Learning on Optical Satellite Imagery","summary":" This study proposes a novel transfer learning framework for effective ship\nclassification using high-resolution optical remote sensing satellite imagery.\nThe framework is based on the deep convolutional neural network model ResNet50\nand incorporates the Convolutional Block Attention Module (CBAM) to enhance\nperformance. CBAM enables the model to attend to salient features in the\nimages, allowing it to better discriminate between subtle differences between\nships and backgrounds. Furthermore, this study adopts a transfer learning\napproach tailored for accurately classifying diverse types of ships by\nfine-tuning a pre-trained model for the specific task. 
Experimental results\ndemonstrate the efficacy of the proposed framework in ship classification using\noptical remote sensing imagery, achieving a high classification accuracy of 94%\nacross 5 classes, outperforming existing methods. This research holds potential\napplications in maritime surveillance and management, illegal fishing\ndetection, and maritime traffic monitoring.\n","authors":["Ryan Donghan Kwon","Gangjoo Robin Nam","Jisoo Tak","Yeom Hyeok","Junseob Shin","Hyerin Cha","Kim Soo Bin"],"pdf_url":"https://arxiv.org/pdf/2404.02135v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02738v1","updated":"2024-04-03T13:35:51Z","published":"2024-04-03T13:35:51Z","title":"Adaptive Affinity-Based Generalization For MRI Imaging Segmentation\n Across Resource-Limited Settings","summary":" The joint utilization of diverse data sources for medical imaging\nsegmentation has emerged as a crucial area of research, aiming to address\nchallenges such as data heterogeneity, domain shift, and data quality\ndiscrepancies. Integrating information from multiple data domains has shown\npromise in improving model generalizability and adaptability. However, this\napproach often demands substantial computational resources, hindering its\npracticality. In response, knowledge distillation (KD) has garnered attention\nas a solution. KD involves training light-weight models to emulate the behavior\nof more resource-intensive models, thereby mitigating the computational burden\nwhile maintaining performance. This paper addresses the pressing need to\ndevelop a lightweight and generalizable model for medical imaging segmentation\nthat can effectively handle data integration challenges. Our proposed approach\nintroduces a novel relation-based knowledge framework by seamlessly combining\nadaptive affinity-based and kernel-based distillation through a gram matrix\nthat can capture the style representation across features. This methodology\nempowers the student model to accurately replicate the feature representations\nof the teacher model, facilitating robust performance even in the face of\ndomain shift and data heterogeneity. To validate our innovative approach, we\nconducted experiments on publicly available multi-source prostate MRI data. The\nresults demonstrate a significant enhancement in segmentation performance using\nlightweight networks. Notably, our method achieves this improvement while\nreducing both inference time and storage usage, rendering it a practical and\nefficient solution for real-time medical imaging segmentation.\n","authors":["Eddardaa B. Loussaief","Mohammed Ayad","Domenc Puig","Hatem A. Rashwan"],"pdf_url":"https://arxiv.org/pdf/2404.02738v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02733v1","updated":"2024-04-03T13:34:09Z","published":"2024-04-03T13:34:09Z","title":"InstantStyle: Free Lunch towards Style-Preserving in Text-to-Image\n Generation","summary":" Tuning-free diffusion-based models have demonstrated significant potential in\nthe realm of image personalization and customization. However, despite this\nnotable progress, current models continue to grapple with several complex\nchallenges in producing style-consistent image generation. Firstly, the concept\nof style is inherently underdetermined, encompassing a multitude of elements\nsuch as color, material, atmosphere, design, and structure, among others.\nSecondly, inversion-based methods are prone to style degradation, often\nresulting in the loss of fine-grained details. 
Lastly, adapter-based approaches\nfrequently require meticulous weight tuning for each reference image to achieve\na balance between style intensity and text controllability. In this paper, we\ncommence by examining several compelling yet frequently overlooked\nobservations. We then proceed to introduce InstantStyle, a framework designed\nto address these issues through the implementation of two key strategies: 1) A\nstraightforward mechanism that decouples style and content from reference\nimages within the feature space, predicated on the assumption that features\nwithin the same space can be either added to or subtracted from one another. 2)\nThe injection of reference image features exclusively into style-specific\nblocks, thereby preventing style leaks and eschewing the need for cumbersome\nweight tuning, which often characterizes more parameter-heavy designs.Our work\ndemonstrates superior visual stylization outcomes, striking an optimal balance\nbetween the intensity of style and the controllability of textual elements. Our\ncodes will be available at https://github.com/InstantStyle/InstantStyle.\n","authors":["Haofan Wang","Qixun Wang","Xu Bai","Zekui Qin","Anthony Chen"],"pdf_url":"https://arxiv.org/pdf/2404.02733v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2404.02731v1","updated":"2024-04-03T13:30:56Z","published":"2024-04-03T13:30:56Z","title":"Event Camera Demosaicing via Swin Transformer and Pixel-focus Loss","summary":" Recent research has highlighted improvements in high-quality imaging guided\nby event cameras, with most of these efforts concentrating on the RGB domain.\nHowever, these advancements frequently neglect the unique challenges introduced\nby the inherent flaws in the sensor design of event cameras in the RAW domain.\nSpecifically, this sensor design results in the partial loss of pixel values,\nposing new challenges for RAW domain processes like demosaicing. The challenge\nintensifies as most research in the RAW domain is based on the premise that\neach pixel contains a value, making the straightforward adaptation of these\nmethods to event camera demosaicing problematic. To end this, we present a\nSwin-Transformer-based backbone and a pixel-focus loss function for demosaicing\nwith missing pixel values in RAW domain processing. Our core motivation is to\nrefine a general and widely applicable foundational model from the RGB domain\nfor RAW domain processing, thereby broadening the model's applicability within\nthe entire imaging process. Our method harnesses multi-scale processing and\nspace-to-depth techniques to ensure efficiency and reduce computing complexity.\nWe also proposed the Pixel-focus Loss function for network fine-tuning to\nimprove network convergence based on our discovery of a long-tailed\ndistribution in training loss. Our method has undergone validation on the MIPI\nDemosaic Challenge dataset, with subsequent analytical experimentation\nconfirming its efficacy. 
All code and trained models are released here:\nhttps://github.com/yunfanLu/ev-demosaic\n","authors":["Yunfan Lu","Yijie Xu","Wenzong Ma","Weiyu Guo","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2404.02731v1.pdf","comment":"Accepted for the CVPR 2024 Workshop on Mobile Intelligent Photography\n & Imaging"},{"id":"http://arxiv.org/abs/2404.02726v1","updated":"2024-04-03T13:27:54Z","published":"2024-04-03T13:27:54Z","title":"Harnessing the Power of Large Vision Language Models for Synthetic Image\n Detection","summary":" In recent years, the emergence of models capable of generating images from\ntext has attracted considerable interest, offering the possibility of creating\nrealistic images from text descriptions. Yet these advances have also raised\nconcerns about the potential misuse of these images, including the creation of\nmisleading content such as fake news and propaganda. This study investigates\nthe effectiveness of using advanced vision-language models (VLMs) for synthetic\nimage identification. Specifically, the focus is on tuning state-of-the-art\nimage captioning models for synthetic image detection. By harnessing the robust\nunderstanding capabilities of large VLMs, the aim is to distinguish authentic\nimages from synthetic images produced by diffusion-based models. This study\ncontributes to the advancement of synthetic image detection by exploiting the\ncapabilities of visual language models such as BLIP-2 and ViTGPT2. By tailoring\nimage captioning models, we address the challenges associated with the\npotential misuse of synthetic images in real-world applications. Results\ndescribed in this paper highlight the promising role of VLMs in the field of\nsynthetic image detection, outperforming conventional image-based detection\ntechniques. Code and models can be found at\nhttps://github.com/Mamadou-Keita/VLM-DETECT.\n","authors":["Mamadou Keita","Wassim Hamidouche","Hassen Bougueffa","Abdenour Hadid","Abdelmalik Taleb-Ahmed"],"pdf_url":"https://arxiv.org/pdf/2404.02726v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2404.01959"},{"id":"http://arxiv.org/abs/2312.09056v2","updated":"2024-04-03T13:09:27Z","published":"2023-12-14T15:53:07Z","title":"ReCoRe: Regularized Contrastive Representation Learning of World Model","summary":" While recent model-free Reinforcement Learning (RL) methods have demonstrated\nhuman-level effectiveness in gaming environments, their success in everyday\ntasks like visual navigation has been limited, particularly under significant\nappearance variations. This limitation arises from (i) poor sample efficiency\nand (ii) over-fitting to training scenarios. To address these challenges, we\npresent a world model that learns invariant features using (i) contrastive\nunsupervised learning and (ii) an intervention-invariant regularizer. Learning\nan explicit representation of the world dynamics i.e. a world model, improves\nsample efficiency while contrastive learning implicitly enforces learning of\ninvariant features, which improves generalization. However, the na\\\"ive\nintegration of contrastive loss to world models is not good enough, as\nworld-model-based RL methods independently optimize representation learning and\nagent policy. To overcome this issue, we propose an intervention-invariant\nregularizer in the form of an auxiliary task such as depth prediction, image\ndenoising, image segmentation, etc., that explicitly enforces invariance to\nstyle interventions. 
Our method outperforms current state-of-the-art\nmodel-based and model-free RL methods and significantly improves on\nout-of-distribution point navigation tasks evaluated on the iGibson benchmark.\nWith only visual observations, we further demonstrate that our approach\noutperforms recent language-guided foundation models for point navigation,\nwhich is essential for deployment on robots with limited computation\ncapabilities. Finally, we demonstrate that our proposed model excels at the\nsim-to-real transfer of its perception module on the Gibson benchmark.\n","authors":["Rudra P. K. Poudel","Harit Pandya","Stephan Liwicki","Roberto Cipolla"],"pdf_url":"https://arxiv.org/pdf/2312.09056v2.pdf","comment":"Accepted at CVPR 2024. arXiv admin note: text overlap with\n arXiv:2209.14932"},{"id":"http://arxiv.org/abs/2403.13352v3","updated":"2024-04-03T13:08:55Z","published":"2024-03-20T07:31:07Z","title":"AGFSync: Leveraging AI-Generated Feedback for Preference Optimization in\n Text-to-Image Generation","summary":" Text-to-Image (T2I) diffusion models have achieved remarkable success in\nimage generation. Despite their progress, challenges remain in both\nprompt-following ability, image quality and lack of high-quality datasets,\nwhich are essential for refining these models. As acquiring labeled data is\ncostly, we introduce AGFSync, a framework that enhances T2I diffusion models\nthrough Direct Preference Optimization (DPO) in a fully AI-driven approach.\nAGFSync utilizes Vision-Language Models (VLM) to assess image quality across\nstyle, coherence, and aesthetics, generating feedback data within an AI-driven\nloop. By applying AGFSync to leading T2I models such as SD v1.4, v1.5, and\nSDXL, our extensive experiments on the TIFA dataset demonstrate notable\nimprovements in VQA scores, aesthetic evaluations, and performance on the HPSv2\nbenchmark, consistently outperforming the base models. AGFSync's method of\nrefining T2I diffusion models paves the way for scalable alignment techniques.\n","authors":["Jingkun An","Yinghao Zhu","Zongjian Li","Haoran Feng","Bohua Chen","Yemin Shi","Chengwei Pan"],"pdf_url":"https://arxiv.org/pdf/2403.13352v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05073v4","updated":"2024-04-03T13:07:55Z","published":"2023-09-10T16:42:11Z","title":"FreeMan: Towards Benchmarking 3D Human Pose Estimation under Real-World\n Conditions","summary":" Estimating the 3D structure of the human body from natural scenes is a\nfundamental aspect of visual perception. 3D human pose estimation is a vital\nstep in advancing fields like AIGC and human-robot interaction, serving as a\ncrucial technique for understanding and interacting with human actions in\nreal-world settings. However, the current datasets, often collected under\nsingle laboratory conditions using complex motion capture equipment and\nunvarying backgrounds, are insufficient. The absence of datasets on variable\nconditions is stalling the progress of this crucial task. To facilitate the\ndevelopment of 3D pose estimation, we present FreeMan, the first large-scale,\nmulti-view dataset collected under the real-world conditions. FreeMan was\ncaptured by synchronizing 8 smartphones across diverse scenarios. It comprises\n11M frames from 8000 sequences, viewed from different perspectives. These\nsequences cover 40 subjects across 10 different scenarios, each with varying\nlighting conditions. 
We have also established an semi-automated pipeline\ncontaining error detection to reduce the workload of manual check and ensure\nprecise annotation. We provide comprehensive evaluation baselines for a range\nof tasks, underlining the significant challenges posed by FreeMan. Further\nevaluations of standard indoor/outdoor human sensing datasets reveal that\nFreeMan offers robust representation transferability in real and complex\nscenes. Code and data are available at https://wangjiongw.github.io/freeman.\n","authors":["Jiong Wang","Fengyu Yang","Wenbo Gou","Bingliang Li","Danqi Yan","Ailing Zeng","Yijun Gao","Junle Wang","Yanqing Jing","Ruimao Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.05073v4.pdf","comment":"CVPR2024 camera ready version. 19 pages, 16 figures. Project page:\n https://wangjiongw.github.io/freeman/ ; API:\n https://github.com/wangjiongw/FreeMan_API"},{"id":"http://arxiv.org/abs/2404.02697v1","updated":"2024-04-03T12:54:16Z","published":"2024-04-03T12:54:16Z","title":"Model-agnostic Origin Attribution of Generated Images with Few-shot\n Examples","summary":" Recent progress in visual generative models enables the generation of\nhigh-quality images. To prevent the misuse of generated images, it is important\nto identify the origin model that generates them. In this work, we study the\norigin attribution of generated images in a practical setting where only a few\nimages generated by a source model are available and the source model cannot be\naccessed. The goal is to check if a given image is generated by the source\nmodel. We first formulate this problem as a few-shot one-class classification\ntask. To solve the task, we propose OCC-CLIP, a CLIP-based framework for\nfew-shot one-class classification, enabling the identification of an image's\nsource model, even among multiple candidates. Extensive experiments\ncorresponding to various generative models verify the effectiveness of our\nOCC-CLIP framework. Furthermore, an experiment based on the recently released\nDALL-E 3 API verifies the real-world applicability of our solution.\n","authors":["Fengyuan Liu","Haochen Luo","Yiming Li","Philip Torr","Jindong Gu"],"pdf_url":"https://arxiv.org/pdf/2404.02697v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.12685v2","updated":"2024-04-03T12:47:15Z","published":"2023-09-22T07:51:17Z","title":"eWand: A calibration framework for wide baseline frame-based and\n event-based camera systems","summary":" Accurate calibration is crucial for using multiple cameras to triangulate the\nposition of objects precisely. However, it is also a time-consuming process\nthat needs to be repeated for every displacement of the cameras. The standard\napproach is to use a printed pattern with known geometry to estimate the\nintrinsic and extrinsic parameters of the cameras. The same idea can be applied\nto event-based cameras, though it requires extra work. By using frame\nreconstruction from events, a printed pattern can be detected. A blinking\npattern can also be displayed on a screen. Then, the pattern can be directly\ndetected from the events. Such calibration methods can provide accurate\nintrinsic calibration for both frame- and event-based cameras. However, using\n2D patterns has several limitations for multi-camera extrinsic calibration,\nwith cameras possessing highly different points of view and a wide baseline.\nThe 2D pattern can only be detected from one direction and needs to be of\nsignificant size to compensate for its distance to the camera. 
This makes the\nextrinsic calibration time-consuming and cumbersome. To overcome these\nlimitations, we propose eWand, a new method that uses blinking LEDs inside\nopaque spheres instead of a printed or displayed pattern. Our method provides a\nfaster, easier-to-use extrinsic calibration approach that maintains high\naccuracy for both event- and frame-based cameras.\n","authors":["Thomas Gossard","Andreas Ziegler","Levin Kolmar","Jonas Tebbe","Andreas Zell"],"pdf_url":"https://arxiv.org/pdf/2309.12685v2.pdf","comment":"Accepted for 2024 IEEE International Conference on Robotics and\n Automation (ICRA 2024). Project web page:\n https://cogsys-tuebingen.github.io/ewand/"},{"id":"http://arxiv.org/abs/2401.05827v2","updated":"2024-04-03T12:42:32Z","published":"2024-01-11T10:52:17Z","title":"Hallucination Benchmark in Medical Visual Question Answering","summary":" The recent success of large language and vision models (LLVMs) on vision\nquestion answering (VQA), particularly their applications in medicine\n(Med-VQA), has shown a great potential of realizing effective visual assistants\nfor healthcare. However, these models are not extensively tested on the\nhallucination phenomenon in clinical settings. Here, we created a hallucination\nbenchmark of medical images paired with question-answer sets and conducted a\ncomprehensive evaluation of the state-of-the-art models. The study provides an\nin-depth analysis of current models' limitations and reveals the effectiveness\nof various prompting strategies.\n","authors":["Jinge Wu","Yunsoo Kim","Honghan Wu"],"pdf_url":"https://arxiv.org/pdf/2401.05827v2.pdf","comment":"Accepted to ICLR 2024 Tiny Papers(Notable)"},{"id":"http://arxiv.org/abs/2404.02686v1","updated":"2024-04-03T12:32:13Z","published":"2024-04-03T12:32:13Z","title":"Design2Cloth: 3D Cloth Generation from 2D Masks","summary":" In recent years, there has been a significant shift in the field of digital\navatar research, towards modeling, animating and reconstructing clothed human\nrepresentations, as a key step towards creating realistic avatars. However,\ncurrent 3D cloth generation methods are garment specific or trained completely\non synthetic data, hence lacking fine details and realism. In this work, we\nmake a step towards automatic realistic garment design and propose\nDesign2Cloth, a high fidelity 3D generative model trained on a real world\ndataset from more than 2000 subject scans. To provide vital contribution to the\nfashion industry, we developed a user-friendly adversarial model capable of\ngenerating diverse and detailed clothes simply by drawing a 2D cloth mask.\nUnder a series of both qualitative and quantitative experiments, we showcase\nthat Design2Cloth outperforms current state-of-the-art cloth generative models\nby a large margin. In addition to the generative properties of our network, we\nshowcase that the proposed method can be used to achieve high quality\nreconstructions from single in-the-wild images and 3D scans. 
Dataset, code and\npre-trained model will become publicly available.\n","authors":["Jiali Zheng","Rolandos Alexandros Potamias","Stefanos Zafeiriou"],"pdf_url":"https://arxiv.org/pdf/2404.02686v1.pdf","comment":"Accepted to CVPR 2024, Project page:\n https://jiali-zheng.github.io/Design2Cloth/"},{"id":"http://arxiv.org/abs/2404.02678v1","updated":"2024-04-03T12:21:41Z","published":"2024-04-03T12:21:41Z","title":"Independently Keypoint Learning for Small Object Semantic Correspondence","summary":" Semantic correspondence remains a challenging task for establishing\ncorrespondences between a pair of images with the same category or similar\nscenes due to the large intra-class appearance. In this paper, we introduce a\nnovel problem called 'Small Object Semantic Correspondence (SOSC).' This\nproblem is challenging due to the close proximity of keypoints associated with\nsmall objects, which results in the fusion of these respective features. It is\ndifficult to identify the corresponding key points of the fused features, and\nit is also difficult to be recognized. To address this challenge, we propose\nthe Keypoint Bounding box-centered Cropping (KBC) method, which aims to\nincrease the spatial separation between keypoints of small objects, thereby\nfacilitating independent learning of these keypoints. The KBC method is\nseamlessly integrated into our proposed inference pipeline and can be easily\nincorporated into other methodologies, resulting in significant performance\nenhancements. Additionally, we introduce a novel framework, named KBCNet, which\nserves as our baseline model. KBCNet comprises a Cross-Scale Feature Alignment\n(CSFA) module and an efficient 4D convolutional decoder. The CSFA module is\ndesigned to align multi-scale features, enriching keypoint representations by\nintegrating fine-grained features and deep semantic features. Meanwhile, the 4D\nconvolutional decoder, based on efficient 4D convolution, ensures efficiency\nand rapid convergence. To empirically validate the effectiveness of our\nproposed methodology, extensive experiments are conducted on three widely used\nbenchmarks: PF-PASCAL, PF-WILLOW, and SPair-71k. Our KBC method demonstrates a\nsubstantial performance improvement of 7.5\\% on the SPair-71K dataset,\nproviding compelling evidence of its efficacy.\n","authors":["Hailong Jin","Huiying Li"],"pdf_url":"https://arxiv.org/pdf/2404.02678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16926v5","updated":"2024-04-03T12:08:08Z","published":"2023-11-28T16:31:27Z","title":"LLaFS: When Large Language Models Meet Few-Shot Segmentation","summary":" This paper proposes LLaFS, the first attempt to leverage large language\nmodels (LLMs) in few-shot segmentation. In contrast to the conventional\nfew-shot segmentation methods that only rely on the limited and biased\ninformation from the annotated support images, LLaFS leverages the vast prior\nknowledge gained by LLM as an effective supplement and directly uses the LLM to\nsegment images in a few-shot manner. To enable the text-based LLM to handle\nimage-related tasks, we carefully design an input instruction that allows the\nLLM to produce segmentation results represented as polygons, and propose a\nregion-attribute table to simulate the human visual mechanism and provide\nmulti-modal guidance. We also synthesize pseudo samples and use curriculum\nlearning for pretraining to augment data and achieve better optimization. 
LLaFS\nachieves state-of-the-art results on multiple datasets, showing the potential\nof using LLMs for few-shot computer vision tasks.\n","authors":["Lanyun Zhu","Tianrun Chen","Deyi Ji","Jieping Ye","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2311.16926v5.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2404.02668v1","updated":"2024-04-03T12:06:01Z","published":"2024-04-03T12:06:01Z","title":"RS-Mamba for Large Remote Sensing Image Dense Prediction","summary":" The spatial resolution of remote sensing images is becoming increasingly\nhigher, posing challenges in handling large very-high-resolution (VHR) remote\nsensing images for dense prediction tasks. Models based on convolutional neural\nnetworks are limited in their ability to model global features of remote\nsensing images due to local convolution operations. Transformer based models,\ndespite their global modeling capabilities, face computational challenges with\nlarge VHR images due to their quadratic complexity. The common practice of\ncropping large images into smaller patches leads to a significant loss of\ncontextual information. To address these issues, we propose the Remote Sensing\nMamba (RSM) for dense prediction tasks in VHR remote sensing. RSM is designed\nto model global features of remote sensing images with linear complexity,\nenabling it to process large VHR images effectively. It employs an\nomnidirectional selective scan module to globally model the images in multiple\ndirections, capturing large spatial features from various directions.\nExperiments on semantic segmentation and change detection tasks across various\nobjects demonstrate the effectiveness of RSM. With simple model architecture\nand training approach, RSM achieves state-of-the-art performance on the dense\nprediction tasks of VHR remote sensing. The code for this work will be\navailable at https://github.com/walking-shadow/Official_Remote_Sensing_Mamba.\n","authors":["Sijie Zhao","Hao Chen","Xueliang Zhang","Pengfeng Xiao","Lei Bai","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2404.02668v1.pdf","comment":"13 pages,6 figures"},{"id":"http://arxiv.org/abs/2404.02659v1","updated":"2024-04-03T11:47:20Z","published":"2024-04-03T11:47:20Z","title":"A Satellite Band Selection Framework for Amazon Forest Deforestation\n Detection Task","summary":" The conservation of tropical forests is a topic of significant social and\necological relevance due to their crucial role in the global ecosystem.\nUnfortunately, deforestation and degradation impact millions of hectares\nannually, necessitating government or private initiatives for effective forest\nmonitoring. This study introduces a novel framework that employs the Univariate\nMarginal Distribution Algorithm (UMDA) to select spectral bands from Landsat-8\nsatellite, optimizing the representation of deforested areas. This selection\nguides a semantic segmentation architecture, DeepLabv3+, enhancing its\nperformance. Experimental results revealed several band compositions that\nachieved superior balanced accuracy compared to commonly adopted combinations\nfor deforestation detection, utilizing segment classification via a Support\nVector Machine (SVM). Moreover, the optimal band compositions identified by the\nUMDA-based approach improved the performance of the DeepLabv3+ architecture,\nsurpassing state-of-the-art approaches compared in this study. The observation\nthat a few selected bands outperform the total contradicts the data-driven\nparadigm prevalent in the deep learning field. 
Therefore, this suggests an\nexception to the conventional wisdom that 'more is always better'.\n","authors":["Eduardo Neto","Fabio A. Faria","Amanda A. S. de Oliveira","Álvaro L. Fazenda"],"pdf_url":"https://arxiv.org/pdf/2404.02659v1.pdf","comment":"9 pages, 4 figures, paper accepted for presentation at GECCO 2024"},{"id":"http://arxiv.org/abs/2304.08069v3","updated":"2024-04-03T11:46:48Z","published":"2023-04-17T08:30:02Z","title":"DETRs Beat YOLOs on Real-time Object Detection","summary":" The YOLO series has become the most popular framework for real-time object\ndetection due to its reasonable trade-off between speed and accuracy. However,\nwe observe that the speed and accuracy of YOLOs are negatively affected by the\nNMS. Recently, end-to-end Transformer-based detectors (DETRs) have provided an\nalternative to eliminating NMS. Nevertheless, the high computational cost\nlimits their practicality and hinders them from fully exploiting the advantage\nof excluding NMS. In this paper, we propose the Real-Time DEtection TRansformer\n(RT-DETR), the first real-time end-to-end object detector to our best knowledge\nthat addresses the above dilemma. We build RT-DETR in two steps, drawing on the\nadvanced DETR: first we focus on maintaining accuracy while improving speed,\nfollowed by maintaining speed while improving accuracy. Specifically, we design\nan efficient hybrid encoder to expeditiously process multi-scale features by\ndecoupling intra-scale interaction and cross-scale fusion to improve speed.\nThen, we propose the uncertainty-minimal query selection to provide\nhigh-quality initial queries to the decoder, thereby improving accuracy. In\naddition, RT-DETR supports flexible speed tuning by adjusting the number of\ndecoder layers to adapt to various scenarios without retraining. Our\nRT-DETR-R50 / R101 achieves 53.1% / 54.3% AP on COCO and 108 / 74 FPS on T4\nGPU, outperforming previously advanced YOLOs in both speed and accuracy. We\nalso develop scaled RT-DETRs that outperform the lighter YOLO detectors (S and\nM models). Furthermore, RT-DETR-R50 outperforms DINO-R50 by 2.2% AP in accuracy\nand about 21 times in FPS. After pre-training with Objects365, RT-DETR-R50 /\nR101 achieves 55.3% / 56.2% AP. The project page:\nhttps://zhao-yian.github.io/RTDETR.\n","authors":["Yian Zhao","Wenyu Lv","Shangliang Xu","Jinman Wei","Guanzhong Wang","Qingqing Dang","Yi Liu","Jie Chen"],"pdf_url":"https://arxiv.org/pdf/2304.08069v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02656v1","updated":"2024-04-03T11:37:03Z","published":"2024-04-03T11:37:03Z","title":"Non-negative Subspace Feature Representation for Few-shot Learning in\n Medical Imaging","summary":" Unlike typical visual scene recognition domains, in which massive datasets\nare accessible to deep neural networks, medical image interpretations are often\nobstructed by the paucity of data. In this paper, we investigate the\neffectiveness of data-based few-shot learning in medical imaging by exploring\ndifferent data attribute representations in a low-dimensional space. We\nintroduce different types of non-negative matrix factorization (NMF) in\nfew-shot learning, addressing the data scarcity issue in medical image\nclassification. 
Extensive empirical studies are conducted in terms of\nvalidating the effectiveness of NMF, especially its supervised variants (e.g.,\ndiscriminative NMF, and supervised and constrained NMF with sparseness), and\nthe comparison with principal component analysis (PCA), i.e., the collaborative\nrepresentation-based dimensionality reduction technique derived from\neigenvectors. With 14 different datasets covering 11 distinct illness\ncategories, thorough experimental results and comparison with related\ntechniques demonstrate that NMF is a competitive alternative to PCA for\nfew-shot learning in medical imaging, and the supervised NMF algorithms are\nmore discriminative in the subspace with greater effectiveness. Furthermore, we\nshow that the part-based representation of NMF, especially its supervised\nvariants, is dramatically impactful in detecting lesion areas in medical\nimaging with limited samples.\n","authors":["Keqiang Fan","Xiaohao Cai","Mahesan Niranjan"],"pdf_url":"https://arxiv.org/pdf/2404.02656v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11389v2","updated":"2024-04-03T11:33:14Z","published":"2023-03-20T18:49:39Z","title":"Creating Ensembles of Classifiers through UMDA for Aerial Scene\n Classification","summary":" Aerial scene classification, which aims to semantically label remote sensing\nimages in a set of predefined classes (e.g., agricultural, beach, and harbor),\nis a very challenging task in remote sensing due to high intra-class\nvariability and the different scales and orientations of the objects present in\nthe dataset images. In remote sensing area, the use of CNN architectures as an\nalternative solution is also a reality for scene classification tasks.\nGenerally, these CNNs are used to perform the traditional image classification\ntask. However, another less used way to classify remote sensing image might be\nthe one that uses deep metric learning (DML) approaches. In this sense, this\nwork proposes to employ six DML approaches for aerial scene classification\ntasks, analysing their behave with four different pre-trained CNNs as well as\ncombining them through the use of evolutionary computation algorithm (UMDA). In\nperformed experiments, it is possible to observe than DML approaches can\nachieve the best classification results when compared to traditional\npre-trained CNNs for three well-known remote sensing aerial scene datasets. In\naddition, the UMDA algorithm proved to be a promising strategy to combine DML\napproaches when there is diversity among them, managing to improve at least\n5.6% of accuracy in the classification results using almost 50\\% of the\navailable classifiers for the construction of the final ensemble of\nclassifiers.\n","authors":["Fabio A. Faria","Luiz H. Buris","Luis A. M. Pereira","Fábio A. M. Cappabianco"],"pdf_url":"https://arxiv.org/pdf/2303.11389v2.pdf","comment":"9 pages, 4 figures, accepted for presentation at the GECCO2024"},{"id":"http://arxiv.org/abs/2401.15204v4","updated":"2024-04-03T11:12:16Z","published":"2024-01-26T21:02:44Z","title":"LYT-Net: Lightweight YUV Transformer-based Network for Low-Light Image\n Enhancement","summary":" In recent years, deep learning-based solutions have proven successful in the\ndomains of image enhancement. 
This paper introduces LYT-Net, or Lightweight YUV\nTransformer-based Network, as a novel approach for low-light image enhancement.\nThe proposed architecture, distinct from conventional Retinex-based models,\nleverages the YUV color space's natural separation of luminance (Y) and\nchrominance (U and V) to simplify the intricate task of disentangling light and\ncolor information in images. By utilizing the strengths of transformers, known\nfor their capability to capture long-range dependencies, LYT-Net ensures a\ncomprehensive contextual understanding of the image while maintaining reduced\nmodel complexity. By employing a novel hybrid loss function, our proposed\nmethod achieves state-of-the-art results on low-light image enhancement\ndatasets, all while being considerably more compact than its counterparts. The\nsource code and pre-trained models are available at\nhttps://github.com/albrateanu/LYT-Net\n","authors":["A. Brateanu","R. Balmez","A. Avram","C. Orhei"],"pdf_url":"https://arxiv.org/pdf/2401.15204v4.pdf","comment":"10 pages, 6 figures, submitted to ICIP"},{"id":"http://arxiv.org/abs/2212.05315v3","updated":"2024-04-03T11:03:52Z","published":"2022-12-10T14:49:24Z","title":"Mind The Edge: Refining Depth Edges in Sparsely-Supervised Monocular\n Depth Estimation","summary":" Monocular Depth Estimation (MDE) is a fundamental problem in computer vision\nwith numerous applications. Recently, LIDAR-supervised methods have achieved\nremarkable per-pixel depth accuracy in outdoor scenes. However, significant\nerrors are typically found in the proximity of depth discontinuities, i.e.,\ndepth edges, which often hinder the performance of depth-dependent applications\nthat are sensitive to such inaccuracies, e.g., novel view synthesis and\naugmented reality. Since direct supervision for the location of depth edges is\ntypically unavailable in sparse LIDAR-based scenes, encouraging the MDE model\nto produce correct depth edges is not straightforward. To the best of our\nknowledge this paper is the first attempt to address the depth edges issue for\nLIDAR-supervised scenes. In this work we propose to learn to detect the\nlocation of depth edges from densely-supervised synthetic data, and use it to\ngenerate supervision for the depth edges in the MDE training. To quantitatively\nevaluate our approach, and due to the lack of depth edges GT in LIDAR-based\nscenes, we manually annotated subsets of the KITTI and the DDAD datasets with\ndepth edges ground truth. We demonstrate significant gains in the accuracy of\nthe depth edges with comparable per-pixel depth accuracy on several challenging\ndatasets. Code and datasets are available at\n\\url{https://github.com/liortalker/MindTheEdge}.\n","authors":["Lior Talker","Aviad Cohen","Erez Yosef","Alexandra Dana","Michael Dinerstein"],"pdf_url":"https://arxiv.org/pdf/2212.05315v3.pdf","comment":"Appears in CVPR24'"},{"id":"http://arxiv.org/abs/2404.02638v1","updated":"2024-04-03T10:57:47Z","published":"2024-04-03T10:57:47Z","title":"SG-BEV: Satellite-Guided BEV Fusion for Cross-View Semantic Segmentation","summary":" This paper aims at achieving fine-grained building attribute segmentation in\na cross-view scenario, i.e., using satellite and street-view image pairs. The\nmain challenge lies in overcoming the significant perspective differences\nbetween street views and satellite views. In this work, we introduce SG-BEV, a\nnovel approach for satellite-guided BEV fusion for cross-view semantic\nsegmentation. 
To overcome the limitations of existing cross-view projection\nmethods in capturing the complete building facade features, we innovatively\nincorporate Bird's Eye View (BEV) method to establish a spatially explicit\nmapping of street-view features. Moreover, we fully leverage the advantages of\nmultiple perspectives by introducing a novel satellite-guided reprojection\nmodule, optimizing the uneven feature distribution issues associated with\ntraditional BEV methods. Our method demonstrates significant improvements on\nfour cross-view datasets collected from multiple cities, including New York,\nSan Francisco, and Boston. On average across these datasets, our method\nachieves an increase in mIOU by 10.13% and 5.21% compared with the\nstate-of-the-art satellite-based and cross-view methods. The code and datasets\nof this work will be released at https://github.com/yejy53/SG-BEV.\n","authors":["Junyan Ye","Qiyan Luo","Jinhua Yu","Huaping Zhong","Zhimeng Zheng","Conghui He","Weijia Li"],"pdf_url":"https://arxiv.org/pdf/2404.02638v1.pdf","comment":"accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2311.10224v2","updated":"2024-04-03T10:57:10Z","published":"2023-11-16T22:31:05Z","title":"CV-Attention UNet: Attention-based UNet for 3D Cerebrovascular\n Segmentation of Enhanced TOF-MRA Images","summary":" Due to the lack of automated methods, to diagnose cerebrovascular disease,\ntime-of-flight magnetic resonance angiography (TOF-MRA) is assessed visually,\nmaking it time-consuming. The commonly used encoder-decoder architectures for\ncerebrovascular segmentation utilize redundant features, eventually leading to\nthe extraction of low-level features multiple times. Additionally,\nconvolutional neural networks (CNNs) suffer from performance degradation when\nthe batch size is small, and deeper networks experience the vanishing gradient\nproblem. Methods: In this paper, we attempt to solve these limitations and\npropose the 3D cerebrovascular attention UNet method, named CV-AttentionUNet,\nfor precise extraction of brain vessel images. We proposed a sequence of\npreprocessing techniques followed by deeply supervised UNet to improve the\naccuracy of segmentation of the brain vessels leading to a stroke. To combine\nthe low and high semantics, we applied the attention mechanism. This mechanism\nfocuses on relevant associations and neglects irrelevant anatomical\ninformation. Furthermore, the inclusion of deep supervision incorporates\ndifferent levels of features that prove to be beneficial for network\nconvergence. Results: We demonstrate the efficiency of the proposed method by\ncross-validating with an unlabeled dataset, which was further labeled by us. We\nbelieve that the novelty of this algorithm lies in its ability to perform well\non both labeled and unlabeled data with image processing-based enhancement. The\nresults indicate that our method performed better than the existing\nstate-of-the-art methods on the TubeTK dataset. 
Conclusion: The proposed method\nwill help in accurate segmentation of cerebrovascular structure leading to\nstroke\n","authors":["Syed Farhan Abbas","Nguyen Thanh Duc","Yoonguu Song","Kyungwon Kim","Ekta Srivastava","Boreom Lee"],"pdf_url":"https://arxiv.org/pdf/2311.10224v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02634v1","updated":"2024-04-03T10:44:06Z","published":"2024-04-03T10:44:06Z","title":"3DStyleGLIP: Part-Tailored Text-Guided 3D Neural Stylization","summary":" 3D stylization, which entails the application of specific styles to\nthree-dimensional objects, holds significant commercial potential as it enables\nthe creation of diverse 3D objects with distinct moods and styles, tailored to\nspecific demands of different scenes. With recent advancements in text-driven\nmethods and artificial intelligence, the stylization process is increasingly\nintuitive and automated, thereby diminishing the reliance on manual labor and\nexpertise. However, existing methods have predominantly focused on holistic\nstylization, thereby leaving the application of styles to individual components\nof a 3D object unexplored. In response, we introduce 3DStyleGLIP, a novel\nframework specifically designed for text-driven, part-tailored 3D stylization.\nGiven a 3D mesh and a text prompt, 3DStyleGLIP leverages the vision-language\nembedding space of the Grounded Language-Image Pre-training (GLIP) model to\nlocalize the individual parts of the 3D mesh and modify their colors and local\ngeometries to align them with the desired styles specified in the text prompt.\n3DStyleGLIP is effectively trained for 3D stylization tasks through a\npart-level style loss working in GLIP's embedding space, supplemented by two\ncomplementary learning techniques. Extensive experimental validation confirms\nthat our method achieves significant part-wise stylization capabilities,\ndemonstrating promising potential in advancing the field of 3D stylization.\n","authors":["SeungJeh Chung","JooHyun Park","Hyewon Kan","HyeongYeop Kang"],"pdf_url":"https://arxiv.org/pdf/2404.02634v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.00553v4","updated":"2024-04-03T10:36:31Z","published":"2023-04-02T15:04:43Z","title":"From Isolated Islands to Pangea: Unifying Semantic Space for Human\n Action Understanding","summary":" Action understanding has attracted long-term attention. It can be formed as\nthe mapping from the physical space to the semantic space. Typically,\nresearchers built datasets according to idiosyncratic choices to define classes\nand push the envelope of benchmarks respectively. Datasets are incompatible\nwith each other like \"Isolated Islands\" due to semantic gaps and various class\ngranularities, e.g., do housework in dataset A and wash plate in dataset B. We\nargue that we need a more principled semantic space to concentrate the\ncommunity efforts and use all datasets together to pursue generalizable action\nlearning. To this end, we design a structured action semantic space given verb\ntaxonomy hierarchy and covering massive actions. By aligning the classes of\nprevious datasets to our semantic space, we gather (image/video/skeleton/MoCap)\ndatasets into a unified database in a unified label system, i.e., bridging\n\"isolated islands\" into a \"Pangea\". Accordingly, we propose a novel model\nmapping from the physical space to semantic space to fully use Pangea. In\nextensive experiments, our new system shows significant superiority, especially\nin transfer learning. 
Our code and data will be made public at\nhttps://mvig-rhos.com/pangea.\n","authors":["Yong-Lu Li","Xiaoqian Wu","Xinpeng Liu","Zehao Wang","Yiming Dou","Yikun Ji","Junyi Zhang","Yixing Li","Jingru Tan","Xudong Lu","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2304.00553v4.pdf","comment":"CVPR 2024, Project Webpage: https://mvig-rhos.com/pangea"},{"id":"http://arxiv.org/abs/2404.02624v1","updated":"2024-04-03T10:25:45Z","published":"2024-04-03T10:25:45Z","title":"Multi-Scale Spatial-Temporal Self-Attention Graph Convolutional Networks\n for Skeleton-based Action Recognition","summary":" Skeleton-based gesture recognition methods have achieved high success using\nGraph Convolutional Network (GCN). In addition, context-dependent adaptive\ntopology as a neighborhood vertex information and attention mechanism leverages\na model to better represent actions. In this paper, we propose self-attention\nGCN hybrid model, Multi-Scale Spatial-Temporal self-attention (MSST)-GCN to\neffectively improve modeling ability to achieve state-of-the-art results on\nseveral datasets. We utilize spatial self-attention module with adaptive\ntopology to understand intra-frame interactions within a frame among different\nbody parts, and temporal self-attention module to examine correlations between\nframes of a node. These two are followed by multi-scale convolution network\nwith dilations, which not only captures the long-range temporal dependencies of\njoints but also the long-range spatial dependencies (i.e., long-distance\ndependencies) of node temporal behaviors. They are combined into high-level\nspatial-temporal representations and output the predicted action with the\nsoftmax classifier.\n","authors":["Ikuo Nakamura"],"pdf_url":"https://arxiv.org/pdf/2404.02624v1.pdf","comment":"9 pages, 3 figures"},{"id":"http://arxiv.org/abs/2306.05401v3","updated":"2024-04-03T10:16:22Z","published":"2023-06-08T17:52:34Z","title":"RDumb: A simple approach that questions our progress in continual\n test-time adaptation","summary":" Test-Time Adaptation (TTA) allows to update pre-trained models to changing\ndata distributions at deployment time. While early work tested these algorithms\nfor individual fixed distribution shifts, recent work proposed and applied\nmethods for continual adaptation over long timescales. To examine the reported\nprogress in the field, we propose the Continually Changing Corruptions (CCC)\nbenchmark to measure asymptotic performance of TTA techniques. We find that\neventually all but one state-of-the-art methods collapse and perform worse than\na non-adapting model, including models specifically proposed to be robust to\nperformance collapse. In addition, we introduce a simple baseline, \"RDumb\",\nthat periodically resets the model to its pretrained state. RDumb performs\nbetter or on par with the previously proposed state-of-the-art in all\nconsidered benchmarks. 
Our results show that previous TTA approaches are\nneither effective at regularizing adaptation to avoid collapse nor able to\noutperform a simplistic resetting strategy.\n","authors":["Ori Press","Steffen Schneider","Matthias Kümmerer","Matthias Bethge"],"pdf_url":"https://arxiv.org/pdf/2306.05401v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02618v1","updated":"2024-04-03T10:11:22Z","published":"2024-04-03T10:11:22Z","title":"Diffexplainer: Towards Cross-modal Global Explanations with Diffusion\n Models","summary":" We present DiffExplainer, a novel framework that, leveraging language-vision\nmodels, enables multimodal global explainability. DiffExplainer employs\ndiffusion models conditioned on optimized text prompts, synthesizing images\nthat maximize class outputs and hidden features of a classifier, thus providing\na visual tool for explaining decisions. Moreover, the analysis of generated\nvisual descriptions allows for automatic identification of biases and spurious\nfeatures, as opposed to traditional methods that often rely on manual\nintervention. The cross-modal transferability of language-vision models also\nenables the possibility to describe decisions in a more human-interpretable\nway, i.e., through text. We conduct comprehensive experiments, which include an\nextensive user study, demonstrating the effectiveness of DiffExplainer on 1)\nthe generation of high-quality images explaining model decisions, surpassing\nexisting activation maximization methods, and 2) the automated identification\nof biases and spurious features.\n","authors":["Matteo Pennisi","Giovanni Bellitto","Simone Palazzo","Mubarak Shah","Concetto Spampinato"],"pdf_url":"https://arxiv.org/pdf/2404.02618v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02617v1","updated":"2024-04-03T10:08:55Z","published":"2024-04-03T10:08:55Z","title":"Neural Radiance Fields with Torch Units","summary":" Neural Radiance Fields (NeRF) give rise to learning-based 3D reconstruction\nmethods widely used in industrial applications. Although prevalent methods\nachieve considerable improvements in small-scale scenes, accomplishing\nreconstruction in complex and large-scale scenes is still challenging. First,\nthe background in complex scenes shows a large variance among different views.\nSecond, the current inference pattern, $i.e.$, a pixel only relies on an\nindividual camera ray, fails to capture contextual information. To solve these\nproblems, we propose to enlarge the ray perception field and build up the\nsample points interactions. In this paper, we design a novel inference pattern\nthat encourages a single camera ray possessing more contextual information, and\nmodels the relationship among sample points on each camera ray. To hold\ncontextual information,a camera ray in our proposed method can render a patch\nof pixels simultaneously. Moreover, we replace the MLP in neural radiance field\nmodels with distance-aware convolutions to enhance the feature propagation\namong sample points from the same camera ray. To summarize, as a torchlight, a\nray in our proposed method achieves rendering a patch of image. Thus, we call\nthe proposed method, Torch-NeRF. 
Extensive experiments on KITTI-360 and LLFF\nshow that the Torch-NeRF exhibits excellent performance.\n","authors":["Bingnan Ni","Huanyu Wang","Dongfeng Bai","Minghe Weng","Dexin Qi","Weichao Qiu","Bingbing Liu"],"pdf_url":"https://arxiv.org/pdf/2404.02617v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02614v1","updated":"2024-04-03T10:01:23Z","published":"2024-04-03T10:01:23Z","title":"Vestibular schwannoma growth prediction from longitudinal MRI by time\n conditioned neural fields","summary":" Vestibular schwannomas (VS) are benign tumors that are generally managed by\nactive surveillance with MRI examination. To further assist clinical\ndecision-making and avoid overtreatment, an accurate prediction of tumor growth\nbased on longitudinal imaging is highly desirable. In this paper, we introduce\nDeepGrowth, a deep learning method that incorporates neural fields and\nrecurrent neural networks for prospective tumor growth prediction. In the\nproposed method, each tumor is represented as a signed distance function (SDF)\nconditioned on a low-dimensional latent code. Unlike previous studies that\nperform tumor shape prediction directly in the image space, we predict the\nlatent codes instead and then reconstruct future shapes from it. To deal with\nirregular time intervals, we introduce a time-conditioned recurrent module\nbased on a ConvLSTM and a novel temporal encoding strategy, which enables the\nproposed model to output varying tumor shapes over time. The experiments on an\nin-house longitudinal VS dataset showed that the proposed model significantly\nimproved the performance ($\\ge 1.6\\%$ Dice score and $\\ge0.20$ mm 95\\%\nHausdorff distance), in particular for top 20\\% tumors that grow or shrink the\nmost ($\\ge 4.6\\%$ Dice score and $\\ge 0.73$ mm 95\\% Hausdorff distance). Our\ncode is available at ~\\burl{https://github.com/cyjdswx/DeepGrowth}\n","authors":["Yunjie Chen","Jelmer M. Wolterink","Olaf M. Neve","Stephan R. Romeijn","Berit M. Verbist","Erik F. Hensen","Qian Tao","Marius Staring"],"pdf_url":"https://arxiv.org/pdf/2404.02614v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07739v2","updated":"2024-04-03T09:50:54Z","published":"2024-02-12T15:57:31Z","title":"Task-conditioned adaptation of visual features in multi-task policy\n learning","summary":" Successfully addressing a wide variety of tasks is a core ability of\nautonomous agents, requiring flexibly adapting the underlying decision-making\nstrategies and, as we argue in this work, also adapting the perception modules.\nAn analogical argument would be the human visual system, which uses top-down\nsignals to focus attention determined by the current task. Similarly, we adapt\npre-trained large vision models conditioned on specific downstream tasks in the\ncontext of multi-task policy learning. We introduce task-conditioned adapters\nthat do not require finetuning any pre-trained weights, combined with a single\npolicy trained with behavior cloning and capable of addressing multiple tasks.\nWe condition the visual adapters on task embeddings, which can be selected at\ninference if the task is known, or alternatively inferred from a set of example\ndemonstrations. To this end, we propose a new optimization-based estimator. We\nevaluate the method on a wide variety of tasks from the CortexBench benchmark\nand show that, compared to existing work, it can be addressed with a single\npolicy. 
In particular, we demonstrate that adapting visual features is a key\ndesign choice and that the method generalizes to unseen tasks given a few\ndemonstrations.\n","authors":["Pierre Marza","Laetitia Matignon","Olivier Simonin","Christian Wolf"],"pdf_url":"https://arxiv.org/pdf/2402.07739v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00035v2","updated":"2024-04-03T09:28:43Z","published":"2024-01-08T12:19:46Z","title":"Robustness Assessment of a Runway Object Classifier for Safe Aircraft\n Taxiing","summary":" As deep neural networks (DNNs) are becoming the prominent solution for many\ncomputational problems, the aviation industry seeks to explore their potential\nin alleviating pilot workload and in improving operational safety. However, the\nuse of DNNs in this type of safety-critical applications requires a thorough\ncertification process. This need can be addressed through formal verification,\nwhich provides rigorous assurances -- e.g.,~by proving the absence of certain\nmispredictions. In this case-study paper, we demonstrate this process using an\nimage-classifier DNN currently under development at Airbus and intended for use\nduring the aircraft taxiing phase. We use formal methods to assess this DNN's\nrobustness to three common image perturbation types: noise, brightness and\ncontrast, and some of their combinations. This process entails multiple\ninvocations of the underlying verifier, which might be computationally\nexpensive; and we therefore propose a method that leverages the monotonicity of\nthese robustness properties, as well as the results of past verification\nqueries, in order to reduce the overall number of verification queries required\nby nearly 60%. Our results provide an indication of the level of robustness\nachieved by the DNN classifier under study, and indicate that it is\nconsiderably more vulnerable to noise than to brightness or contrast\nperturbations.\n","authors":["Yizhak Elboher","Raya Elsaleh","Omri Isac","Mélanie Ducoffe","Audrey Galametz","Guillaume Povéda","Ryma Boumazouza","Noémie Cohen","Guy Katz"],"pdf_url":"https://arxiv.org/pdf/2402.00035v2.pdf","comment":"This is a preprint version of the paper in the proceedings of 43rd\n Digital Avionics Systems Conference (DASC)"},{"id":"http://arxiv.org/abs/2403.05839v2","updated":"2024-04-03T09:25:34Z","published":"2024-03-09T08:49:50Z","title":"Long-term Frame-Event Visual Tracking: Benchmark Dataset and Baseline","summary":" Current event-/frame-event based trackers undergo evaluation on short-term\ntracking datasets, however, the tracking of real-world scenarios involves\nlong-term tracking, and the performance of existing tracking algorithms in\nthese scenarios remains unclear. In this paper, we first propose a new\nlong-term and large-scale frame-event single object tracking dataset, termed\nFELT. It contains 742 videos and 1,594,474 RGB frames and event stream pairs\nand has become the largest frame-event tracking dataset to date. We re-train\nand evaluate 15 baseline trackers on our dataset for future works to compare.\nMore importantly, we find that the RGB frames and event streams are naturally\nincomplete due to the influence of challenging factors and spatially sparse\nevent flow. 
In response to this, we propose a novel associative memory\nTransformer network as a unified backbone by introducing modern Hopfield layers\ninto multi-head self-attention blocks to fuse both RGB and event data.\nExtensive experiments on RGB-Event (FELT), RGB-Thermal (RGBT234, LasHeR), and\nRGB-Depth (DepthTrack) datasets fully validated the effectiveness of our model.\nThe dataset and source code can be found at\n\\url{https://github.com/Event-AHU/FELT_SOT_Benchmark}.\n","authors":["Xiao Wang","Ju Huang","Shiao Wang","Chuanming Tang","Bo Jiang","Yonghong Tian","Jin Tang","Bin Luo"],"pdf_url":"https://arxiv.org/pdf/2403.05839v2.pdf","comment":"In Peer Review"},{"id":"http://arxiv.org/abs/2401.04647v2","updated":"2024-04-03T09:25:08Z","published":"2024-01-09T16:16:16Z","title":"Advancing Ante-Hoc Explainable Models through Generative Adversarial\n Networks","summary":" This paper presents a novel concept learning framework for enhancing model\ninterpretability and performance in visual classification tasks. Our approach\nappends an unsupervised explanation generator to the primary classifier network\nand makes use of adversarial training. During training, the explanation module\nis optimized to extract visual concepts from the classifier's latent\nrepresentations, while the GAN-based module aims to discriminate images\ngenerated from concepts, from true images. This joint training scheme enables\nthe model to implicitly align its internally learned concepts with\nhuman-interpretable visual properties. Comprehensive experiments demonstrate\nthe robustness of our approach, while producing coherent concept activations.\nWe analyse the learned concepts, showing their semantic concordance with object\nparts and visual attributes. We also study how perturbations in the adversarial\ntraining protocol impact both classification and concept acquisition. In\nsummary, this work presents a significant step towards building inherently\ninterpretable deep vision models with task-aligned concept representations - a\nkey enabler for developing trustworthy AI for real-world perception tasks.\n","authors":["Tanmay Garg","Deepika Vemuri","Vineeth N Balasubramanian"],"pdf_url":"https://arxiv.org/pdf/2401.04647v2.pdf","comment":"Paper accepted in Human-Centric Representation Learning workshop at\n AAAI 2024 (https://hcrl-workshop.github.io/2024/). Paper accepted and\n presented at Deployable AI Workshop at AAAI-2024\n (https://sites.google.com/view/dai-2024/home)"},{"id":"http://arxiv.org/abs/2404.01889v2","updated":"2024-04-03T09:18:09Z","published":"2024-04-02T12:28:40Z","title":"RAVE: Residual Vector Embedding for CLIP-Guided Backlit Image\n Enhancement","summary":" In this paper we propose a novel modification of Contrastive Language-Image\nPre-Training (CLIP) guidance for the task of unsupervised backlit image\nenhancement. Our work builds on the state-of-the-art CLIP-LIT approach, which\nlearns a prompt pair by constraining the text-image similarity between a prompt\n(negative/positive sample) and a corresponding image (backlit image/well-lit\nimage) in the CLIP embedding space. Learned prompts then guide an image\nenhancement network. Based on the CLIP-LIT framework, we propose two novel\nmethods for CLIP guidance. First, we show that instead of tuning prompts in the\nspace of text embeddings, it is possible to directly tune their embeddings in\nthe latent space without any loss in quality. This accelerates training and\npotentially enables the use of additional encoders that do not have a text\nencoder. 
Second, we propose a novel approach that does not require any prompt\ntuning. Instead, based on CLIP embeddings of backlit and well-lit images from\ntraining data, we compute the residual vector in the embedding space as a\nsimple difference between the mean embeddings of the well-lit and backlit\nimages. This vector then guides the enhancement network during training,\npushing a backlit image towards the space of well-lit images. This approach\nfurther dramatically reduces training time, stabilizes training and produces\nhigh quality enhanced images without artifacts, both in supervised and\nunsupervised training regimes. Additionally, we show that residual vectors can\nbe interpreted, revealing biases in training data, and thereby enabling\npotential bias correction.\n","authors":["Tatiana Gaintseva","Martin Benning","Gregory Slabaugh"],"pdf_url":"https://arxiv.org/pdf/2404.01889v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02585v1","updated":"2024-04-03T09:09:42Z","published":"2024-04-03T09:09:42Z","title":"Unsegment Anything by Simulating Deformation","summary":" Foundation segmentation models, while powerful, pose a significant risk: they\nenable users to effortlessly extract any objects from any digital content with\na single click, potentially leading to copyright infringement or malicious\nmisuse. To mitigate this risk, we introduce a new task \"Anything Unsegmentable\"\nto grant any image \"the right to be unsegmented\". The ambitious pursuit of the\ntask is to achieve highly transferable adversarial attacks against all\nprompt-based segmentation models, regardless of model parameterizations and\nprompts. We highlight the non-transferable and heterogeneous nature of\nprompt-specific adversarial noises. Our approach focuses on disrupting image\nencoder features to achieve prompt-agnostic attacks. Intriguingly, targeted\nfeature attacks exhibit better transferability compared to untargeted ones,\nsuggesting the optimal update direction aligns with the image manifold. Based\non the observations, we design a novel attack named Unsegment Anything by\nSimulating Deformation (UAD). Our attack optimizes a differentiable deformation\nfunction to create a target deformed image, which alters structural information\nwhile preserving achievable feature distance by adversarial example. Extensive\nexperiments verify the effectiveness of our approach, compromising a variety of\npromptable segmentation models with different architectures and prompt\ninterfaces. We release the code at\nhttps://github.com/jiahaolu97/anything-unsegmentable.\n","authors":["Jiahao Lu","Xingyi Yang","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2404.02585v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2306.13674v3","updated":"2024-04-03T08:56:01Z","published":"2023-06-19T09:47:33Z","title":"MeciFace: Mechanomyography and Inertial Fusion-based Glasses for Edge\n Real-Time Recognition of Facial and Eating Activities","summary":" The increasing prevalence of stress-related eating behaviors and their impact\non overall health highlights the importance of effective and ubiquitous\nmonitoring systems. In this paper, we present MeciFace, an innovative wearable\ntechnology designed to monitor facial expressions and eating activities in\nreal-time on-the-edge (RTE). MeciFace aims to provide a low-power,\nprivacy-conscious, and highly accurate tool for promoting healthy eating\nbehaviors and stress management. 
We employ lightweight convolutional neural\nnetworks as backbone models for facial expression and eating monitoring\nscenarios. The MeciFace system ensures efficient data processing with a tiny\nmemory footprint, ranging from 11KB to 19 KB. During RTE evaluation, the system\nachieves an F1-score of < 86% for facial expression recognition and 94% for\neating/drinking monitoring, for the RTE of unseen users (user-independent\ncase).\n","authors":["Hymalai Bello","Sungho Suh","Bo Zhou","Paul Lukowicz"],"pdf_url":"https://arxiv.org/pdf/2306.13674v3.pdf","comment":"Submitted to IEEE Transactions on Consumer Electronics"},{"id":"http://arxiv.org/abs/2404.02580v1","updated":"2024-04-03T08:55:44Z","published":"2024-04-03T08:55:44Z","title":"Active learning for efficient annotation in precision agriculture: a\n use-case on crop-weed semantic segmentation","summary":" Optimizing deep learning models requires large amounts of annotated images, a\nprocess that is both time-intensive and costly. Especially for semantic\nsegmentation models in which every pixel must be annotated. A potential\nstrategy to mitigate annotation effort is active learning. Active learning\nfacilitates the identification and selection of the most informative images\nfrom a large unlabelled pool. The underlying premise is that these selected\nimages can improve the model's performance faster than random selection to\nreduce annotation effort. While active learning has demonstrated promising\nresults on benchmark datasets like Cityscapes, its performance in the\nagricultural domain remains largely unexplored. This study addresses this\nresearch gap by conducting a comparative study of three active learning-based\nacquisition functions: Bayesian Active Learning by Disagreement (BALD),\nstochastic-based BALD (PowerBALD), and Random. The acquisition functions were\ntested on two agricultural datasets: Sugarbeet and Corn-Weed, both containing\nthree semantic classes: background, crop and weed. Our results indicated that\nactive learning, especially PowerBALD, yields a higher performance than Random\nsampling on both datasets. But due to the relatively large standard deviations,\nthe differences observed were minimal; this was partly caused by high image\nredundancy and imbalanced classes. Specifically, more than 89\\% of the pixels\nbelonged to the background class on both datasets. The absence of significant\nresults on both datasets indicates that further research is required for\napplying active learning on agricultural datasets, especially if they contain a\nhigh-class imbalance and redundant images. Recommendations and insights are\nprovided in this paper to potentially resolve such issues.\n","authors":["Bart M. van Marrewijk","Charbel Dandjinou","Dan Jeric Arcega Rustia","Nicolas Franco Gonzalez","Boubacar Diallo","Jérôme Dias","Paul Melki","Pieter M. Blok"],"pdf_url":"https://arxiv.org/pdf/2404.02580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02573v1","updated":"2024-04-03T08:47:40Z","published":"2024-04-03T08:47:40Z","title":"Knowledge Distillation with Multi-granularity Mixture of Priors for\n Image Super-Resolution","summary":" Knowledge distillation (KD) is a promising yet challenging model compression\ntechnique that transfers rich learning representations from a well-performing\nbut cumbersome teacher model to a compact student model. Previous methods for\nimage super-resolution (SR) mostly compare the feature maps directly or after\nstandardizing the dimensions with basic algebraic operations (e.g. 
average,\ndot-product). However, the intrinsic semantic differences among feature maps\nare overlooked, which are caused by the disparate expressive capacity between\nthe networks. This work presents MiPKD, a multi-granularity mixture of prior KD\nframework, to facilitate efficient SR model through the feature mixture in a\nunified latent space and stochastic network block mixture. Extensive\nexperiments demonstrate the effectiveness of the proposed MiPKD method.\n","authors":["Simiao Li","Yun Zhang","Wei Li","Hanting Chen","Wenjia Wang","Bingyi Jing","Shaohui Lin","Jie Hu"],"pdf_url":"https://arxiv.org/pdf/2404.02573v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02562v1","updated":"2024-04-03T08:33:08Z","published":"2024-04-03T08:33:08Z","title":"Representation Alignment Contrastive Regularization for Multi-Object\n Tracking","summary":" Achieving high-performance in multi-object tracking algorithms heavily relies\non modeling spatio-temporal relationships during the data association stage.\nMainstream approaches encompass rule-based and deep learning-based methods for\nspatio-temporal relationship modeling. While the former relies on physical\nmotion laws, offering wider applicability but yielding suboptimal results for\ncomplex object movements, the latter, though achieving high-performance, lacks\ninterpretability and involves complex module designs. This work aims to\nsimplify deep learning-based spatio-temporal relationship models and introduce\ninterpretability into features for data association. Specifically, a\nlightweight single-layer transformer encoder is utilized to model\nspatio-temporal relationships. To make features more interpretative, two\ncontrastive regularization losses based on representation alignment are\nproposed, derived from spatio-temporal consistency rules. By applying weighted\nsummation to affinity matrices, the aligned features can seamlessly integrate\ninto the data association stage of the original tracking workflow. Experimental\nresults showcase that our model enhances the majority of existing tracking\nnetworks' performance without excessive complexity, with minimal increase in\ntraining overhead and nearly negligible computational and storage costs.\n","authors":["Shujie Chen","Zhonglin Liu","Jianfeng Dong","Di Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.02562v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02558v1","updated":"2024-04-03T08:27:24Z","published":"2024-04-03T08:27:24Z","title":"Regional biases in image geolocation estimation: a case study with the\n SenseCity Africa dataset","summary":" Advances in Artificial Intelligence are challenged by the biases rooted in\nthe datasets used to train the models. In image geolocation estimation, models\nare mostly trained using data from specific geographic regions, notably the\nWestern world, and as a result, they may struggle to comprehend the\ncomplexities of underrepresented regions. To assess this issue, we apply a\nstate-of-the-art image geolocation estimation model (ISNs) to a crowd-sourced\ndataset of geolocated images from the African continent (SCA100), and then\nexplore the regional and socioeconomic biases underlying the model's\npredictions. Our findings show that the ISNs model tends to over-predict image\nlocations in high-income countries of the Western world, which is consistent\nwith the geographic distribution of its training data, i.e., the IM2GPS3k\ndataset. 
Accordingly, when compared to the IM2GPS3k benchmark, the accuracy of\nthe ISNs model notably decreases at all scales. Additionally, we cluster images\nof the SCA100 dataset based on how accurately they are predicted by the ISNs\nmodel and show the model's difficulties in correctly predicting the locations\nof images in low income regions, especially in Sub-Saharan Africa. Therefore,\nour results suggest that using IM2GPS3k as a training set and benchmark for\nimage geolocation estimation and other computer vision models overlooks its\npotential application in the African context.\n","authors":["Ximena Salgado Uribe","Martí Bosch","Jérôme Chenal"],"pdf_url":"https://arxiv.org/pdf/2404.02558v1.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.01272v2","updated":"2024-04-03T08:27:01Z","published":"2024-04-01T17:48:15Z","title":"Language Guided Domain Generalized Medical Image Segmentation","summary":" Single source domain generalization (SDG) holds promise for more reliable and\nconsistent image segmentation across real-world clinical settings particularly\nin the medical domain, where data privacy and acquisition cost constraints\noften limit the availability of diverse datasets. Depending solely on visual\nfeatures hampers the model's capacity to adapt effectively to various domains,\nprimarily because of the presence of spurious correlations and domain-specific\ncharacteristics embedded within the image features. Incorporating text features\nalongside visual features is a potential solution to enhance the model's\nunderstanding of the data, as it goes beyond pixel-level information to provide\nvaluable context. Textual cues describing the anatomical structures, their\nappearances, and variations across various imaging modalities can guide the\nmodel in domain adaptation, ultimately contributing to more robust and\nconsistent segmentation. In this paper, we propose an approach that explicitly\nleverages textual information by incorporating a contrastive learning mechanism\nguided by the text encoder features to learn a more robust feature\nrepresentation. We assess the effectiveness of our text-guided contrastive\nfeature alignment technique in various scenarios, including cross-modality,\ncross-sequence, and cross-site settings for different segmentation tasks. Our\napproach achieves favorable performance against existing methods in literature.\nOur code and model weights are available at\nhttps://github.com/ShahinaKK/LG_SDG.git.\n","authors":["Shahina Kunhimon","Muzammal Naseer","Salman Khan","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2404.01272v2.pdf","comment":"Accepted at ISBI2024"},{"id":"http://arxiv.org/abs/2401.13627v2","updated":"2024-04-03T08:12:08Z","published":"2024-01-24T17:58:07Z","title":"Scaling Up to Excellence: Practicing Model Scaling for Photo-Realistic\n Image Restoration In the Wild","summary":" We introduce SUPIR (Scaling-UP Image Restoration), a groundbreaking image\nrestoration method that harnesses generative prior and the power of model\nscaling up. Leveraging multi-modal techniques and advanced generative prior,\nSUPIR marks a significant advance in intelligent and realistic image\nrestoration. As a pivotal catalyst within SUPIR, model scaling dramatically\nenhances its capabilities and demonstrates new potential for image restoration.\nWe collect a dataset comprising 20 million high-resolution, high-quality images\nfor model training, each enriched with descriptive text annotations. 
SUPIR\nprovides the capability to restore images guided by textual prompts, broadening\nits application scope and potential. Moreover, we introduce negative-quality\nprompts to further improve perceptual quality. We also develop a\nrestoration-guided sampling method to suppress the fidelity issue encountered\nin generative-based restoration. Experiments demonstrate SUPIR's exceptional\nrestoration effects and its novel capacity to manipulate restoration through\ntextual prompts.\n","authors":["Fanghua Yu","Jinjin Gu","Zheyuan Li","Jinfan Hu","Xiangtao Kong","Xintao Wang","Jingwen He","Yu Qiao","Chao Dong"],"pdf_url":"https://arxiv.org/pdf/2401.13627v2.pdf","comment":"This paper has been accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2307.13981v2","updated":"2024-04-03T08:04:55Z","published":"2023-07-26T06:38:33Z","title":"Analysis of Video Quality Datasets via Design of Minimalistic Video\n Quality Models","summary":" Blind video quality assessment (BVQA) plays an indispensable role in\nmonitoring and improving the end-users' viewing experience in various\nreal-world video-enabled media applications. As an experimental field, the\nimprovements of BVQA models have been measured primarily on a few human-rated\nVQA datasets. Thus, it is crucial to gain a better understanding of existing\nVQA datasets in order to properly evaluate the current progress in BVQA.\nTowards this goal, we conduct a first-of-its-kind computational analysis of VQA\ndatasets via designing minimalistic BVQA models. By minimalistic, we restrict\nour family of BVQA models to build only upon basic blocks: a video preprocessor\n(for aggressive spatiotemporal downsampling), a spatial quality analyzer, an\noptional temporal quality analyzer, and a quality regressor, all with the\nsimplest possible instantiations. By comparing the quality prediction\nperformance of different model variants on eight VQA datasets with realistic\ndistortions, we find that nearly all datasets suffer from the easy dataset\nproblem of varying severity, some of which even admit blind image quality\nassessment (BIQA) solutions. We additionally justify our claims by contrasting\nour model generalizability on these VQA datasets, and by ablating a dizzying\nset of BVQA design choices related to the basic building blocks. Our results\ncast doubt on the current progress in BVQA, and meanwhile shed light on good\npractices of constructing next-generation VQA datasets and models.\n","authors":["Wei Sun","Wen Wen","Xiongkuo Min","Long Lan","Guangtao Zhai","Kede Ma"],"pdf_url":"https://arxiv.org/pdf/2307.13981v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02544v1","updated":"2024-04-03T08:01:00Z","published":"2024-04-03T08:01:00Z","title":"Semi-Supervised Unconstrained Head Pose Estimation in the Wild","summary":" Existing head pose estimation datasets are either composed of numerous\nsamples by non-realistic synthesis or lab collection, or limited images by\nlabor-intensive annotating. This makes deep supervised learning based solutions\ncompromised due to the reliance on generous labeled data. To alleviate it, we\npropose the first semi-supervised unconstrained head pose estimation (SemiUHPE)\nmethod, which can leverage a large amount of unlabeled wild head images.\nSpecifically, we follow the recent semi-supervised rotation regression, and\nfocus on the diverse and complex head pose domain. 
Firstly, we claim that the\naspect-ratio invariant cropping of heads is superior to the previous\nlandmark-based affine alignment, which does not fit unlabeled natural heads or\npractical applications where landmarks are often unavailable. Then, instead of\nusing an empirically fixed threshold to filter out pseudo labels, we propose\nthe dynamic entropy-based filtering by updating thresholds for adaptively\nremoving unlabeled outliers. Moreover, we revisit the design of weak-strong\naugmentations, and further exploit its superiority by devising two novel\nhead-oriented strong augmentations named pose-irrelevant cut-occlusion and\npose-altering rotation consistency. Extensive experiments show that SemiUHPE\ncan surpass SOTAs with remarkable improvements on public benchmarks under both\nfront-range and full-range. Our code is released in\n\\url{https://github.com/hnuzhy/SemiUHPE}.\n","authors":["Huayi Zhou","Fei Jiang","Hongtao Lu"],"pdf_url":"https://arxiv.org/pdf/2404.02544v1.pdf","comment":"14 pages. Semi-Supervised Unconstrained Head Pose Estimation"},{"id":"http://arxiv.org/abs/2403.19425v2","updated":"2024-04-03T07:37:32Z","published":"2024-03-28T13:56:26Z","title":"A Robust Ensemble Algorithm for Ischemic Stroke Lesion Segmentation:\n Generalizability and Clinical Utility Beyond the ISLES Challenge","summary":" Diffusion-weighted MRI (DWI) is essential for stroke diagnosis, treatment\ndecisions, and prognosis. However, image and disease variability hinder the\ndevelopment of generalizable AI algorithms with clinical value. We address this\ngap by presenting a novel ensemble algorithm derived from the 2022 Ischemic\nStroke Lesion Segmentation (ISLES) challenge. ISLES'22 provided 400 patient\nscans with ischemic stroke from various medical centers, facilitating the\ndevelopment of a wide range of cutting-edge segmentation algorithms by the\nresearch community. Through collaboration with leading teams, we combined\ntop-performing algorithms into an ensemble model that overcomes the limitations\nof individual solutions. Our ensemble model achieved superior ischemic lesion\ndetection and segmentation accuracy on our internal test set compared to\nindividual algorithms. This accuracy generalized well across diverse image and\ndisease variables. Furthermore, the model excelled in extracting clinical\nbiomarkers. Notably, in a Turing-like test, neuroradiologists consistently\npreferred the algorithm's segmentations over manual expert efforts,\nhighlighting increased comprehensiveness and precision. Validation using a\nreal-world external dataset (N=1686) confirmed the model's generalizability.\nThe algorithm's outputs also demonstrated strong correlations with clinical\nscores (admission NIHSS and 90-day mRS) on par with or exceeding expert-derived\nresults, underlining its clinical relevance. This study offers two key\nfindings. First, we present an ensemble algorithm\n(https://github.com/Tabrisrei/ISLES22_Ensemble) that detects and segments\nischemic stroke lesions on DWI across diverse scenarios on par with expert\n(neuro)radiologists. Second, we show the potential for biomedical challenge\noutputs to extend beyond the challenge's initial objectives, demonstrating\ntheir real-world clinical applicability.\n","authors":["Ezequiel de la Rosa","Mauricio Reyes","Sook-Lei Liew","Alexandre Hutton","Roland Wiest","Johannes Kaesmacher","Uta Hanning","Arsany Hakim","Richard Zubal","Waldo Valenzuela","David Robben","Diana M. Sima","Vincenzo Anania","Arne Brys","James A. 
Meakin","Anne Mickan","Gabriel Broocks","Christian Heitkamp","Shengbo Gao","Kongming Liang","Ziji Zhang","Md Mahfuzur Rahman Siddiquee","Andriy Myronenko","Pooya Ashtari","Sabine Van Huffel","Hyun-su Jeong","Chi-ho Yoon","Chulhong Kim","Jiayu Huo","Sebastien Ourselin","Rachel Sparks","Albert Clèrigues","Arnau Oliver","Xavier Lladó","Liam Chalcroft","Ioannis Pappas","Jeroen Bertels","Ewout Heylen","Juliette Moreau","Nima Hatami","Carole Frindel","Abdul Qayyum","Moona Mazher","Domenec Puig","Shao-Chieh Lin","Chun-Jung Juan","Tianxi Hu","Lyndon Boone","Maged Goubran","Yi-Jui Liu","Susanne Wegener","Florian Kofler","Ivan Ezhov","Suprosanna Shit","Moritz R. Hernandez Petzsche","Bjoern Menze","Jan S. Kirschke","Benedikt Wiestler"],"pdf_url":"https://arxiv.org/pdf/2403.19425v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02530v1","updated":"2024-04-03T07:33:30Z","published":"2024-04-03T07:33:30Z","title":"Severity Controlled Text-to-Image Generative Model Bias Manipulation","summary":" Text-to-image (T2I) generative models are gaining wide popularity, especially\nin public domains. However, their intrinsic bias and potential malicious\nmanipulations remain under-explored. Charting the susceptibility of T2I models\nto such manipulation, we first expose the new possibility of a dynamic and\ncomputationally efficient exploitation of model bias by targeting the embedded\nlanguage models. By leveraging mathematical foundations of vector algebra, our\ntechnique enables a scalable and convenient control over the severity of output\nmanipulation through model bias. As a by-product, this control also allows a\nform of precise prompt engineering to generate images which are generally\nimplausible with regular text prompts. We also demonstrate a constructive\napplication of our manipulation for balancing the frequency of generated\nclasses - as in model debiasing. Our technique does not require training and is\nalso framed as a backdoor attack with severity control using semantically-null\ntext triggers in the prompts. With extensive analysis, we present interesting\nqualitative and quantitative results to expose potential manipulation\npossibilities for T2I models.\n Key-words: Text-to-Image Models, Generative Models, Backdoor Attacks, Prompt\nEngineering, Bias\n","authors":["Jordan Vice","Naveed Akhtar","Richard Hartley","Ajmal Mian"],"pdf_url":"https://arxiv.org/pdf/2404.02530v1.pdf","comment":"This research was supported by National Intelligence and Security\n Discovery Research Grants (project# NS220100007), funded by the Department of\n Defence Australia"},{"id":"http://arxiv.org/abs/2404.02527v1","updated":"2024-04-03T07:30:09Z","published":"2024-04-03T07:30:09Z","title":"Weakly-Supervised 3D Scene Graph Generation via Visual-Linguistic\n Assisted Pseudo-labeling","summary":" Learning to build 3D scene graphs is essential for real-world perception in a\nstructured and rich fashion. However, previous 3D scene graph generation\nmethods utilize a fully supervised learning manner and require a large amount\nof entity-level annotation data of objects and relations, which is extremely\nresource-consuming and tedious to obtain. To tackle this problem, we propose\n3D-VLAP, a weakly-supervised 3D scene graph generation method via\nVisual-Linguistic Assisted Pseudo-labeling. 
Specifically, our 3D-VLAP exploits\nthe superior ability of current large-scale visual-linguistic models to align\nthe semantics between texts and 2D images, as well as the naturally existing\ncorrespondences between 2D images and 3D point clouds, and thus implicitly\nconstructs correspondences between texts and 3D point clouds. First, we\nestablish the positional correspondence from 3D point clouds to 2D images via\ncamera intrinsic and extrinsic parameters, thereby achieving alignment of 3D\npoint clouds and 2D images. Subsequently, a large-scale cross-modal\nvisual-linguistic model is employed to indirectly align 3D instances with the\ntextual category labels of objects by matching 2D images with object category\nlabels. The pseudo labels for objects and relations are then produced for\n3D-VLAP model training by calculating the similarity between visual embeddings\nand textual category embeddings of objects and relations encoded by the\nvisual-linguistic model, respectively. Ultimately, we design an edge\nself-attention based graph neural network to generate scene graphs of 3D point\ncloud scenes. Extensive experiments demonstrate that our 3D-VLAP achieves\ncomparable results with current advanced fully supervised methods, meanwhile\nsignificantly alleviating the pressure of data annotation.\n","authors":["Xu Wang","Yifan Li","Qiudan Zhang","Wenhui Wu","Mark Junjie Li","Jianmin Jinag"],"pdf_url":"https://arxiv.org/pdf/2404.02527v1.pdf","comment":"11 pages, 9 figures"},{"id":"http://arxiv.org/abs/2404.02523v1","updated":"2024-04-03T07:23:03Z","published":"2024-04-03T07:23:03Z","title":"Text-driven Affordance Learning from Egocentric Vision","summary":" Visual affordance learning is a key component for robots to understand how to\ninteract with objects. Conventional approaches in this field rely on\npre-defined objects and actions, falling short of capturing diverse\ninteractions in realworld scenarios. The key idea of our approach is employing\ntextual instruction, targeting various affordances for a wide range of objects.\nThis approach covers both hand-object and tool-object interactions. We\nintroduce text-driven affordance learning, aiming to learn contact points and\nmanipulation trajectories from an egocentric view following textual\ninstruction. In our task, contact points are represented as heatmaps, and the\nmanipulation trajectory as sequences of coordinates that incorporate both\nlinear and rotational movements for various manipulations. However, when we\ngather data for this task, manual annotations of these diverse interactions are\ncostly. To this end, we propose a pseudo dataset creation pipeline and build a\nlarge pseudo-training dataset: TextAFF80K, consisting of over 80K instances of\nthe contact points, trajectories, images, and text tuples. 
We extend existing\nreferring expression comprehension models for our task, and experimental\nresults show that our approach robustly handles multiple affordances, serving\nas a new standard for affordance learning in real-world scenarios.\n","authors":["Tomoya Yoshida","Shuhei Kurita","Taichi Nishimura","Shinsuke Mori"],"pdf_url":"https://arxiv.org/pdf/2404.02523v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00611v2","updated":"2024-04-03T07:18:11Z","published":"2024-03-31T09:01:17Z","title":"Object-level Copy-Move Forgery Image Detection based on Inconsistency\n Mining","summary":" In copy-move tampering operations, perpetrators often employ techniques, such\nas blurring, to conceal tampering traces, posing significant challenges to the\ndetection of object-level targets with intact structures. Focus on these\nchallenges, this paper proposes an Object-level Copy-Move Forgery Image\nDetection based on Inconsistency Mining (IMNet). To obtain complete\nobject-level targets, we customize prototypes for both the source and tampered\nregions and dynamically update them. Additionally, we extract inconsistent\nregions between coarse similar regions obtained through self-correlation\ncalculations and regions composed of prototypes. The detected inconsistent\nregions are used as supplements to coarse similar regions to refine pixel-level\ndetection. We operate experiments on three public datasets which validate the\neffectiveness and the robustness of the proposed IMNet.\n","authors":["Jingyu Wang","Niantai Jing","Ziyao Liu","Jie Nie","Yuxin Qi","Chi-Hung Chi","Kwok-Yan Lam"],"pdf_url":"https://arxiv.org/pdf/2404.00611v2.pdf","comment":"4 pages, 2 figures, Accepted to WWW 2024"},{"id":"http://arxiv.org/abs/2404.00228v3","updated":"2024-04-03T07:15:05Z","published":"2024-03-30T03:16:37Z","title":"InfLoRA: Interference-Free Low-Rank Adaptation for Continual Learning","summary":" Continual learning requires the model to learn multiple tasks sequentially.\nIn continual learning, the model should possess the ability to maintain its\nperformance on old tasks (stability) and the ability to adapt to new tasks\ncontinuously (plasticity). Recently, parameter-efficient fine-tuning (PEFT),\nwhich involves freezing a pre-trained model and injecting a small number of\nlearnable parameters to adapt to downstream tasks, has gained increasing\npopularity in continual learning. Although existing continual learning methods\nbased on PEFT have demonstrated superior performance compared to those not\nbased on PEFT, most of them do not consider how to eliminate the interference\nof the new task on the old tasks, which inhibits the model from making a good\ntrade-off between stability and plasticity. In this work, we propose a new PEFT\nmethod, called interference-free low-rank adaptation (InfLoRA), for continual\nlearning. InfLoRA injects a small number of parameters to reparameterize the\npre-trained weights and shows that fine-tuning these injected parameters is\nequivalent to fine-tuning the pre-trained weights within a subspace.\nFurthermore, InfLoRA designs this subspace to eliminate the interference of the\nnew task on the old tasks, making a good trade-off between stability and\nplasticity. 
Experimental results show that InfLoRA outperforms existing\nstate-of-the-art continual learning methods on multiple datasets.\n","authors":["Yan-Shuo Liang","Wu-Jun Li"],"pdf_url":"https://arxiv.org/pdf/2404.00228v3.pdf","comment":"Accepted by the 2024 IEEE/CVF Conference on Computer Vision and\n Pattern Recognition (CVPR 2024)"},{"id":"http://arxiv.org/abs/2404.02518v1","updated":"2024-04-03T07:11:19Z","published":"2024-04-03T07:11:19Z","title":"CPAISD: Core-penumbra acute ischemic stroke dataset","summary":" We introduce the CPAISD: Core-Penumbra Acute Ischemic Stroke Dataset, aimed\nat enhancing the early detection and segmentation of ischemic stroke using\nNon-Contrast Computed Tomography (NCCT) scans. Addressing the challenges in\ndiagnosing acute ischemic stroke during its early stages due to often\nnon-revealing native CT findings, the dataset provides a collection of\nsegmented NCCT images. These include annotations of ischemic core and penumbra\nregions, critical for developing machine learning models for rapid stroke\nidentification and assessment. By offering a carefully collected and annotated\ndataset, we aim to facilitate the development of advanced diagnostic tools,\ncontributing to improved patient care and outcomes in stroke management. Our\ndataset's uniqueness lies in its focus on the acute phase of ischemic stroke,\nwith non-informative native CT scans, and includes a baseline model to\ndemonstrate the dataset's application, encouraging further research and\ninnovation in the field of medical imaging and stroke diagnosis.\n","authors":["D. Umerenkov","S. Kudin","M. Peksheva","D. Pavlov"],"pdf_url":"https://arxiv.org/pdf/2404.02518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03166v3","updated":"2024-04-03T07:10:22Z","published":"2024-02-05T16:35:29Z","title":"RRWNet: Recursive Refinement Network for Effective Retinal Artery/Vein\n Segmentation and Classification","summary":" The caliber and configuration of retinal blood vessels serve as important\nbiomarkers for various diseases and medical conditions. A thorough analysis of\nthe retinal vasculature requires the segmentation of the blood vessels and\ntheir classification into arteries and veins, typically performed on color\nfundus images obtained by retinography. However, manually performing these\ntasks is labor-intensive and prone to human error. While several automated\nmethods have been proposed to address this task, the current state of art faces\nchallenges due to manifest classification errors affecting the topological\nconsistency of segmentation maps. In this work, we introduce RRWNet, a novel\nend-to-end deep learning framework that addresses this limitation. The\nframework consists of a fully convolutional neural network that recursively\nrefines semantic segmentation maps, correcting manifest classification errors\nand thus improving topological consistency. In particular, RRWNet is composed\nof two specialized subnetworks: a Base subnetwork that generates base\nsegmentation maps from the input images, and a Recursive Refinement subnetwork\nthat iteratively and recursively improves these maps. Evaluation on three\ndifferent public datasets demonstrates the state-of-the-art performance of the\nproposed method, yielding more topologically consistent segmentation maps with\nfewer manifest classification errors than existing approaches. 
In addition, the\nRecursive Refinement module within RRWNet proves effective in post-processing\nsegmentation maps from other methods, further demonstrating its potential. The\nmodel code, weights, and predictions will be publicly available at\nhttps://github.com/j-morano/rrwnet.\n","authors":["José Morano","Guilherme Aresta","Hrvoje Bogunović"],"pdf_url":"https://arxiv.org/pdf/2402.03166v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02517v1","updated":"2024-04-03T07:10:18Z","published":"2024-04-03T07:10:18Z","title":"HENet: Hybrid Encoding for End-to-end Multi-task 3D Perception from\n Multi-view Cameras","summary":" Three-dimensional perception from multi-view cameras is a crucial component\nin autonomous driving systems, which involves multiple tasks like 3D object\ndetection and bird's-eye-view (BEV) semantic segmentation. To improve\nperception precision, large image encoders, high-resolution images, and\nlong-term temporal inputs have been adopted in recent 3D perception models,\nbringing remarkable performance gains. However, these techniques are often\nincompatible in training and inference scenarios due to computational resource\nconstraints. Besides, modern autonomous driving systems prefer to adopt an\nend-to-end framework for multi-task 3D perception, which can simplify the\noverall system architecture and reduce the implementation complexity. However,\nconflict between tasks often arises when optimizing multiple tasks jointly\nwithin an end-to-end 3D perception model. To alleviate these issues, we present\nan end-to-end framework named HENet for multi-task 3D perception in this paper.\nSpecifically, we propose a hybrid image encoding network, using a large image\nencoder for short-term frames and a small image encoder for long-term temporal\nframes. Then, we introduce a temporal feature integration module based on the\nattention mechanism to fuse the features of different frames extracted by the\ntwo aforementioned hybrid image encoders. Finally, according to the\ncharacteristics of each perception task, we utilize BEV features of different\ngrid sizes, independent BEV encoders, and task decoders for different tasks.\nExperimental results show that HENet achieves state-of-the-art end-to-end\nmulti-task 3D perception results on the nuScenes benchmark, including 3D object\ndetection and BEV semantic segmentation. The source code and models will be\nreleased at https://github.com/VDIGPKU/HENet.\n","authors":["Zhongyu Xia","ZhiWei Lin","Xinhao Wang","Yongtao Wang","Yun Xing","Shengxiang Qi","Nan Dong","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2404.02517v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02514v1","updated":"2024-04-03T07:07:02Z","published":"2024-04-03T07:07:02Z","title":"Freditor: High-Fidelity and Transferable NeRF Editing by Frequency\n Decomposition","summary":" This paper enables high-fidelity, transferable NeRF editing by frequency\ndecomposition. Recent NeRF editing pipelines lift 2D stylization results to 3D\nscenes while suffering from blurry results, and fail to capture detailed\nstructures caused by the inconsistency between 2D editings. 
Our critical\ninsight is that low-frequency components of images are more\nmultiview-consistent after editing compared with their high-frequency parts.\nMoreover, the appearance style is mainly exhibited on the low-frequency\ncomponents, and the content details especially reside in high-frequency parts.\nThis motivates us to perform editing on low-frequency components, which results\nin high-fidelity edited scenes. In addition, the editing is performed in the\nlow-frequency feature space, enabling stable intensity control and novel scene\ntransfer. Comprehensive experiments conducted on photorealistic datasets\ndemonstrate the superior performance of high-fidelity and transferable NeRF\nediting. The project page is at \\url{https://aigc3d.github.io/freditor}.\n","authors":["Yisheng He","Weihao Yuan","Siyu Zhu","Zilong Dong","Liefeng Bo","Qixing Huang"],"pdf_url":"https://arxiv.org/pdf/2404.02514v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02508v1","updated":"2024-04-03T06:53:27Z","published":"2024-04-03T06:53:27Z","title":"VIAssist: Adapting Multi-modal Large Language Models for Users with\n Visual Impairments","summary":" Individuals with visual impairments, encompassing both partial and total\ndifficulties in visual perception, are referred to as visually impaired (VI)\npeople. An estimated 2.2 billion individuals worldwide are affected by visual\nimpairments. Recent advancements in multi-modal large language models (MLLMs)\nhave showcased their extraordinary capabilities across various domains. It is\ndesirable to help VI individuals with MLLMs' great capabilities of visual\nunderstanding and reasoning. However, it is challenging for VI people to use\nMLLMs due to the difficulties in capturing the desirable images to fulfill\ntheir daily requests. For example, the target object is not fully or partially\nplaced in the image. This paper explores how to leverage MLLMs for VI\nindividuals to provide visual-question answers. VIAssist can identify undesired\nimages and provide detailed actions. Finally, VIAssist can provide reliable\nanswers to users' queries based on the images. Our results show that VIAssist\nprovides +0.21 and +0.31 higher BERTScore and ROUGE scores than the baseline,\nrespectively.\n","authors":["Bufang Yang","Lixing He","Kaiwei Liu","Zhenyu Yan"],"pdf_url":"https://arxiv.org/pdf/2404.02508v1.pdf","comment":"Accepted to IEEE International Workshop on Foundation Models for\n Cyber-Physical Systems & Internet of Things (FMSys 2024)"},{"id":"http://arxiv.org/abs/2303.04989v2","updated":"2024-04-03T06:51:21Z","published":"2023-03-09T02:20:56Z","title":"ARS-DETR: Aspect Ratio Sensitive Oriented Object Detection with\n Transformer","summary":" Existing oriented object detection methods commonly use metric AP$_{50}$ to\nmeasure the performance of the model. We argue that AP$_{50}$ is inherently\nunsuitable for oriented object detection due to its large tolerance in angle\ndeviation. Therefore, we advocate using high-precision metric, e.g. AP$_{75}$,\nto measure the performance of models. In this paper, we propose an Aspect Ratio\nSensitive Oriented Object Detector with Transformer, termed ARS-DETR, which\nexhibits a competitive performance in high-precision oriented object detection.\nSpecifically, a new angle classification method, calling Aspect Ratio aware\nCircle Smooth Label (AR-CSL), is proposed to smooth the angle label in a more\nreasonable way and discard the hyperparameter that introduced by previous work\n(e.g. CSL). 
Then, a rotated deformable attention module is designed to rotate\nthe sampling points with the corresponding angles and eliminate the\nmisalignment between region features and sampling points. Moreover, a dynamic\nweight coefficient according to the aspect ratio is adopted to calculate the\nangle loss. Comprehensive experiments on several challenging datasets show that\nour method achieves competitive performance on the high-precision oriented\nobject detection task.\n","authors":["Ying Zeng","Xue Yang","Qingyun Li","Yushi Chen","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2303.04989v2.pdf","comment":"10 pages, 8 figures, 8 tables, the source code is available at\n https://github.com/httle/ARS-DETR"},{"id":"http://arxiv.org/abs/2403.04492v3","updated":"2024-04-03T06:48:15Z","published":"2024-03-07T13:49:29Z","title":"Discriminative Sample-Guided and Parameter-Efficient Feature Space\n Adaptation for Cross-Domain Few-Shot Learning","summary":" In this paper, we look at cross-domain few-shot classification which presents\nthe challenging task of learning new classes in previously unseen domains with\nfew labelled examples. Existing methods, though somewhat effective, encounter\nseveral limitations, which we alleviate through two significant improvements.\nFirst, we introduce a lightweight parameter-efficient adaptation strategy to\naddress overfitting associated with fine-tuning a large number of parameters on\nsmall datasets. This strategy employs a linear transformation of pre-trained\nfeatures, significantly reducing the trainable parameter count. Second, we\nreplace the traditional nearest centroid classifier with a discriminative\nsample-aware loss function, enhancing the model's sensitivity to the inter- and\nintra-class variances within the training set for improved clustering in\nfeature space. Empirical evaluations on the Meta-Dataset benchmark showcase\nthat our approach not only improves accuracy up to 7.7\\% and 5.3\\% on\npreviously seen and unseen datasets, respectively, but also achieves the above\nperformance while being at least $\\sim3\\times$ more parameter-efficient than\nexisting methods, establishing a new state-of-the-art in cross-domain few-shot\nlearning. Our code is available at https://github.com/rashindrie/DIPA.\n","authors":["Rashindrie Perera","Saman Halgamuge"],"pdf_url":"https://arxiv.org/pdf/2403.04492v3.pdf","comment":"Code is available at this link: https://github.com/rashindrie/DIPA"},{"id":"http://arxiv.org/abs/2404.01700v2","updated":"2024-04-03T06:40:46Z","published":"2024-04-02T07:09:29Z","title":"MotionChain: Conversational Motion Controllers via Multimodal Prompts","summary":" Recent advancements in language models have demonstrated their adeptness in\nconducting multi-turn dialogues and retaining conversational context. However,\nthis proficiency remains largely unexplored in other multimodal generative\nmodels, particularly in human motion models. 
By integrating multi-turn\nconversations in controlling continuous virtual human movements, generative\nhuman motion models can achieve an intuitive and step-by-step process of human\ntask execution for humanoid robotics, game agents, or other embodied systems.\nIn this work, we present MotionChain, a conversational human motion controller\nto generate continuous and long-term human motion through multimodal prompts.\nSpecifically, MotionChain consists of multi-modal tokenizers that transform\nvarious data types such as text, image, and motion, into discrete tokens,\ncoupled with a Vision-Motion-aware Language model. By leveraging large-scale\nlanguage, vision-language, and vision-motion data to assist motion-related\ngeneration tasks, MotionChain thus comprehends each instruction in multi-turn\nconversation and generates human motions followed by these prompts. Extensive\nexperiments validate the efficacy of MotionChain, demonstrating\nstate-of-the-art performance in conversational motion generation, as well as\nmore intuitive manners of controlling and interacting with virtual humans.\n","authors":["Biao Jiang","Xin Chen","Chi Zhang","Fukun Yin","Zhuoyuan Li","Gang YU","Jiayuan Fan"],"pdf_url":"https://arxiv.org/pdf/2404.01700v2.pdf","comment":"14 pages, 4 figures"},{"id":"http://arxiv.org/abs/2312.12870v2","updated":"2024-04-03T06:11:17Z","published":"2023-12-20T09:34:22Z","title":"The Audio-Visual Conversational Graph: From an Egocentric-Exocentric\n Perspective","summary":" In recent years, the thriving development of research related to egocentric\nvideos has provided a unique perspective for the study of conversational\ninteractions, where both visual and audio signals play a crucial role. While\nmost prior work focus on learning about behaviors that directly involve the\ncamera wearer, we introduce the Ego-Exocentric Conversational Graph Prediction\nproblem, marking the first attempt to infer exocentric conversational\ninteractions from egocentric videos. We propose a unified multi-modal framework\n-- Audio-Visual Conversational Attention (AV-CONV), for the joint prediction of\nconversation behaviors -- speaking and listening -- for both the camera wearer\nas well as all other social partners present in the egocentric video.\nSpecifically, we adopt the self-attention mechanism to model the\nrepresentations across-time, across-subjects, and across-modalities. To\nvalidate our method, we conduct experiments on a challenging egocentric video\ndataset that includes multi-speaker and multi-conversation scenarios. Our\nresults demonstrate the superior performance of our method compared to a series\nof baselines. We also present detailed ablation studies to assess the\ncontribution of each component in our model. Check our project page at\nhttps://vjwq.github.io/AV-CONV/.\n","authors":["Wenqi Jia","Miao Liu","Hao Jiang","Ishwarya Ananthabhotla","James M. Rehg","Vamsi Krishna Ithapu","Ruohan Gao"],"pdf_url":"https://arxiv.org/pdf/2312.12870v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01941v2","updated":"2024-04-03T05:43:15Z","published":"2024-04-02T13:33:31Z","title":"LPSNet: End-to-End Human Pose and Shape Estimation with Lensless Imaging","summary":" Human pose and shape (HPS) estimation with lensless imaging is not only\nbeneficial to privacy protection but also can be used in covert surveillance\nscenarios due to the small size and simple structure of this device. 
However,\nthis task presents significant challenges due to the inherent ambiguity of the\ncaptured measurements and lacks effective methods for directly estimating human\npose and shape from lensless data. In this paper, we propose the first\nend-to-end framework to recover 3D human poses and shapes from lensless\nmeasurements to our knowledge. We specifically design a multi-scale lensless\nfeature decoder to decode the lensless measurements through the optically\nencoded mask for efficient feature extraction. We also propose a double-head\nauxiliary supervision mechanism to improve the estimation accuracy of human\nlimb ends. Besides, we establish a lensless imaging system and verify the\neffectiveness of our method on various datasets acquired by our lensless\nimaging system.\n","authors":["Haoyang Ge","Qiao Feng","Hailong Jia","Xiongzheng Li","Xiangjun Yin","You Zhou","Jingyu Yang","Kun Li"],"pdf_url":"https://arxiv.org/pdf/2404.01941v2.pdf","comment":"Accepted to CVPR 2024. More results available at\n https://cic.tju.edu.cn/faculty/likun/projects/LPSNet"},{"id":"http://arxiv.org/abs/2403.02561v2","updated":"2024-04-03T05:43:10Z","published":"2024-03-05T00:34:05Z","title":"Semantic Human Mesh Reconstruction with Textures","summary":" The field of 3D detailed human mesh reconstruction has made significant\nprogress in recent years. However, current methods still face challenges when\nused in industrial applications due to unstable results, low-quality meshes,\nand a lack of UV unwrapping and skinning weights. In this paper, we present\nSHERT, a novel pipeline that can reconstruct semantic human meshes with\ntextures and high-precision details. SHERT applies semantic- and normal-based\nsampling between the detailed surface (e.g. mesh and SDF) and the corresponding\nSMPL-X model to obtain a partially sampled semantic mesh and then generates the\ncomplete semantic mesh by our specifically designed self-supervised completion\nand refinement networks. Using the complete semantic mesh as a basis, we employ\na texture diffusion model to create human textures that are driven by both\nimages and texts. Our reconstructed meshes have stable UV unwrapping,\nhigh-quality triangle meshes, and consistent semantic information. The given\nSMPL-X model provides semantic information and shape priors, allowing SHERT to\nperform well even with incorrect and incomplete inputs. The semantic\ninformation also makes it easy to substitute and animate different body parts\nsuch as the face, body, and hands. Quantitative and qualitative experiments\ndemonstrate that SHERT is capable of producing high-fidelity and robust\nsemantic meshes that outperform state-of-the-art methods.\n","authors":["Xiaoyu Zhan","Jianxin Yang","Yuanqi Li","Jie Guo","Yanwen Guo","Wenping Wang"],"pdf_url":"https://arxiv.org/pdf/2403.02561v2.pdf","comment":"Accepted to CVPR 2024. 
Project page:\n https://zhanxy.xyz/projects/shert/"},{"id":"http://arxiv.org/abs/2404.02462v1","updated":"2024-04-03T05:04:55Z","published":"2024-04-03T05:04:55Z","title":"A Unified Membership Inference Method for Visual Self-supervised Encoder\n via Part-aware Capability","summary":" Self-supervised learning shows promise in harnessing extensive unlabeled\ndata, but it also confronts significant privacy concerns, especially in vision.\nIn this paper, we aim to perform membership inference on visual self-supervised\nmodels in a more realistic setting: self-supervised training method and details\nare unknown for an adversary when attacking as he usually faces a black-box\nsystem in practice. In this setting, considering that self-supervised model\ncould be trained by completely different self-supervised paradigms, e.g.,\nmasked image modeling and contrastive learning, with complex training details,\nwe propose a unified membership inference method called PartCrop. It is\nmotivated by the shared part-aware capability among models and stronger part\nresponse on the training data. Specifically, PartCrop crops parts of objects in\nan image to query responses with the image in representation space. We conduct\nextensive attacks on self-supervised models with different training protocols\nand structures using three widely used image datasets. The results verify the\neffectiveness and generalization of PartCrop. Moreover, to defend against\nPartCrop, we evaluate two common approaches, i.e., early stop and differential\nprivacy, and propose a tailored method called shrinking crop scale range. The\ndefense experiments indicate that all of them are effective. Our code is\navailable at https://github.com/JiePKU/PartCrop\n","authors":["Jie Zhu","Jirong Zha","Ding Li","Leye Wang"],"pdf_url":"https://arxiv.org/pdf/2404.02462v1.pdf","comment":"Membership Inference, Self-supervised learning"},{"id":"http://arxiv.org/abs/2404.02460v1","updated":"2024-04-03T05:02:46Z","published":"2024-04-03T05:02:46Z","title":"TSNet:A Two-stage Network for Image Dehazing with Multi-scale Fusion and\n Adaptive Learning","summary":" Image dehazing has been a popular topic of research for a long time. Previous\ndeep learning-based image dehazing methods have failed to achieve satisfactory\ndehazing effects on both synthetic datasets and real-world datasets, exhibiting\npoor generalization. Moreover, single-stage networks often result in many\nregions with artifacts and color distortion in output images. To address these\nissues, this paper proposes a two-stage image dehazing network called TSNet,\nmainly consisting of the multi-scale fusion module (MSFM) and the adaptive\nlearning module (ALM). Specifically, MSFM and ALM enhance the generalization of\nTSNet. The MSFM can obtain large receptive fields at multiple scales and\nintegrate features at different frequencies to reduce the differences between\ninputs and learning objectives. The ALM can actively learn of regions of\ninterest in images and restore texture details more effectively. Additionally,\nTSNet is designed as a two-stage network, where the first-stage network\nperforms image dehazing, and the second-stage network is employed to improve\nissues such as artifacts and color distortion present in the results of the\nfirst-stage network. 
We also change the learning objective from ground truth\nimages to opposite fog maps, which improves the learning efficiency of TSNet.\nExtensive experiments demonstrate that TSNet exhibits superior dehazing\nperformance on both synthetic and real-world datasets compared to previous\nstate-of-the-art methods.\n","authors":["Xiaolin Gong","Zehan Zheng","Heyuan Du"],"pdf_url":"https://arxiv.org/pdf/2404.02460v1.pdf","comment":"12 pages, 10 figures, 7 tables"},{"id":"http://arxiv.org/abs/2404.02457v1","updated":"2024-04-03T04:59:28Z","published":"2024-04-03T04:59:28Z","title":"RS3Mamba: Visual State Space Model for Remote Sensing Images Semantic\n Segmentation","summary":" Semantic segmentation of remote sensing images is a fundamental task in\ngeoscience research. However, there are some significant shortcomings for the\nwidely used convolutional neural networks (CNNs) and Transformers. The former\nis limited by its insufficient long-range modeling capabilities, while the\nlatter is hampered by its computational complexity. Recently, a novel visual\nstate space (VSS) model represented by Mamba has emerged, capable of modeling\nlong-range relationships with linear computability. In this work, we propose a\nnovel dual-branch network named remote sensing images semantic segmentation\nMamba (RS3Mamba) to incorporate this innovative technology into remote sensing\ntasks. Specifically, RS3Mamba utilizes VSS blocks to construct an auxiliary\nbranch, providing additional global information to convolution-based main\nbranch. Moreover, considering the distinct characteristics of the two branches,\nwe introduce a collaborative completion module (CCM) to enhance and fuse\nfeatures from the dual-encoder. Experimental results on two widely used\ndatasets, ISPRS Vaihingen and LoveDA Urban, demonstrate the effectiveness and\npotential of the proposed RS3Mamba. To the best of our knowledge, this is the\nfirst vision Mamba specifically designed for remote sensing images semantic\nsegmentation. The source code will be made available at\nhttps://github.com/sstary/SSRS.\n","authors":["Xianping Ma","Xiaokang Zhang","Man-On Pun"],"pdf_url":"https://arxiv.org/pdf/2404.02457v1.pdf","comment":"5 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.02447v1","updated":"2024-04-03T04:26:50Z","published":"2024-04-03T04:26:50Z","title":"A Novel Approach to Breast Cancer Histopathological Image Classification\n Using Cross-Colour Space Feature Fusion and Quantum-Classical Stack Ensemble\n Method","summary":" Breast cancer classification stands as a pivotal pillar in ensuring timely\ndiagnosis and effective treatment. This study with histopathological images\nunderscores the profound significance of harnessing the synergistic\ncapabilities of colour space ensembling and quantum-classical stacking to\nelevate the precision of breast cancer classification. By delving into the\ndistinct colour spaces of RGB, HSV and CIE L*u*v, the authors initiated a\ncomprehensive investigation guided by advanced methodologies. Employing the\nDenseNet121 architecture for feature extraction the authors have capitalized on\nthe robustness of Random Forest, SVM, QSVC, and VQC classifiers. 
This research\nencompasses a unique feature fusion technique within the colour space ensemble.\nThis approach not only deepens our comprehension of breast cancer\nclassification but also marks a milestone in personalized medical assessment.\nThe amalgamation of quantum and classical classifiers through stacking emerges\nas a potent catalyst, effectively mitigating the inherent constraints of\nindividual classifiers, paving a robust path towards more dependable and\nrefined breast cancer identification. Through rigorous experimentation and\nmeticulous analysis, fusion of colour spaces like RGB with HSV and RGB with CIE\nL*u*v presents a classification accuracy nearing the value of unity. This\nunderscores the transformative potential of our approach, where the fusion of\ndiverse colour spaces and the synergy of quantum and classical realms converge\nto establish a new horizon in medical diagnostics. Thus the implications of\nthis research extend across medical disciplines, offering promising avenues for\nadvancing diagnostic accuracy and treatment efficacy.\n","authors":["Sambit Mallick","Snigdha Paul","Anindya Sen"],"pdf_url":"https://arxiv.org/pdf/2404.02447v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11994v2","updated":"2024-04-03T04:07:50Z","published":"2023-12-19T09:37:25Z","title":"Optimizing Diffusion Noise Can Serve As Universal Motion Priors","summary":" We propose Diffusion Noise Optimization (DNO), a new method that effectively\nleverages existing motion diffusion models as motion priors for a wide range of\nmotion-related tasks. Instead of training a task-specific diffusion model for\neach new task, DNO operates by optimizing the diffusion latent noise of an\nexisting pre-trained text-to-motion model. Given the corresponding latent noise\nof a human motion, it propagates the gradient from the target criteria defined\non the motion space through the whole denoising process to update the diffusion\nlatent noise. As a result, DNO supports any use cases where criteria can be\ndefined as a function of motion. In particular, we show that, for motion\nediting and control, DNO outperforms existing methods in both achieving the\nobjective and preserving the motion content. DNO accommodates a diverse range\nof editing modes, including changing trajectory, pose, joint locations, or\navoiding newly added obstacles. In addition, DNO is effective in motion\ndenoising and completion, producing smooth and realistic motion from noisy and\npartial inputs. DNO achieves these results at inference time without the need\nfor model retraining, offering great versatility for any defined reward or loss\nfunction on the motion representation.\n","authors":["Korrawe Karunratanakul","Konpat Preechakul","Emre Aksan","Thabo Beeler","Supasorn Suwajanakorn","Siyu Tang"],"pdf_url":"https://arxiv.org/pdf/2312.11994v2.pdf","comment":"CVPR 2024. Project page: https://korrawe.github.io/dno-project/"},{"id":"http://arxiv.org/abs/2403.11056v2","updated":"2024-04-03T04:00:53Z","published":"2024-03-17T02:06:03Z","title":"Analytic-Splatting: Anti-Aliased 3D Gaussian Splatting via Analytic\n Integration","summary":" The 3D Gaussian Splatting (3DGS) gained its popularity recently by combining\nthe advantages of both primitive-based and volumetric 3D representations,\nresulting in improved quality and efficiency for 3D scene rendering. However,\n3DGS is not alias-free, and its rendering at varying resolutions could produce\nsevere blurring or jaggies. 
This is because 3DGS treats each pixel as an\nisolated, single point rather than as an area, causing insensitivity to changes\nin the footprints of pixels. Consequently, this discrete sampling scheme\ninevitably results in aliasing, owing to the restricted sampling bandwidth. In\nthis paper, we derive an analytical solution to address this issue. More\nspecifically, we use a conditioned logistic function as the analytic\napproximation of the cumulative distribution function (CDF) in a\none-dimensional Gaussian signal and calculate the Gaussian integral by\nsubtracting the CDFs. We then introduce this approximation in the\ntwo-dimensional pixel shading, and present Analytic-Splatting, which\nanalytically approximates the Gaussian integral within the 2D-pixel window area\nto better capture the intensity response of each pixel. Moreover, we use the\napproximated response of the pixel window integral area to participate in the\ntransmittance calculation of volume rendering, making Analytic-Splatting\nsensitive to the changes in pixel footprint at different resolutions.\nExperiments on various datasets validate that our approach has better\nanti-aliasing capability that gives more details and better fidelity.\n","authors":["Zhihao Liang","Qi Zhang","Wenbo Hu","Ying Feng","Lei Zhu","Kui Jia"],"pdf_url":"https://arxiv.org/pdf/2403.11056v2.pdf","comment":"29 pages"},{"id":"http://arxiv.org/abs/2401.13201v2","updated":"2024-04-03T03:52:44Z","published":"2024-01-24T03:07:26Z","title":"MLLMReID: Multimodal Large Language Model-based Person Re-identification","summary":" Multimodal large language models (MLLM) have achieved satisfactory results in\nmany tasks. However, their performance in the task of person re-identification\n(ReID) has not been explored to date. This paper will investigate how to adapt\nthem for the task of ReID. An intuitive idea is to fine-tune MLLM with ReID\nimage-text datasets, and then use their visual encoder as a backbone for ReID.\nHowever, there still exist two apparent issues: (1) Designing instructions for\nReID, MLLMs may overfit specific instructions, and designing a variety of\ninstructions will lead to higher costs. (2) Latent image feature vectors from\nLLMs are not involved in loss computation. Instructional learning, aligning\nimage-text features, results in indirect optimization and a learning objective\nthat inadequately utilizes features, limiting effectiveness in person feature\nlearning. To address these problems, this paper proposes MLLMReID: Multimodal\nLarge Language Model-based ReID. Firstly, we proposed Common Instruction, a\nsimple approach that leverages the essence ability of LLMs to continue writing,\navoiding complex and diverse instruction design. Secondly, we proposed\nDirectReID, which effectively employs the latent image feature vectors of\nimages outputted by LLMs in ReID tasks. The experimental results demonstrate\nthe superiority of our method. We will open-source the code on GitHub.\n","authors":["Shan Yang","Yongfei Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.13201v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.02402v3","updated":"2024-04-03T03:45:38Z","published":"2024-01-04T18:39:32Z","title":"3D Open-Vocabulary Panoptic Segmentation with 2D-3D Vision-Language\n Distillation","summary":" 3D panoptic segmentation is a challenging perception task, especially in\nautonomous driving. It aims to predict both semantic and instance annotations\nfor 3D points in a scene. 
Although prior 3D panoptic segmentation approaches\nhave achieved great performance on closed-set benchmarks, generalizing these\napproaches to unseen things and unseen stuff categories remains an open\nproblem. For unseen object categories, 2D open-vocabulary segmentation has\nachieved promising results that solely rely on frozen CLIP backbones and\nensembling multiple classification outputs. However, we find that simply\nextending these 2D models to 3D does not guarantee good performance due to poor\nper-mask classification quality, especially for novel stuff categories. In this\npaper, we propose the first method to tackle 3D open-vocabulary panoptic\nsegmentation. Our model takes advantage of the fusion between learnable LiDAR\nfeatures and dense frozen vision CLIP features, using a single classification\nhead to make predictions for both base and novel classes. To further improve\nthe classification performance on novel classes and leverage the CLIP model, we\npropose two novel loss functions: object-level distillation loss and\nvoxel-level distillation loss. Our experiments on the nuScenes and\nSemanticKITTI datasets show that our method outperforms the strong baseline by\na large margin.\n","authors":["Zihao Xiao","Longlong Jing","Shangxuan Wu","Alex Zihao Zhu","Jingwei Ji","Chiyu Max Jiang","Wei-Chih Hung","Thomas Funkhouser","Weicheng Kuo","Anelia Angelova","Yin Zhou","Shiwei Sheng"],"pdf_url":"https://arxiv.org/pdf/2401.02402v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02424v1","updated":"2024-04-03T03:27:01Z","published":"2024-04-03T03:27:01Z","title":"RESSA: Repair Sparse Vision-Language Models via Sparse Cross-Modality\n Adaptation","summary":" Vision-Language Models (VLMs), integrating diverse information from multiple\nmodalities, have shown remarkable success across various tasks. However,\ndeploying VLMs, comprising large-scale vision and language models poses\nchallenges in resource-constrained scenarios. While pruning followed by\nfinetuning offers a potential solution to maintain performance with smaller\nmodel sizes, its application to VLMs remains relatively unexplored, presenting\ntwo main questions: how to distribute sparsity across different\nmodality-specific models, and how to repair the performance of pruned sparse\nVLMs. To answer the first question, we conducted preliminary studies on VLM\npruning and found that pruning vision models and language models with the same\nsparsity ratios contribute to nearly optimal performance. For the second\nquestion, unlike finetuning unimodal sparse models, sparse VLMs involve\ncross-modality interactions, requiring specialized techniques for post-pruning\nperformance repair. Moreover, while parameter-efficient LoRA finetuning has\nbeen proposed to repair the performance of sparse models, a significant\nchallenge of weights merging arises due to the incompatibility of dense LoRA\nmodules with sparse models that destroy the sparsity of pruned models. To\ntackle these challenges, we propose to Repair Sparse Vision-Language Models via\nSparse Cross-modality Adaptation (RESSA). RESSA utilizes cross-modality\nfinetuning to enhance task-specific performance and facilitate knowledge\ndistillation from original dense models. Additionally, we introduce SparseLoRA,\nwhich applies sparsity directly to LoRA weights, enabling seamless integration\nwith sparse models. 
Our experimental results validate the effectiveness of\nRESSA, showcasing significant enhancements, such as an 11.3\\% improvement under\n2:4 sparsity and a remarkable 47.6\\% enhancement under unstructured 70\\%\nsparsity.\n","authors":["Shwai He","Tianlong Chen"],"pdf_url":"https://arxiv.org/pdf/2404.02424v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.15472v4","updated":"2024-04-03T03:15:55Z","published":"2022-06-30T17:59:08Z","title":"On-Device Training Under 256KB Memory","summary":" On-device training enables the model to adapt to new data collected from the\nsensors by fine-tuning a pre-trained model. Users can benefit from customized\nAI models without having to transfer the data to the cloud, protecting the\nprivacy. However, the training memory consumption is prohibitive for IoT\ndevices that have tiny memory resources. We propose an algorithm-system\nco-design framework to make on-device training possible with only 256KB of\nmemory. On-device training faces two unique challenges: (1) the quantized\ngraphs of neural networks are hard to optimize due to low bit-precision and the\nlack of normalization; (2) the limited hardware resource does not allow full\nback-propagation. To cope with the optimization difficulty, we propose\nQuantization-Aware Scaling to calibrate the gradient scales and stabilize 8-bit\nquantized training. To reduce the memory footprint, we propose Sparse Update to\nskip the gradient computation of less important layers and sub-tensors. The\nalgorithm innovation is implemented by a lightweight training system, Tiny\nTraining Engine, which prunes the backward computation graph to support sparse\nupdates and offload the runtime auto-differentiation to compile time. Our\nframework is the first solution to enable tiny on-device training of\nconvolutional neural networks under 256KB SRAM and 1MB Flash without auxiliary\nmemory, using less than 1/1000 of the memory of PyTorch and TensorFlow while\nmatching the accuracy on tinyML application VWW. Our study enables IoT devices\nnot only to perform inference but also to continuously adapt to new data for\non-device lifelong learning. A video demo can be found here:\nhttps://youtu.be/0pUFZYdoMY8.\n","authors":["Ji Lin","Ligeng Zhu","Wei-Ming Chen","Wei-Chen Wang","Chuang Gan","Song Han"],"pdf_url":"https://arxiv.org/pdf/2206.15472v4.pdf","comment":"NeurIPS 2022"},{"id":"http://arxiv.org/abs/2403.19428v2","updated":"2024-04-03T02:59:24Z","published":"2024-03-28T13:58:05Z","title":"Burst Super-Resolution with Diffusion Models for Improving Perceptual\n Quality","summary":" While burst LR images are useful for improving the SR image quality compared\nwith a single LR image, prior SR networks accepting the burst LR images are\ntrained in a deterministic manner, which is known to produce a blurry SR image.\nIn addition, it is difficult to perfectly align the burst LR images, making the\nSR image more blurry. Since such blurry images are perceptually degraded, we\naim to reconstruct the sharp high-fidelity boundaries. Such high-fidelity\nimages can be reconstructed by diffusion models. However, prior SR methods\nusing the diffusion model are not properly optimized for the burst SR task.\nSpecifically, the reverse process starting from a random sample is not\noptimized for image enhancement and restoration methods, including burst SR. In\nour proposed method, on the other hand, burst LR features are used to\nreconstruct the initial burst SR image that is fed into an intermediate step in\nthe diffusion model. 
This reverse process from the intermediate step 1) skips\ndiffusion steps for reconstructing the global structure of the image and 2)\nfocuses on steps for refining detailed textures. Our experimental results\ndemonstrate that our method can improve the scores of the perceptual quality\nmetrics. Code: https://github.com/placerkyo/BSRD\n","authors":["Kyotaro Tokoro","Kazutoshi Akita","Norimichi Ukita"],"pdf_url":"https://arxiv.org/pdf/2403.19428v2.pdf","comment":"Accepted to IJCNN 2024 (International Joint Conference on Neural\n Networks)"},{"id":"http://arxiv.org/abs/2403.19920v2","updated":"2024-04-03T02:48:47Z","published":"2024-03-29T02:17:09Z","title":"MI-NeRF: Learning a Single Face NeRF from Multiple Identities","summary":" In this work, we introduce a method that learns a single dynamic neural\nradiance field (NeRF) from monocular talking face videos of multiple\nidentities. NeRFs have shown remarkable results in modeling the 4D dynamics and\nappearance of human faces. However, they require per-identity optimization.\nAlthough recent approaches have proposed techniques to reduce the training and\nrendering time, increasing the number of identities can be expensive. We\nintroduce MI-NeRF (multi-identity NeRF), a single unified network that models\ncomplex non-rigid facial motion for multiple identities, using only monocular\nvideos of arbitrary length. The core premise in our method is to learn the\nnon-linear interactions between identity and non-identity specific information\nwith a multiplicative module. By training on multiple videos simultaneously,\nMI-NeRF not only reduces the total training time compared to standard\nsingle-identity NeRFs, but also demonstrates robustness in synthesizing novel\nexpressions for any input identity. We present results for both facial\nexpression transfer and talking face video synthesis. Our method can be further\npersonalized for a target identity given only a short video.\n","authors":["Aggelina Chatziagapi","Grigorios G. Chrysos","Dimitris Samaras"],"pdf_url":"https://arxiv.org/pdf/2403.19920v2.pdf","comment":"Project page: https://aggelinacha.github.io/MI-NeRF/"},{"id":"http://arxiv.org/abs/2403.14530v2","updated":"2024-04-03T02:46:54Z","published":"2024-03-21T16:28:58Z","title":"HAC: Hash-grid Assisted Context for 3D Gaussian Splatting Compression","summary":" 3D Gaussian Splatting (3DGS) has emerged as a promising framework for novel\nview synthesis, boasting rapid rendering speed with high fidelity. However, the\nsubstantial Gaussians and their associated attributes necessitate effective\ncompression techniques. Nevertheless, the sparse and unorganized nature of the\npoint cloud of Gaussians (or anchors in our paper) presents challenges for\ncompression. To address this, we make use of the relations between the\nunorganized anchors and the structured hash grid, leveraging their mutual\ninformation for context modeling, and propose a Hash-grid Assisted Context\n(HAC) framework for highly compact 3DGS representation. Our approach introduces\na binary hash grid to establish continuous spatial consistencies, allowing us\nto unveil the inherent spatial relations of anchors through a carefully\ndesigned context model. To facilitate entropy coding, we utilize Gaussian\ndistributions to accurately estimate the probability of each quantized\nattribute, where an adaptive quantization module is proposed to enable\nhigh-precision quantization of these attributes for improved fidelity\nrestoration. 
Additionally, we incorporate an adaptive masking strategy to\neliminate invalid Gaussians and anchors. Importantly, our work is the pioneer\nto explore context-based compression for 3DGS representation, resulting in a\nremarkable size reduction of over $75\\times$ compared to vanilla 3DGS, while\nsimultaneously improving fidelity, and achieving over $11\\times$ size reduction\nover SOTA 3DGS compression approach Scaffold-GS. Our code is available here:\nhttps://github.com/YihangChen-ee/HAC\n","authors":["Yihang Chen","Qianyi Wu","Jianfei Cai","Mehrtash Harandi","Weiyao Lin"],"pdf_url":"https://arxiv.org/pdf/2403.14530v2.pdf","comment":"Project Page: https://yihangchen-ee.github.io/project_hac/ Code:\n https://github.com/YihangChen-ee/HAC"},{"id":"http://arxiv.org/abs/2404.02415v1","updated":"2024-04-03T02:40:35Z","published":"2024-04-03T02:40:35Z","title":"What Are We Measuring When We Evaluate Large Vision-Language Models? An\n Analysis of Latent Factors and Biases","summary":" Vision-language (VL) models, pretrained on colossal image-text datasets, have\nattained broad VL competence that is difficult to evaluate. A common belief is\nthat a small number of VL skills underlie the variety of VL tests. In this\npaper, we perform a large-scale transfer learning experiment aimed at\ndiscovering latent VL skills from data. We reveal interesting characteristics\nthat have important implications for test suite design. First, generation tasks\nsuffer from a length bias, suggesting benchmarks should balance tasks with\nvarying output lengths. Second, we demonstrate that factor analysis\nsuccessfully identifies reasonable yet surprising VL skill factors, suggesting\nbenchmarks could leverage similar analyses for task selection. Finally, we\npresent a new dataset, OLIVE (https://github.com/jq-zh/olive-dataset), which\nsimulates user instructions in the wild and presents challenges dissimilar to\nall datasets we tested. Our findings contribute to the design of balanced and\nbroad-coverage vision-language evaluation methods.\n","authors":["Anthony Meng Huat Tiong","Junqi Zhao","Boyang Li","Junnan Li","Steven C. H. Hoi","Caiming Xiong"],"pdf_url":"https://arxiv.org/pdf/2404.02415v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.07935v2","updated":"2024-04-03T02:27:59Z","published":"2022-04-17T05:42:41Z","title":"Causal Intervention for Subject-Deconfounded Facial Action Unit\n Recognition","summary":" Subject-invariant facial action unit (AU) recognition remains challenging for\nthe reason that the data distribution varies among subjects. In this paper, we\npropose a causal inference framework for subject-invariant facial action unit\nrecognition. To illustrate the causal effect existing in AU recognition task,\nwe formulate the causalities among facial images, subjects, latent AU semantic\nrelations, and estimated AU occurrence probabilities via a structural causal\nmodel. By constructing such a causal diagram, we clarify the causal effect\namong variables and propose a plug-in causal intervention module, CIS, to\ndeconfound the confounder \\emph{Subject} in the causal diagram. 
Extensive\nexperiments conducted on two commonly used AU benchmark datasets, BP4D and\nDISFA, show the effectiveness of our CIS, and the model with CIS inserted,\nCISNet, has achieved state-of-the-art performance.\n","authors":["Yingjie Chen","Diqi Chen","Tao Wang","Yizhou Wang","Yun Liang"],"pdf_url":"https://arxiv.org/pdf/2204.07935v2.pdf","comment":"Accepted by AAAI2022"},{"id":"http://arxiv.org/abs/2404.02410v1","updated":"2024-04-03T02:26:15Z","published":"2024-04-03T02:26:15Z","title":"TCLC-GS: Tightly Coupled LiDAR-Camera Gaussian Splatting for Surrounding\n Autonomous Driving Scenes","summary":" Most 3D Gaussian Splatting (3D-GS) based methods for urban scenes initialize\n3D Gaussians directly with 3D LiDAR points, which not only underutilizes LiDAR\ndata capabilities but also overlooks the potential advantages of fusing LiDAR\nwith camera data. In this paper, we design a novel tightly coupled LiDAR-Camera\nGaussian Splatting (TCLC-GS) to fully leverage the combined strengths of both\nLiDAR and camera sensors, enabling rapid, high-quality 3D reconstruction and\nnovel view RGB/depth synthesis. TCLC-GS designs a hybrid explicit (colorized 3D\nmesh) and implicit (hierarchical octree feature) 3D representation derived from\nLiDAR-camera data, to enrich the properties of 3D Gaussians for splatting. 3D\nGaussian's properties are not only initialized in alignment with the 3D mesh\nwhich provides more completed 3D shape and color information, but are also\nendowed with broader contextual information through retrieved octree implicit\nfeatures. During the Gaussian Splatting optimization process, the 3D mesh\noffers dense depth information as supervision, which enhances the training\nprocess by learning of a robust geometry. Comprehensive evaluations conducted\non the Waymo Open Dataset and nuScenes Dataset validate our method's\nstate-of-the-art (SOTA) performance. Utilizing a single NVIDIA RTX 3090 Ti, our\nmethod demonstrates fast training and achieves real-time RGB and depth\nrendering at 90 FPS in resolution of 1920x1280 (Waymo), and 120 FPS in\nresolution of 1600x900 (nuScenes) in urban scenarios.\n","authors":["Cheng Zhao","Su Sun","Ruoyu Wang","Yuliang Guo","Jun-Jun Wan","Zhou Huang","Xinyu Huang","Yingjie Victor Chen","Liu Ren"],"pdf_url":"https://arxiv.org/pdf/2404.02410v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02405v1","updated":"2024-04-03T02:16:30Z","published":"2024-04-03T02:16:30Z","title":"TE-TAD: Towards Full End-to-End Temporal Action Detection via\n Time-Aligned Coordinate Expression","summary":" In this paper, we investigate that the normalized coordinate expression is a\nkey factor as reliance on hand-crafted components in query-based detectors for\ntemporal action detection (TAD). Despite significant advancements towards an\nend-to-end framework in object detection, query-based detectors have been\nlimited in achieving full end-to-end modeling in TAD. To address this issue, we\npropose \\modelname{}, a full end-to-end temporal action detection transformer\nthat integrates time-aligned coordinate expression. We reformulate coordinate\nexpression utilizing actual timeline values, ensuring length-invariant\nrepresentations from the extremely diverse video duration environment.\nFurthermore, our proposed adaptive query selection dynamically adjusts the\nnumber of queries based on video length, providing a suitable solution for\nvarying video durations compared to a fixed query set. 
Our approach not only\nsimplifies the TAD process by eliminating the need for hand-crafted components\nbut also significantly improves the performance of query-based detectors. Our\nTE-TAD outperforms the previous query-based detectors and achieves competitive\nperformance compared to state-of-the-art methods on popular benchmark datasets.\nCode is available at: https://github.com/Dotori-HJ/TE-TAD\n","authors":["Ho-Joong Kim","Jung-Ho Hong","Heejon Kong","Seong-Whan Lee"],"pdf_url":"https://arxiv.org/pdf/2404.02405v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02396v1","updated":"2024-04-03T01:55:15Z","published":"2024-04-03T01:55:15Z","title":"Enhancing Diffusion-based Point Cloud Generation with Smoothness\n Constraint","summary":" Diffusion models have been popular for point cloud generation tasks. Existing\nworks utilize the forward diffusion process to convert the original point\ndistribution into a noise distribution and then learn the reverse diffusion\nprocess to recover the point distribution from the noise distribution. However,\nthe reverse diffusion process can produce samples with non-smooth points on the\nsurface because of the ignorance of the point cloud geometric properties. We\npropose alleviating the problem by incorporating the local smoothness\nconstraint into the diffusion framework for point cloud generation. Experiments\ndemonstrate the proposed model can generate realistic shapes and smoother point\nclouds, outperforming multiple state-of-the-art methods.\n","authors":["Yukun Li","Liping Liu"],"pdf_url":"https://arxiv.org/pdf/2404.02396v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02394v1","updated":"2024-04-03T01:36:27Z","published":"2024-04-03T01:36:27Z","title":"Cohort-Individual Cooperative Learning for Multimodal Cancer Survival\n Analysis","summary":" Recently, we have witnessed impressive achievements in cancer survival\nanalysis by integrating multimodal data, e.g., pathology images and genomic\nprofiles. However, the heterogeneity and high dimensionality of these\nmodalities pose significant challenges for extracting discriminative\nrepresentations while maintaining good generalization. In this paper, we\npropose a Cohort-individual Cooperative Learning (CCL) framework to advance\ncancer survival analysis by collaborating knowledge decomposition and cohort\nguidance. Specifically, first, we propose a Multimodal Knowledge Decomposition\n(MKD) module to explicitly decompose multimodal knowledge into four distinct\ncomponents: redundancy, synergy and uniqueness of the two modalities. Such a\ncomprehensive decomposition can enlighten the models to perceive easily\noverlooked yet important information, facilitating an effective multimodal\nfusion. Second, we propose a Cohort Guidance Modeling (CGM) to mitigate the\nrisk of overfitting task-irrelevant information. It can promote a more\ncomprehensive and robust understanding of the underlying multimodal data, while\navoiding the pitfalls of overfitting and enhancing the generalization ability\nof the model. By cooperating the knowledge decomposition and cohort guidance\nmethods, we develop a robust multimodal survival analysis model with enhanced\ndiscrimination and generalization abilities. 
Extensive experimental results on\nfive cancer datasets demonstrate the effectiveness of our model in integrating\nmultimodal data for survival analysis.\n","authors":["Huajun Zhou","Fengtao Zhou","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2404.02394v1.pdf","comment":"10 pages, 9 figures"},{"id":"http://arxiv.org/abs/2404.02391v1","updated":"2024-04-03T01:29:30Z","published":"2024-04-03T01:29:30Z","title":"APC2Mesh: Bridging the gap from occluded building façades to full 3D\n models","summary":" The benefits of having digital twins of urban buildings are numerous.\nHowever, a major difficulty encountered in their creation from airborne LiDAR\npoint clouds is the effective means of accurately reconstructing significant\nocclusions amidst point density variations and noise. To bridge the\nnoise/sparsity/occlusion gap and generate high fidelity 3D building models, we\npropose APC2Mesh which integrates point completion into a 3D reconstruction\npipeline, enabling the learning of dense geometrically accurate representation\nof buildings. Specifically, we leveraged complete points generated from\noccluded ones as input to a linearized skip attention-based deformation network\nfor 3D mesh reconstruction. In our experiments, conducted on 3 different\nscenes, we demonstrate that: (1) APC2Mesh delivers comparatively superior\nresults, indicating its efficacy in handling the challenges of occluded\nairborne building points of diverse styles and complexities. (2) The\ncombination of point completion with typical deep learning-based 3D point cloud\nreconstruction methods offers a direct and effective solution for\nreconstructing significantly occluded airborne building points. As such, this\nneural integration holds promise for advancing the creation of digital twins\nfor urban buildings with greater accuracy and fidelity.\n","authors":["Perpetual Hope Akwensi","Akshay Bharadwaj","Ruisheng Wang"],"pdf_url":"https://arxiv.org/pdf/2404.02391v1.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2404.02388v1","updated":"2024-04-03T01:13:05Z","published":"2024-04-03T01:13:05Z","title":"CAPE: CAM as a Probabilistic Ensemble for Enhanced DNN Interpretation","summary":" Deep Neural Networks (DNNs) are widely used for visual classification tasks,\nbut their complex computation process and black-box nature hinder decision\ntransparency and interpretability. Class activation maps (CAMs) and recent\nvariants provide ways to visually explain the DNN decision-making process by\ndisplaying 'attention' heatmaps of the DNNs. Nevertheless, the CAM explanation\nonly offers relative attention information, that is, on an attention heatmap,\nwe can interpret which image region is more or less important than the others.\nHowever, these regions cannot be meaningfully compared across classes, and the\ncontribution of each region to the model's class prediction is not revealed. To\naddress these challenges that ultimately lead to better DNN Interpretation, in\nthis paper, we propose CAPE, a novel reformulation of CAM that provides a\nunified and probabilistically meaningful assessment of the contributions of\nimage regions. We quantitatively and qualitatively compare CAPE with\nstate-of-the-art CAM methods on CUB and ImageNet benchmark datasets to\ndemonstrate enhanced interpretability. We also test on a cytology imaging\ndataset depicting a challenging Chronic Myelomonocytic Leukemia (CMML)\ndiagnosis problem. 
Code is available at: https://github.com/AIML-MED/CAPE.\n","authors":["Townim Faisal Chowdhury","Kewen Liao","Vu Minh Hieu Phan","Minh-Son To","Yutong Xie","Kevin Hung","David Ross","Anton van den Hengel","Johan W. Verjans","Zhibin Liao"],"pdf_url":"https://arxiv.org/pdf/2404.02388v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02370v1","updated":"2024-04-03T00:09:05Z","published":"2024-04-03T00:09:05Z","title":"Enhancing Human-Computer Interaction in Chest X-ray Analysis using\n Vision and Language Model with Eye Gaze Patterns","summary":" Recent advancements in Computer Assisted Diagnosis have shown promising\nperformance in medical imaging tasks, particularly in chest X-ray analysis.\nHowever, the interaction between these models and radiologists has been\nprimarily limited to input images. This work proposes a novel approach to\nenhance human-computer interaction in chest X-ray analysis using\nVision-Language Models (VLMs) enhanced with radiologists' attention by\nincorporating eye gaze data alongside textual prompts. Our approach leverages\nheatmaps generated from eye gaze data, overlaying them onto medical images to\nhighlight areas of intense radiologist's focus during chest X-ray evaluation.\nWe evaluate this methodology in tasks such as visual question answering, chest\nX-ray report automation, error detection, and differential diagnosis. Our\nresults demonstrate the inclusion of eye gaze information significantly\nenhances the accuracy of chest X-ray analysis. Also, the impact of eye gaze on\nfine-tuning was confirmed as it outperformed other medical VLMs in all tasks\nexcept visual question answering. This work marks the potential of leveraging\nboth the VLM's capabilities and the radiologist's domain knowledge to improve\nthe capabilities of AI models in medical imaging, paving a novel way for\nComputer Assisted Diagnosis with a human-centred AI.\n","authors":["Yunsoo Kim","Jinge Wu","Yusuf Abdulle","Yue Gao","Honghan Wu"],"pdf_url":"https://arxiv.org/pdf/2404.02370v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2404.03121v1","updated":"2024-04-03T23:59:59Z","published":"2024-04-03T23:59:59Z","title":"Utilizing Computer Vision for Continuous Monitoring of Vaccine Side\n Effects in Experimental Mice","summary":" The demand for improved efficiency and accuracy in vaccine safety assessments\nis increasing. Here, we explore the application of computer vision technologies\nto automate the monitoring of experimental mice for potential side effects\nafter vaccine administration. Traditional observation methods are\nlabor-intensive and lack the capability for continuous monitoring. By deploying\na computer vision system, our research aims to improve the efficiency and\naccuracy of vaccine safety assessments. The methodology involves training\nmachine learning models on annotated video data of mice behaviors pre- and\npost-vaccination. Preliminary results indicate that computer vision effectively\nidentifies subtle changes, signaling possible side effects. 
Therefore, our\napproach has the potential to significantly enhance the monitoring process in\nvaccine trials in animals, providing a practical solution to the limitations of\nhuman observation.\n","authors":["Chuang Li","Shuai Shao","Willian Mikason","Rubing Lin","Yantong Liu"],"pdf_url":"https://arxiv.org/pdf/2404.03121v1.pdf","comment":"1 figure"},{"id":"http://arxiv.org/abs/2404.03118v1","updated":"2024-04-03T23:57:34Z","published":"2024-04-03T23:57:34Z","title":"LVLM-Intrepret: An Interpretability Tool for Large Vision-Language\n Models","summary":" In the rapidly evolving landscape of artificial intelligence, multi-modal\nlarge language models are emerging as a significant area of interest. These\nmodels, which combine various forms of data input, are becoming increasingly\npopular. However, understanding their internal mechanisms remains a complex\ntask. Numerous advancements have been made in the field of explainability tools\nand mechanisms, yet there is still much to explore. In this work, we present a\nnovel interactive application aimed towards understanding the internal\nmechanisms of large vision-language models. Our interface is designed to\nenhance the interpretability of the image patches, which are instrumental in\ngenerating an answer, and assess the efficacy of the language model in\ngrounding its output in the image. With our application, a user can\nsystematically investigate the model and uncover system limitations, paving the\nway for enhancements in system capabilities. Finally, we present a case study\nof how our application can aid in understanding failure mechanisms in a popular\nlarge multi-modal model: LLaVA.\n","authors":["Gabriela Ben Melech Stan","Raanan Yehezkel Rohekar","Yaniv Gurwicz","Matthew Lyle Olson","Anahita Bhiwandiwalla","Estelle Aflalo","Chenfei Wu","Nan Duan","Shao-Yen Tseng","Vasudev Lal"],"pdf_url":"https://arxiv.org/pdf/2404.03118v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03110v1","updated":"2024-04-03T23:24:25Z","published":"2024-04-03T23:24:25Z","title":"Ego-Motion Aware Target Prediction Module for Robust Multi-Object\n Tracking","summary":" Multi-object tracking (MOT) is a prominent task in computer vision with\napplication in autonomous driving, responsible for the simultaneous tracking of\nmultiple object trajectories. Detection-based multi-object tracking (DBT)\nalgorithms detect objects using an independent object detector and predict the\nimminent location of each target. Conventional prediction methods in DBT\nutilize Kalman Filter(KF) to extrapolate the target location in the upcoming\nframes by supposing a constant velocity motion model. These methods are\nespecially hindered in autonomous driving applications due to dramatic camera\nmotion or unavailable detections. Such limitations lead to tracking failures\nmanifested by numerous identity switches and disrupted trajectories. In this\npaper, we introduce a novel KF-based prediction module called the Ego-motion\nAware Target Prediction (EMAP) module by focusing on the integration of camera\nmotion and depth information with object motion models. Our proposed method\ndecouples the impact of camera rotational and translational velocity from the\nobject trajectories by reformulating the Kalman Filter. This reformulation\nenables us to reject the disturbances caused by camera motion and maximizes the\nreliability of the object motion model. We integrate our module with four\nstate-of-the-art base MOT algorithms, namely OC-SORT, Deep OC-SORT, ByteTrack,\nand BoT-SORT. 
In particular, our evaluation on the KITTI MOT dataset\ndemonstrates that EMAP remarkably drops the number of identity switches (IDSW)\nof OC-SORT and Deep OC-SORT by 73% and 21%, respectively. At the same time, it\nelevates other performance metrics such as HOTA by more than 5%. Our source\ncode is available at https://github.com/noyzzz/EMAP.\n","authors":["Navid Mahdian","Mohammad Jani","Amir M. Soufi Enayati","Homayoun Najjaran"],"pdf_url":"https://arxiv.org/pdf/2404.03110v1.pdf","comment":"7 pages, 4 figures, submitted to IROS2024"},{"id":"http://arxiv.org/abs/2404.03109v1","updated":"2024-04-03T23:20:40Z","published":"2024-04-03T23:20:40Z","title":"Many-to-many Image Generation with Auto-regressive Diffusion Models","summary":" Recent advancements in image generation have made significant progress, yet\nexisting models present limitations in perceiving and generating an arbitrary\nnumber of interrelated images within a broad context. This limitation becomes\nincreasingly critical as the demand for multi-image scenarios, such as\nmulti-view images and visual narratives, grows with the expansion of multimedia\nplatforms. This paper introduces a domain-general framework for many-to-many\nimage generation, capable of producing interrelated image series from a given\nset of images, offering a scalable solution that obviates the need for\ntask-specific solutions across different multi-image scenarios. To facilitate\nthis, we present MIS, a novel large-scale multi-image dataset, containing 12M\nsynthetic multi-image samples, each with 25 interconnected images. Utilizing\nStable Diffusion with varied latent noises, our method produces a set of\ninterconnected images from a single caption. Leveraging MIS, we learn M2M, an\nautoregressive model for many-to-many generation, where each image is modeled\nwithin a diffusion framework. Throughout training on the synthetic MIS, the\nmodel excels in capturing style and content from preceding images - synthetic\nor real - and generates novel images following the captured patterns.\nFurthermore, through task-specific fine-tuning, our model demonstrates its\nadaptability to various multi-image generation tasks, including Novel View\nSynthesis and Visual Procedure Generation.\n","authors":["Ying Shen","Yizhe Zhang","Shuangfei Zhai","Lifu Huang","Joshua M. Susskind","Jiatao Gu"],"pdf_url":"https://arxiv.org/pdf/2404.03109v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03097v1","updated":"2024-04-03T22:38:54Z","published":"2024-04-03T22:38:54Z","title":"SalFoM: Dynamic Saliency Prediction with Video Foundation Models","summary":" Recent advancements in video saliency prediction (VSP) have shown promising\nperformance compared to the human visual system, whose emulation is the primary\ngoal of VSP. However, current state-of-the-art models employ spatio-temporal\ntransformers trained on limited amounts of data, hindering generalizability\nadaptation to downstream tasks. The benefits of vision foundation models\npresent a potential solution to improve the VSP process. However, adapting\nimage foundation models to the video domain presents significant challenges in\nmodeling scene dynamics and capturing temporal information. To address these\nchallenges, and as the first initiative to design a VSP model based on video\nfoundation models, we introduce SalFoM, a novel encoder-decoder video\ntransformer architecture. 
Our model employs UnMasked Teacher (UMT) as feature\nextractor and presents a heterogeneous decoder which features a locality-aware\nspatio-temporal transformer and integrates local and global spatio-temporal\ninformation from various perspectives to produce the final saliency map. Our\nqualitative and quantitative experiments on the challenging VSP benchmark\ndatasets of DHF1K, Hollywood-2 and UCF-Sports demonstrate the superiority of\nour proposed model in comparison with the state-of-the-art methods.\n","authors":["Morteza Moradi","Mohammad Moradi","Francesco Rundo","Concetto Spampinato","Ali Borji","Simone Palazzo"],"pdf_url":"https://arxiv.org/pdf/2404.03097v1.pdf","comment":"15 pages, 4 figures"},{"id":"http://arxiv.org/abs/2401.02460v2","updated":"2024-04-03T22:23:25Z","published":"2024-01-04T08:39:13Z","title":"Improved Zero-Shot Classification by Adapting VLMs with Text\n Descriptions","summary":" The zero-shot performance of existing vision-language models (VLMs) such as\nCLIP is limited by the availability of large-scale, aligned image and text\ndatasets in specific domains. In this work, we leverage two complementary\nsources of information -- descriptions of categories generated by large\nlanguage models (LLMs) and abundant, fine-grained image classification datasets\n-- to improve the zero-shot classification performance of VLMs across\nfine-grained domains. On the technical side, we develop methods to train VLMs\nwith this \"bag-level\" image-text supervision. We find that simply using these\nattributes at test-time does not improve performance, but our training\nstrategy, for example, on the iNaturalist dataset, leads to an average\nimprovement of 4-5% in zero-shot classification accuracy for novel categories\nof birds and flowers. Similar improvements are observed in domains where a\nsubset of the categories was used to fine-tune the model. By prompting LLMs in\nvarious ways, we generate descriptions that capture visual appearance, habitat,\nand geographic regions and pair them with existing attributes such as the\ntaxonomic structure of the categories. We systematically evaluate their ability\nto improve zero-shot categorization in natural domains. Our findings suggest\nthat geographic priors can be just as effective and are complementary to visual\nappearance. Our method also outperforms prior work on prompt-based tuning of\nVLMs. We release the benchmark, consisting of 14 datasets at\nhttps://github.com/cvl-umass/AdaptCLIPZS , which will contribute to future\nresearch in zero-shot recognition.\n","authors":["Oindrila Saha","Grant Van Horn","Subhransu Maji"],"pdf_url":"https://arxiv.org/pdf/2401.02460v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01988v2","updated":"2024-04-03T21:47:52Z","published":"2024-04-02T14:26:18Z","title":"Cooperative Students: Navigating Unsupervised Domain Adaptation in\n Nighttime Object Detection","summary":" Unsupervised Domain Adaptation (UDA) has shown significant advancements in\nobject detection under well-lit conditions; however, its performance degrades\nnotably in low-visibility scenarios, especially at night, posing challenges not\nonly for its adaptability in low signal-to-noise ratio (SNR) conditions but\nalso for the reliability and efficiency of automated vehicles. 
To address this\nproblem, we propose a \\textbf{Co}operative \\textbf{S}tudents (\\textbf{CoS})\nframework that innovatively employs global-local transformations (GLT) and a\nproxy-based target consistency (PTC) mechanism to capture the spatial\nconsistency in day- and night-time scenarios effectively, and thus bridge the\nsignificant domain shift across contexts. Building upon this, we further devise\nan adaptive IoU-informed thresholding (AIT) module to gradually avoid\noverlooking potential true positives and enrich the latent information in the\ntarget domain. Comprehensive experiments show that CoS essentially enhanced UDA\nperformance in low-visibility conditions and surpasses current state-of-the-art\ntechniques, achieving an increase in mAP of 3.0\\%, 1.9\\%, and 2.5\\% on BDD100K,\nSHIFT, and ACDC datasets, respectively. Code is available at\nhttps://github.com/jichengyuan/Cooperitive_Students.\n","authors":["Jicheng Yuan","Anh Le-Tuan","Manfred Hauswirth","Danh Le-Phuoc"],"pdf_url":"https://arxiv.org/pdf/2404.01988v2.pdf","comment":"Code is available at\n https://github.com/jichengyuan/Cooperitive_Students"},{"id":"http://arxiv.org/abs/2404.03070v1","updated":"2024-04-03T21:18:27Z","published":"2024-04-03T21:18:27Z","title":"Behind the Veil: Enhanced Indoor 3D Scene Reconstruction with Occluded\n Surfaces Completion","summary":" In this paper, we present a novel indoor 3D reconstruction method with\noccluded surface completion, given a sequence of depth readings. Prior\nstate-of-the-art (SOTA) methods only focus on the reconstruction of the visible\nareas in a scene, neglecting the invisible areas due to the occlusions, e.g.,\nthe contact surface between furniture, occluded wall and floor. Our method\ntackles the task of completing the occluded scene surfaces, resulting in a\ncomplete 3D scene mesh. The core idea of our method is learning 3D geometry\nprior from various complete scenes to infer the occluded geometry of an unseen\nscene from solely depth measurements. We design a coarse-fine hierarchical\noctree representation coupled with a dual-decoder architecture, i.e.,\nGeo-decoder and 3D Inpainter, which jointly reconstructs the complete 3D scene\ngeometry. The Geo-decoder with detailed representation at fine levels is\noptimized online for each scene to reconstruct visible surfaces. The 3D\nInpainter with abstract representation at coarse levels is trained offline\nusing various scenes to complete occluded surfaces. As a result, while the\nGeo-decoder is specialized for an individual scene, the 3D Inpainter can be\ngenerally applied across different scenes. We evaluate the proposed method on\nthe 3D Completed Room Scene (3D-CRS) and iTHOR datasets, significantly\noutperforming the SOTA methods by a gain of 16.8% and 24.2% in terms of the\ncompleteness of 3D reconstruction. 3D-CRS dataset including a complete 3D mesh\nof each scene is provided at project webpage.\n","authors":["Su Sun","Cheng Zhao","Yuliang Guo","Ruoyu Wang","Xinyu Huang","Yingjie Victor Chen","Liu Ren"],"pdf_url":"https://arxiv.org/pdf/2404.03070v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03067v1","updated":"2024-04-03T21:16:19Z","published":"2024-04-03T21:16:19Z","title":"Self-supervised 6-DoF Robot Grasping by Demonstration via Augmented\n Reality Teleoperation System","summary":" Most existing 6-DoF robot grasping solutions depend on strong supervision on\ngrasp pose to ensure satisfactory performance, which could be laborious and\nimpractical when the robot works in some restricted area. 
To this end, we\npropose a self-supervised 6-DoF grasp pose detection framework via an Augmented\nReality (AR) teleoperation system that can efficiently learn human\ndemonstrations and provide 6-DoF grasp poses without grasp pose annotations.\nSpecifically, the system collects the human demonstration from the AR\nenvironment and contrastively learns the grasping strategy from the\ndemonstration. For the real-world experiment, the proposed system leads to\nsatisfactory grasping abilities and learning to grasp unknown objects within\nthree demonstrations.\n","authors":["Xiwen Dengxiong","Xueting Wang","Shi Bai","Yunbo Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.03067v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03043v1","updated":"2024-04-03T20:05:00Z","published":"2024-04-03T20:05:00Z","title":"Linear Anchored Gaussian Mixture Model for Location and Width\n Computation of Objects in Thick Line Shape","summary":" An accurate detection of the centerlines of linear objects is a challenging\ntopic in many sensitive real-world applications such X-ray imaging, remote\nsensing and lane marking detection in road traffic. Model-based approaches\nusing Hough and Radon transforms are often used but, are not recommended for\nthick line detection, whereas approaches based on image derivatives need\nfurther step-by-step processing, making their efficiency dependent on each step\noutcomes. In this paper, we aim to detect linear structures found in images by\nconsidering the 3D representation of the image gray levels as a finite mixture\nmodel of statistical distribution. The latter, which we named linear anchored\nGaussian distribution could be parametrized by a scale value {\\sigma}\ndescribing the linear structure thickness and a line equation, parametrized, in\nturn, by a radius \\r{ho} and an orientation angle {\\theta}, describing the\nlinear structure centerline location. Expectation-Maximization (EM) algorithm\nis used for the mixture model parameter estimation, where a new paradigm, using\nthe background subtraction for the likelihood function computation, is\nproposed. For the EM algorithm, two {\\theta} parameter initialization schemes\nare used: the first one is based on a random choice of the first component of\n{\\theta} vector, whereas the second is based on the image Hessian with a\nsimultaneous computation of the mixture model components number. Experiments on\nreal world images and synthetic images corrupted by blur and additive noise\nshow the good performance of the proposed methods, where the algorithm using\nbackground subtraction and Hessian-based {\\theta} initialization provides an\noutstanding accuracy of the linear structure detection despite irregular image\nbackground and presence of blur and noise.\n","authors":["Nafaa Nacereddine","Djemel Ziou","Aicha Baya Goumeidane"],"pdf_url":"https://arxiv.org/pdf/2404.03043v1.pdf","comment":"13 pages, 13 figures"},{"id":"http://arxiv.org/abs/2311.05698v3","updated":"2024-04-03T20:04:49Z","published":"2023-11-09T19:15:12Z","title":"Mirasol3B: A Multimodal Autoregressive model for time-aligned and\n contextual modalities","summary":" One of the main challenges of multimodal learning is the need to combine\nheterogeneous modalities (e.g., video, audio, text). For example, video and\naudio are obtained at much higher rates than text and are roughly aligned in\ntime. They are often not synchronized with text, which comes as a global\ncontext, e.g., a title, or a description. 
Furthermore, video and audio inputs\nare of much larger volumes, and grow as the video length increases, which\nnaturally requires more compute dedicated to these modalities and makes\nmodeling of long-range dependencies harder.\n We here decouple the multimodal modeling, dividing it into separate, focused\nautoregressive models, processing the inputs according to the characteristics\nof the modalities. We propose a multimodal model, called Mirasol3B, consisting\nof an autoregressive component for the time-synchronized modalities (audio and\nvideo), and an autoregressive component for the context modalities which are\nnot necessarily aligned in time but are still sequential. To address the\nlong-sequences of the video-audio inputs, we propose to further partition the\nvideo and audio sequences in consecutive snippets and autoregressively process\ntheir representations. To that end, we propose a Combiner mechanism, which\nmodels the audio-video information jointly within a timeframe. The Combiner\nlearns to extract audio and video features from raw spatio-temporal signals,\nand then learns to fuse these features producing compact but expressive\nrepresentations per snippet.\n Our approach achieves the state-of-the-art on well established multimodal\nbenchmarks, outperforming much larger models. It effectively addresses the high\ncomputational demand of media inputs by both learning compact representations,\ncontrolling the sequence length of the audio-video feature representations, and\nmodeling their dependencies in time.\n","authors":["AJ Piergiovanni","Isaac Noble","Dahun Kim","Michael S. Ryoo","Victor Gomes","Anelia Angelova"],"pdf_url":"https://arxiv.org/pdf/2311.05698v3.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.03042v1","updated":"2024-04-03T20:04:44Z","published":"2024-04-03T20:04:44Z","title":"AWOL: Analysis WithOut synthesis using Language","summary":" Many classical parametric 3D shape models exist, but creating novel shapes\nwith such models requires expert knowledge of their parameters. For example,\nimagine creating a specific type of tree using procedural graphics or a new\nkind of animal from a statistical shape model. Our key idea is to leverage\nlanguage to control such existing models to produce novel shapes. This involves\nlearning a mapping between the latent space of a vision-language model and the\nparameter space of the 3D model, which we do using a small set of shape and\ntext pairs. Our hypothesis is that mapping from language to parameters allows\nus to generate parameters for objects that were never seen during training. If\nthe mapping between language and parameters is sufficiently smooth, then\ninterpolation or generalization in language should translate appropriately into\nnovel 3D shapes. We test our approach with two very different types of\nparametric shape models (quadrupeds and arboreal trees). We use a learned\nstatistical shape model of quadrupeds and show that we can use text to generate\nnew animals not present during training. In particular, we demonstrate\nstate-of-the-art shape estimation of 3D dogs. This work also constitutes the\nfirst language-driven method for generating 3D trees. Finally, embedding images\nin the CLIP latent space enables us to generate animals and trees directly from\nimages.\n","authors":["Silvia Zuffi","Michael J. 
Black"],"pdf_url":"https://arxiv.org/pdf/2404.03042v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02132v2","updated":"2024-04-03T19:45:02Z","published":"2024-04-02T17:40:29Z","title":"ViTamin: Designing Scalable Vision Models in the Vision-Language Era","summary":" Recent breakthroughs in vision-language models (VLMs) start a new page in the\nvision community. The VLMs provide stronger and more generalizable feature\nembeddings compared to those from ImageNet-pretrained models, thanks to the\ntraining on the large-scale Internet image-text pairs. However, despite the\namazing achievement from the VLMs, vanilla Vision Transformers (ViTs) remain\nthe default choice for the image encoder. Although pure transformer proves its\neffectiveness in the text encoding area, it remains questionable whether it is\nalso the case for image encoding, especially considering that various types of\nnetworks are proposed on the ImageNet benchmark, which, unfortunately, are\nrarely studied in VLMs. Due to small data/model scale, the original conclusions\nof model design on ImageNet can be limited and biased. In this paper, we aim at\nbuilding an evaluation protocol of vision models in the vision-language era\nunder the contrastive language-image pretraining (CLIP) framework. We provide a\ncomprehensive way to benchmark different vision models, covering their\nzero-shot performance and scalability in both model and training data sizes. To\nthis end, we introduce ViTamin, a new vision models tailored for VLMs.\nViTamin-L significantly outperforms ViT-L by 2.0% ImageNet zero-shot accuracy,\nwhen using the same publicly available DataComp-1B dataset and the same\nOpenCLIP training scheme. ViTamin-L presents promising results on 60 diverse\nbenchmarks, including classification, retrieval, open-vocabulary detection and\nsegmentation, and large multi-modal models. When further scaling up the model\nsize, our ViTamin-XL with only 436M parameters attains 82.9% ImageNet zero-shot\naccuracy, surpassing 82.0% achieved by EVA-E that has ten times more parameters\n(4.4B).\n","authors":["Jieneng Chen","Qihang Yu","Xiaohui Shen","Alan Yuille","Liang-Chieh Chen"],"pdf_url":"https://arxiv.org/pdf/2404.02132v2.pdf","comment":"CVPR 2024; https://github.com/Beckschen/ViTamin"},{"id":"http://arxiv.org/abs/2404.03022v1","updated":"2024-04-03T19:17:43Z","published":"2024-04-03T19:17:43Z","title":"BCAmirs at SemEval-2024 Task 4: Beyond Words: A Multimodal and\n Multilingual Exploration of Persuasion in Memes","summary":" Memes, combining text and images, frequently use metaphors to convey\npersuasive messages, shaping public opinion. Motivated by this, our team\nengaged in SemEval-2024 Task 4, a hierarchical multi-label classification task\ndesigned to identify rhetorical and psychological persuasion techniques\nembedded within memes. To tackle this problem, we introduced a caption\ngeneration step to assess the modality gap and the impact of additional\nsemantic information from images, which improved our result. Our best model\nutilizes GPT-4 generated captions alongside meme text to fine-tune RoBERTa as\nthe text encoder and CLIP as the image encoder. It outperforms the baseline by\na large margin in all 12 subtasks. In particular, it ranked in top-3 across all\nlanguages in Subtask 2a, and top-4 in Subtask 2b, demonstrating quantitatively\nstrong performance. The improvement achieved by the introduced intermediate\nstep is likely attributable to the metaphorical essence of images that\nchallenges visual encoders. 
This highlights the potential for improving\nabstract visual semantics encoding.\n","authors":["Amirhossein Abaskohi","Amirhossein Dabiriaghdam","Lele Wang","Giuseppe Carenini"],"pdf_url":"https://arxiv.org/pdf/2404.03022v1.pdf","comment":"11 pages, 5 tables, 2 figures, Proceedings of the 18th International\n Workshop on Semantic Evaluation (SemEval-2024) @ NAACL 2024"},{"id":"http://arxiv.org/abs/2306.08103v4","updated":"2024-04-03T19:16:02Z","published":"2023-06-13T19:48:56Z","title":"Generating Images with 3D Annotations Using Diffusion Models","summary":" Diffusion models have emerged as a powerful generative method, capable of\nproducing stunning photo-realistic images from natural language descriptions.\nHowever, these models lack explicit control over the 3D structure in the\ngenerated images. Consequently, this hinders our ability to obtain detailed 3D\nannotations for the generated images or to craft instances with specific poses\nand distances. In this paper, we propose 3D Diffusion Style Transfer (3D-DST),\nwhich incorporates 3D geometry control into diffusion models. Our method\nexploits ControlNet, which extends diffusion models by using visual prompts in\naddition to text prompts. We generate images of the 3D objects taken from 3D\nshape repositories (e.g., ShapeNet and Objaverse), render them from a variety\nof poses and viewing directions, compute the edge maps of the rendered images,\nand use these edge maps as visual prompts to generate realistic images. With\nexplicit 3D geometry control, we can easily change the 3D structures of the\nobjects in the generated images and obtain ground-truth 3D annotations\nautomatically. This allows us to improve a wide range of vision tasks, e.g.,\nclassification and 3D pose estimation, in both in-distribution (ID) and\nout-of-distribution (OOD) settings. We demonstrate the effectiveness of our\nmethod through extensive experiments on ImageNet-100/200, ImageNet-R,\nPASCAL3D+, ObjectNet3D, and OOD-CV. The results show that our method\nsignificantly outperforms existing methods, e.g., 3.8 percentage points on\nImageNet-100 using DeiT-B.\n","authors":["Wufei Ma","Qihao Liu","Jiahao Wang","Angtian Wang","Xiaoding Yuan","Yi Zhang","Zihao Xiao","Guofeng Zhang","Beijia Lu","Ruxiao Duan","Yongrui Qi","Adam Kortylewski","Yaoyao Liu","Alan Yuille"],"pdf_url":"https://arxiv.org/pdf/2306.08103v4.pdf","comment":"ICLR 2024 Spotlight. Code: https://ccvl.jhu.edu/3D-DST/"},{"id":"http://arxiv.org/abs/2404.03015v1","updated":"2024-04-03T18:54:27Z","published":"2024-04-03T18:54:27Z","title":"DPFT: Dual Perspective Fusion Transformer for Camera-Radar-based Object\n Detection","summary":" The perception of autonomous vehicles has to be efficient, robust, and\ncost-effective. However, cameras are not robust against severe weather\nconditions, lidar sensors are expensive, and the performance of radar-based\nperception is still inferior to the others. Camera-radar fusion methods have\nbeen proposed to address this issue, but these are constrained by the typical\nsparsity of radar point clouds and often designed for radars without elevation\ninformation. 
We propose a novel camera-radar fusion approach called Dual\nPerspective Fusion Transformer (DPFT), designed to overcome these limitations.\nOur method leverages lower-level radar data (the radar cube) instead of the\nprocessed point clouds to preserve as much information as possible and employs\nprojections in both the camera and ground planes to effectively use radars with\nelevation information and simplify the fusion with camera data. As a result,\nDPFT has demonstrated state-of-the-art performance on the K-Radar dataset while\nshowing remarkable robustness against adverse weather conditions and\nmaintaining a low inference time. The code is made available as open-source\nsoftware under https://github.com/TUMFTM/DPFT.\n","authors":["Felix Fent","Andras Palffy","Holger Caesar"],"pdf_url":"https://arxiv.org/pdf/2404.03015v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03010v1","updated":"2024-04-03T18:42:19Z","published":"2024-04-03T18:42:19Z","title":"Skeleton Recall Loss for Connectivity Conserving and Resource Efficient\n Segmentation of Thin Tubular Structures","summary":" Accurately segmenting thin tubular structures, such as vessels, nerves, roads\nor concrete cracks, is a crucial task in computer vision. Standard deep\nlearning-based segmentation loss functions, such as Dice or Cross-Entropy,\nfocus on volumetric overlap, often at the expense of preserving structural\nconnectivity or topology. This can lead to segmentation errors that adversely\naffect downstream tasks, including flow calculation, navigation, and structural\ninspection. Although current topology-focused losses mark an improvement, they\nintroduce significant computational and memory overheads. This is particularly\nrelevant for 3D data, rendering these losses infeasible for larger volumes as\nwell as increasingly important multi-class segmentation problems. To mitigate\nthis, we propose a novel Skeleton Recall Loss, which effectively addresses\nthese challenges by circumventing intensive GPU-based calculations with\ninexpensive CPU operations. It demonstrates overall superior performance to\ncurrent state-of-the-art approaches on five public datasets for\ntopology-preserving segmentation, while substantially reducing computational\noverheads by more than 90%. In doing so, we introduce the first multi-class\ncapable loss function for thin structure segmentation, excelling in both\nefficiency and efficacy for topology-preservation.\n","authors":["Yannick Kirchhoff","Maximilian R. Rokuss","Saikat Roy","Balint Kovacs","Constantin Ulrich","Tassilo Wald","Maximilian Zenk","Philipp Vollmuth","Jens Kleesiek","Fabian Isensee","Klaus Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2404.03010v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02999v1","updated":"2024-04-03T18:40:48Z","published":"2024-04-03T18:40:48Z","title":"MeshBrush: Painting the Anatomical Mesh with Neural Stylization for\n Endoscopy","summary":" Style transfer is a promising approach to close the sim-to-real gap in\nmedical endoscopy. Rendering realistic endoscopic videos by traversing\npre-operative scans (such as MRI or CT) can generate realistic simulations as\nwell as ground truth camera poses and depth maps. Although image-to-image (I2I)\ntranslation models such as CycleGAN perform well, they are unsuitable for\nvideo-to-video synthesis due to the lack of temporal consistency, resulting in\nartifacts between frames. 
We propose MeshBrush, a neural mesh stylization\nmethod to synthesize temporally consistent videos with differentiable\nrendering. MeshBrush uses the underlying geometry of patient imaging data while\nleveraging existing I2I methods. With learned per-vertex textures, the stylized\nmesh guarantees consistency while producing high-fidelity outputs. We\ndemonstrate that mesh stylization is a promising approach for creating\nrealistic simulations for downstream tasks such as training and preoperative\nplanning. Although our method is tested and designed for ureteroscopy, its\ncomponents are transferable to general endoscopic and laparoscopic procedures.\n","authors":["John J. Han","Ayberk Acar","Nicholas Kavoussi","Jie Ying Wu"],"pdf_url":"https://arxiv.org/pdf/2404.02999v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.02990v1","updated":"2024-04-03T18:20:41Z","published":"2024-04-03T18:20:41Z","title":"ASAP: Interpretable Analysis and Summarization of AI-generated Image\n Patterns at Scale","summary":" Generative image models have emerged as a promising technology to produce\nrealistic images. Despite potential benefits, concerns grow about its misuse,\nparticularly in generating deceptive images that could raise significant\nethical, legal, and societal issues. Consequently, there is growing demand to\nempower users to effectively discern and comprehend patterns of AI-generated\nimages. To this end, we developed ASAP, an interactive visualization system\nthat automatically extracts distinct patterns of AI-generated images and allows\nusers to interactively explore them via various views. To uncover fake\npatterns, ASAP introduces a novel image encoder, adapted from CLIP, which\ntransforms images into compact \"distilled\" representations, enriched with\ninformation for differentiating authentic and fake images. These\nrepresentations generate gradients that propagate back to the attention maps of\nCLIP's transformer block. This process quantifies the relative importance of\neach pixel to image authenticity or fakeness, exposing key deceptive patterns.\nASAP enables the at scale interactive analysis of these patterns through\nmultiple, coordinated visualizations. This includes a representation overview\nwith innovative cell glyphs to aid in the exploration and qualitative\nevaluation of fake patterns across a vast array of images, as well as a pattern\nview that displays authenticity-indicating patterns in images and quantifies\ntheir impact. ASAP supports the analysis of cutting-edge generative models with\nthe latest architectures, including GAN-based models like proGAN and diffusion\nmodels like the latent diffusion model. We demonstrate ASAP's usefulness\nthrough two usage scenarios using multiple fake image detection benchmark\ndatasets, revealing its ability to identify and understand hidden patterns in\nAI-generated images, especially in detecting fake human faces produced by\ndiffusion-based techniques.\n","authors":["Jinbin Huang","Chen Chen","Aditi Mishra","Bum Chul Kwon","Zhicheng Liu","Chris Bryan"],"pdf_url":"https://arxiv.org/pdf/2404.02990v1.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2312.01734v2","updated":"2024-04-03T18:11:54Z","published":"2023-12-04T08:55:46Z","title":"Effective Adapter for Face Recognition in the Wild","summary":" In this paper, we tackle the challenge of face recognition in the wild, where\nimages often suffer from low quality and real-world distortions. 
Traditional\nheuristic approaches-either training models directly on these degraded images\nor their enhanced counterparts using face restoration techniques-have proven\nineffective, primarily due to the degradation of facial features and the\ndiscrepancy in image domains. To overcome these issues, we propose an effective\nadapter for augmenting existing face recognition models trained on high-quality\nfacial datasets. The key of our adapter is to process both the unrefined and\nenhanced images using two similar structures, one fixed and the other\ntrainable. Such design can confer two benefits. First, the dual-input system\nminimizes the domain gap while providing varied perspectives for the face\nrecognition model, where the enhanced image can be regarded as a complex\nnon-linear transformation of the original one by the restoration model. Second,\nboth two similar structures can be initialized by the pre-trained models\nwithout dropping the past knowledge. The extensive experiments in zero-shot\nsettings show the effectiveness of our method by surpassing baselines of about\n3%, 4%, and 7% in three datasets. Our code will be publicly available.\n","authors":["Yunhao Liu","Yu-Ju Tsai","Kelvin C. K. Chan","Xiangtai Li","Lu Qi","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2312.01734v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02973v1","updated":"2024-04-03T18:00:36Z","published":"2024-04-03T18:00:36Z","title":"Scaling Laws for Galaxy Images","summary":" We present the first systematic investigation of supervised scaling laws\noutside of an ImageNet-like context - on images of galaxies. We use 840k galaxy\nimages and over 100M annotations by Galaxy Zoo volunteers, comparable in scale\nto Imagenet-1K. We find that adding annotated galaxy images provides a power\nlaw improvement in performance across all architectures and all tasks, while\nadding trainable parameters is effective only for some (typically more\nsubjectively challenging) tasks. We then compare the downstream performance of\nfinetuned models pretrained on either ImageNet-12k alone vs. additionally\npretrained on our galaxy images. We achieve an average relative error rate\nreduction of 31% across 5 downstream tasks of scientific interest. Our\nfinetuned models are more label-efficient and, unlike their\nImageNet-12k-pretrained equivalents, often achieve linear transfer performance\nequal to that of end-to-end finetuning. We find relatively modest additional\ndownstream benefits from scaling model size, implying that scaling alone is not\nsufficient to address our domain gap, and suggest that practitioners with\nqualitatively different images might benefit more from in-domain adaption\nfollowed by targeted downstream labelling.\n","authors":["Mike Walmsley","Micah Bowles","Anna M. M. Scaife","Jason Shingirai Makechemu","Alexander J. Gordon","Annette M. N. Ferguson","Robert G. Mann","James Pearson","Jürgen J. Popp","Jo Bovy","Josh Speagle","Hugh Dickinson","Lucy Fortson","Tobias Géron","Sandor Kruk","Chris J. Lintott","Kameswara Mantha","Devina Mohan","David O'Ryan","Inigo V. Slijepevic"],"pdf_url":"https://arxiv.org/pdf/2404.02973v1.pdf","comment":"10+6 pages, 12 figures. Appendix C2 based on arxiv:2206.11927. 
Code,\n demos, documentation at https://github.com/mwalmsley/zoobot"}]},"2024-04-04T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2404.03634v1","updated":"2024-04-04T17:54:12Z","published":"2024-04-04T17:54:12Z","title":"PreAfford: Universal Affordance-Based Pre-Grasping for Diverse Objects\n and Environments","summary":" Robotic manipulation of ungraspable objects with two-finger grippers presents\nsignificant challenges due to the paucity of graspable features, while\ntraditional pre-grasping techniques, which rely on repositioning objects and\nleveraging external aids like table edges, lack the adaptability across object\ncategories and scenes. Addressing this, we introduce PreAfford, a novel\npre-grasping planning framework that utilizes a point-level affordance\nrepresentation and a relay training approach to enhance adaptability across a\nbroad range of environments and object types, including those previously\nunseen. Demonstrated on the ShapeNet-v2 dataset, PreAfford significantly\nimproves grasping success rates by 69% and validates its practicality through\nreal-world experiments. This work offers a robust and adaptable solution for\nmanipulating ungraspable objects.\n","authors":["Kairui Ding","Boyuan Chen","Ruihai Wu","Yuyang Li","Zongzheng Zhang","Huan-ang Gao","Siqi Li","Yixin Zhu","Guyue Zhou","Hao Dong","Hao Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.03634v1.pdf","comment":"Project Page: https://air-discover.github.io/PreAfford/"},{"id":"http://arxiv.org/abs/2404.03629v1","updated":"2024-04-04T17:49:38Z","published":"2024-04-04T17:49:38Z","title":"ROBUST: 221 Bugs in the Robot Operating System","summary":" As robotic systems such as autonomous cars and delivery drones assume greater\nroles and responsibilities within society, the likelihood and impact of\ncatastrophic software failure within those systems is increased.To aid\nresearchers in the development of new methods to measure and assure the safety\nand quality of robotics software, we systematically curated a dataset of 221\nbugs across 7 popular and diverse software systems implemented via the Robot\nOperating System (ROS). We produce historically accurate recreations of each of\nthe 221 defective software versions in the form of Docker images, and use a\ngrounded theory approach to examine and categorize their corresponding faults,\nfailures, and fixes. Finally, we reflect on the implications of our findings\nand outline future research directions for the community.\n","authors":["Christopher S. Timperley","Gijs van der Hoorn","André Santos","Harshavardhan Deshpande","Andrzej Wąsowski"],"pdf_url":"https://arxiv.org/pdf/2404.03629v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03587v1","updated":"2024-04-04T16:52:48Z","published":"2024-04-04T16:52:48Z","title":"Anticipate & Collab: Data-driven Task Anticipation and Knowledge-driven\n Planning for Human-robot Collaboration","summary":" An agent assisting humans in daily living activities can collaborate more\neffectively by anticipating upcoming tasks. Data-driven methods represent the\nstate of the art in task anticipation, planning, and related problems, but\nthese methods are resource-hungry and opaque. Our prior work introduced a proof\nof concept framework that used an LLM to anticipate 3 high-level tasks that\nserved as goals for a classical planning system that computed a sequence of\nlow-level actions for the agent to achieve these goals. 
This paper describes\nDaTAPlan, our framework that significantly extends our prior work toward\nhuman-robot collaboration. Specifically, DaTAPlan's planner computes actions for\nan agent and a human to collaboratively and jointly achieve the tasks\nanticipated by the LLM, and the agent automatically adapts to unexpected\nchanges in human action outcomes and preferences. We evaluate DaTAPlan's\ncapabilities in a realistic simulation environment, demonstrating accurate task\nanticipation, effective human-robot collaboration, and the ability to adapt to\nunexpected changes. Project website: https://dataplan-hrc.github.io\n","authors":["Shivam Singh","Karthik Swaminathan","Raghav Arora","Ramandeep Singh","Ahana Datta","Dipanjan Das","Snehasis Banerjee","Mohan Sridharan","Madhava Krishna"],"pdf_url":"https://arxiv.org/pdf/2404.03587v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03570v1","updated":"2024-04-04T16:30:20Z","published":"2024-04-04T16:30:20Z","title":"Embodied AI with Two Arms: Zero-shot Learning, Safety and Modularity","summary":" We present an embodied AI system which receives open-ended natural language\ninstructions from a human, and controls two arms to collaboratively accomplish\npotentially long-horizon tasks over a large workspace. Our system is modular:\nit deploys state of the art Large Language Models for task\nplanning, Vision-Language models for semantic perception, and Point Cloud\ntransformers for grasping. With semantic and physical safety in mind, these\nmodules are interfaced with a real-time trajectory optimizer and a compliant\ntracking controller to enable human-robot proximity. We demonstrate performance\nfor the following tasks: bi-arm sorting, bottle opening, and trash disposal\ntasks. These are done zero-shot where the models used have not been trained\nwith any real world data from this bi-arm robot, scenes or workspace. Composing\nboth learning- and non-learning-based components in a modular fashion with\ninterpretable inputs and outputs allows the user to easily debug points of\nfailures and fragilities. One may also in-place swap modules to improve the\nrobustness of the overall platform, for instance with imitation-learned\npolicies.\n","authors":["Jake Varley","Sumeet Singh","Deepali Jain","Krzysztof Choromanski","Andy Zeng","Somnath Basu Roy Chowdhury","Avinava Dubey","Vikas Sindhwani"],"pdf_url":"https://arxiv.org/pdf/2404.03570v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03567v1","updated":"2024-04-04T16:25:23Z","published":"2024-04-04T16:25:23Z","title":"Factored Task and Motion Planning with Combined Optimization, Sampling\n and Learning","summary":" In this thesis, we aim to improve the performance of TAMP algorithms from\nthree complementary perspectives. First, we investigate the integration of\ndiscrete task planning with continuous trajectory optimization. Our main\ncontribution is a conflict-based solver that automatically discovers why a task\nplan might fail when considering the constraints of the physical world. This\ninformation is then fed back into the task planner, resulting in an efficient,\nbidirectional, and intuitive interface between task and motion, capable of\nsolving TAMP problems with multiple objects, robots, and tight physical\nconstraints. In the second part, we first illustrate that, given the wide range\nof tasks and environments within TAMP, neither sampling nor optimization is\nsuperior in all settings. 
To combine the strengths of both approaches, we have\ndesigned meta-solvers for TAMP, adaptive solvers that automatically select\nwhich algorithms and computations to use and how to best decompose each problem\nto find a solution faster. In the third part, we combine deep learning\narchitectures with model-based reasoning to accelerate computations within our\nTAMP solver. Specifically, we target infeasibility detection and nonlinear\noptimization, focusing on generalization, accuracy, compute time, and data\nefficiency. At the core of our contributions is a refined, factored\nrepresentation of the trajectory optimization problems inside TAMP. This\nstructure not only facilitates more efficient planning, encoding of geometric\ninfeasibility, and meta-reasoning but also provides better generalization in\nneural architectures.\n","authors":["Joaquim Ortiz-Haro"],"pdf_url":"https://arxiv.org/pdf/2404.03567v1.pdf","comment":"PhD Thesis, TU Berlin"},{"id":"http://arxiv.org/abs/2404.03556v1","updated":"2024-04-04T16:07:21Z","published":"2024-04-04T16:07:21Z","title":"Robot Safety Monitoring using Programmable Light Curtains","summary":" As factories continue to evolve into collaborative spaces with multiple\nrobots working together with human supervisors in the loop, ensuring safety for\nall actors involved becomes critical. Currently, laser-based light curtain\nsensors are widely used in factories for safety monitoring. While these\nconventional safety sensors meet high accuracy standards, they are difficult to\nreconfigure and can only monitor a fixed user-defined region of space.\nFurthermore, they are typically expensive. Instead, we leverage a controllable\ndepth sensor, programmable light curtains (PLC), to develop an inexpensive and\nflexible real-time safety monitoring system for collaborative robot workspaces.\nOur system projects virtual dynamic safety envelopes that tightly envelop the\nmoving robot at all times and detect any objects that intrude the envelope.\nFurthermore, we develop an instrumentation algorithm that optimally places\n(multiple) PLCs in a workspace to maximize the visibility coverage of robots.\nOur work enables fence-less human-robot collaboration, while scaling to monitor\nmultiple robots with few sensors. We analyze our system in a real manufacturing\ntestbed with four robot arms and demonstrate its capabilities as a fast,\naccurate, and inexpensive safety monitoring solution.\n","authors":["Karnik Ram","Shobhit Aggarwal","Robert Tamburo","Siddharth Ancha","Srinivasa Narasimhan"],"pdf_url":"https://arxiv.org/pdf/2404.03556v1.pdf","comment":"Under review for IROS '24. Webpage\n http://cmu-mfi.github.io/plc-safety"},{"id":"http://arxiv.org/abs/2404.03498v1","updated":"2024-04-04T14:56:41Z","published":"2024-04-04T14:56:41Z","title":"Integrating Large Language Models with Multimodal Virtual Reality\n Interfaces to Support Collaborative Human-Robot Construction Work","summary":" In the construction industry, where work environments are complex,\nunstructured and often dangerous, the implementation of Human-Robot\nCollaboration (HRC) is emerging as a promising advancement. This underlines the\ncritical need for intuitive communication interfaces that enable construction\nworkers to collaborate seamlessly with robotic assistants. This study\nintroduces a conversational Virtual Reality (VR) interface integrating\nmultimodal interaction to enhance intuitive communication between construction\nworkers and robots. 
By integrating voice and controller inputs with the Robot\nOperating System (ROS), Building Information Modeling (BIM), and a game engine\nfeaturing a chat interface powered by a Large Language Model (LLM), the\nproposed system enables intuitive and precise interaction within a VR setting.\nEvaluated by twelve construction workers through a drywall installation case\nstudy, the proposed system demonstrated its low workload and high usability\nwith succinct command inputs. The proposed multimodal interaction system\nsuggests that such technological integration can substantially advance the\nintegration of robotic assistants in the construction industry.\n","authors":["Somin Park","Carol C. Menassa","Vineet R. Kamat"],"pdf_url":"https://arxiv.org/pdf/2404.03498v1.pdf","comment":"39 pages, 16 figures, 5 tables"},{"id":"http://arxiv.org/abs/2404.03493v1","updated":"2024-04-04T14:48:26Z","published":"2024-04-04T14:48:26Z","title":"A Methodology to Study the Impact of Spiking Neural Network Parameters\n considering Event-Based Automotive Data","summary":" Autonomous Driving (AD) systems are considered as the future of human\nmobility and transportation. Solving computer vision tasks such as image\nclassification and object detection/segmentation, with high accuracy and low\npower/energy consumption, is highly needed to realize AD systems in real life.\nThese requirements can potentially be satisfied by Spiking Neural Networks\n(SNNs). However, the state-of-the-art works in SNN-based AD systems still focus\non proposing network models that can achieve high accuracy, and they have not\nsystematically studied the roles of SNN parameters when used for learning\nevent-based automotive data. Therefore, we still lack understanding of how to\neffectively develop SNN models for AD systems. Toward this, we propose a novel\nmethodology to systematically study and analyze the impact of SNN parameters\nconsidering event-based automotive data, then leverage this analysis for\nenhancing SNN developments. To do this, we first explore different settings of\nSNN parameters that directly affect the learning mechanism (i.e., batch size,\nlearning rate, neuron threshold potential, and weight decay), then analyze the\naccuracy results. Afterward, we propose techniques that jointly improve SNN\naccuracy and reduce training time. Experimental results show that our\nmethodology can improve the SNN models for AD systems than the\nstate-of-the-art, as it achieves higher accuracy (i.e., 86%) for the NCARS\ndataset, and it can also achieve iso-accuracy (i.e., ~85% with standard\ndeviation less than 0.5%) while speeding up the training time by 1.9x. In this\nmanner, our research work provides a set of guidelines for SNN parameter\nenhancements, thereby enabling the practical developments of SNN-based AD\nsystems.\n","authors":["Iqra Bano","Rachmad Vidya Wicaksana Putra","Alberto Marchisio","Muhammad Shafique"],"pdf_url":"https://arxiv.org/pdf/2404.03493v1.pdf","comment":"7 pages, 13 figures, 1 table"},{"id":"http://arxiv.org/abs/2404.03489v1","updated":"2024-04-04T14:43:43Z","published":"2024-04-04T14:43:43Z","title":"Design of Stickbug: a Six-Armed Precision Pollination Robot","summary":" This work presents the design of Stickbug, a six-armed, multi-agent,\nprecision pollination robot that combines the accuracy of single-agent systems\nwith swarm parallelization in greenhouses. 
Precision pollination robots have\noften been proposed to offset the effects of a decreasing population of natural\npollinators, but they frequently lack the required parallelization and\nscalability. Stickbug achieves this by allowing each arm and drive base to act\nas an individual agent, significantly reducing planning complexity. Stickbug\nuses a compact holonomic Kiwi drive to navigate narrow greenhouse rows, a tall\nmast to support multiple manipulators and reach plant heights, a detection\nmodel and classifier to identify Bramble flowers, and a felt-tipped\nend-effector for contact-based pollination. Initial experimental validation\ndemonstrates that Stickbug can attempt over 1.5 pollinations per minute with a\n50% success rate. Additionally, a Bramble flower perception dataset was created\nand is publicly available alongside Stickbug's software and design files.\n","authors":["Trevor Smith","Madhav Rijal","Christopher Tatsch","R. Michael Butts","Jared Beard","R. Tyler Cook","Andy Chu","Jason Gross","Yu Gu"],"pdf_url":"https://arxiv.org/pdf/2404.03489v1.pdf","comment":"7 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.03462v1","updated":"2024-04-04T14:13:56Z","published":"2024-04-04T14:13:56Z","title":"You Only Scan Once: A Dynamic Scene Reconstruction Pipeline for 6-DoF\n Robotic Grasping of Novel Objects","summary":" In the realm of robotic grasping, achieving accurate and reliable\ninteractions with the environment is a pivotal challenge. Traditional methods\nof grasp planning methods utilizing partial point clouds derived from depth\nimage often suffer from reduced scene understanding due to occlusion,\nultimately impeding their grasping accuracy. Furthermore, scene reconstruction\nmethods have primarily relied upon static techniques, which are susceptible to\nenvironment change during manipulation process limits their efficacy in\nreal-time grasping tasks. To address these limitations, this paper introduces a\nnovel two-stage pipeline for dynamic scene reconstruction. In the first stage,\nour approach takes scene scanning as input to register each target object with\nmesh reconstruction and novel object pose tracking. In the second stage, pose\ntracking is still performed to provide object poses in real-time, enabling our\napproach to transform the reconstructed object point clouds back into the\nscene. Unlike conventional methodologies, which rely on static scene snapshots,\nour method continuously captures the evolving scene geometry, resulting in a\ncomprehensive and up-to-date point cloud representation. By circumventing the\nconstraints posed by occlusion, our method enhances the overall grasp planning\nprocess and empowers state-of-the-art 6-DoF robotic grasping algorithms to\nexhibit markedly improved accuracy.\n","authors":["Lei Zhou","Haozhe Wang","Zhengshen Zhang","Zhiyang Liu","Francis EH Tay","adn Marcelo H. Ang. Jr"],"pdf_url":"https://arxiv.org/pdf/2404.03462v1.pdf","comment":"ICRA 2024"},{"id":"http://arxiv.org/abs/2404.03444v1","updated":"2024-04-04T13:44:41Z","published":"2024-04-04T13:44:41Z","title":"Simultaneous State Estimation and Contact Detection for Legged Robots by\n Multiple-Model Kalman Filtering","summary":" This paper proposes an algorithm for combined contact detection and state\nestimation for legged robots. The proposed algorithm models the robot's\nmovement as a switched system, in which different modes relate to different\nfeet being in contact with the ground. 
The key element in the proposed\nalgorithm is an interacting multiple-model Kalman filter, which identifies the\ncurrently-active mode defining contacts, while estimating the state. The\nrationale for the proposed estimation framework is that contacts (and contact\nforces) impact the robot's state and vice versa. This paper presents validation\nstudies with a quadruped using (i) the high-fidelity simulator Gazebo for a\ncomparison with ground truth values and a baseline estimator, and (ii) hardware\nexperiments with the Unitree A1 robot. The simulation study shows that the\nproposed algorithm outperforms the baseline estimator, which does not\nsimultaneous detect contacts. The hardware experiments showcase the\napplicability of the proposed algorithm and highlights the ability to detect\ncontacts.\n","authors":["Marcel Menner","Karl Berntorp"],"pdf_url":"https://arxiv.org/pdf/2404.03444v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03427v1","updated":"2024-04-04T13:13:47Z","published":"2024-04-04T13:13:47Z","title":"GMMCalib: Extrinsic Calibration of LiDAR Sensors using GMM-based Joint\n Registration","summary":" State-of-the-art LiDAR calibration frameworks mainly use non-probabilistic\nregistration methods such as Iterative Closest Point (ICP) and its variants.\nThese methods suffer from biased results due to their pair-wise registration\nprocedure as well as their sensitivity to initialization and parameterization.\nThis often leads to misalignments in the calibration process. Probabilistic\nregistration methods compensate for these drawbacks by specifically modeling\nthe probabilistic nature of the observations. This paper presents GMMCalib, an\nautomatic target-based extrinsic calibration approach for multi-LiDAR systems.\nUsing an implementation of a Gaussian Mixture Model (GMM)-based registration\nmethod that allows joint registration of multiple point clouds, this\ndata-driven approach is compared to ICP algorithms. We perform simulation\nexperiments using the digital twin of the EDGAR research vehicle and validate\nthe results in a real-world environment. We also address the local minima\nproblem of local registration methods for extrinsic sensor calibration and use\na distance-based metric to evaluate the calibration results. Our results show\nthat an increase in robustness against sensor miscalibrations can be achieved\nby using GMM-based registration algorithms. The code is open source and\navailable on GitHub.\n","authors":["Ilir Tahiraj","Felix Fent","Philipp Hafemann","Egon Ye","Markus Lienkamp"],"pdf_url":"https://arxiv.org/pdf/2404.03427v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03415v1","updated":"2024-04-04T12:49:42Z","published":"2024-04-04T12:49:42Z","title":"Future Predictive Success-or-Failure Classification for Long-Horizon\n Robotic Tasks","summary":" Automating long-horizon tasks with a robotic arm has been a central research\ntopic in robotics. Optimization-based action planning is an efficient approach\nfor creating an action plan to complete a given task. Construction of a\nreliable planning method requires a design process of conditions, e.g., to\navoid collision between objects. The design process, however, has two critical\nissues: 1) iterative trials--the design process is time-consuming due to the\ntrial-and-error process of modifying conditions, and 2) manual redesign--it is\ndifficult to cover all the necessary conditions manually. 
To tackle these\nissues, this paper proposes a future-predictive\nsuccess-or-failure-classification method to obtain conditions automatically.\nThe key idea behind the proposed method is an end-to-end approach for\ndetermining whether the action plan can complete a given task instead of\nmanually redesigning the conditions. The proposed method uses a long-horizon\nfuture-prediction method to enable success-or-failure classification without\nthe execution of an action plan. This paper also proposes a regularization term\ncalled transition consistency regularization to provide easy-to-predict feature\ndistribution. The regularization term improves future prediction and\nclassification performance. The effectiveness of our method is demonstrated\nthrough classification and robotic-manipulation experiments.\n","authors":["Naoya Sogi","Hiroyuki Oyama","Takashi Shibata","Makoto Terao"],"pdf_url":"https://arxiv.org/pdf/2404.03415v1.pdf","comment":"IJCNN 2024"},{"id":"http://arxiv.org/abs/2404.03412v1","updated":"2024-04-04T12:45:49Z","published":"2024-04-04T12:45:49Z","title":"RADIUM: Predicting and Repairing End-to-End Robot Failures using\n Gradient-Accelerated Sampling","summary":" Before autonomous systems can be deployed in safety-critical applications, we\nmust be able to understand and verify the safety of these systems. For cases\nwhere the risk or cost of real-world testing is prohibitive, we propose a\nsimulation-based framework for a) predicting ways in which an autonomous system\nis likely to fail and b) automatically adjusting the system's design and\ncontrol policy to preemptively mitigate those failures. Existing tools for\nfailure prediction struggle to search over high-dimensional environmental\nparameters, cannot efficiently handle end-to-end testing for systems with\nvision in the loop, and provide little guidance on how to mitigate failures\nonce they are discovered. We approach this problem through the lens of\napproximate Bayesian inference and use differentiable simulation and rendering\nfor efficient failure case prediction and repair. For cases where a\ndifferentiable simulator is not available, we provide a gradient-free version\nof our algorithm, and we include a theoretical and empirical evaluation of the\ntrade-offs between gradient-based and gradient-free methods. We apply our\napproach on a range of robotics and control problems, including optimizing\nsearch patterns for robot swarms, UAV formation control, and robust network\ncontrol. Compared to optimization-based falsification methods, our method\npredicts a more diverse, representative set of failure modes, and we find that\nour use of differentiable simulation yields solutions that have up to 10x lower\ncost and requires up to 2x fewer iterations to converge relative to\ngradient-free techniques. In hardware experiments, we find that repairing\ncontrol policies using our method leads to a 5x robustness improvement.\nAccompanying code and video can be found at https://mit-realm.github.io/radium/\n","authors":["Charles Dawson","Anjali Parashar","Chuchu Fan"],"pdf_url":"https://arxiv.org/pdf/2404.03412v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03386v1","updated":"2024-04-04T11:37:55Z","published":"2024-04-04T11:37:55Z","title":"SENSOR: Imitate Third-Person Expert's Behaviors via Active Sensoring","summary":" In many real-world visual Imitation Learning (IL) scenarios, there is a\nmisalignment between the agent's and the expert's perspectives, which might\nlead to the failure of imitation. 
Previous methods have generally solved this\nproblem by domain alignment, which incurs extra computation and storage costs,\nand these methods fail to handle the \\textit{hard cases} where the viewpoint\ngap is too large. To alleviate the above problems, we introduce active\nsensoring in the visual IL setting and propose a model-based SENSory imitatOR\n(SENSOR) to automatically change the agent's perspective to match the expert's.\nSENSOR jointly learns a world model to capture the dynamics of latent states, a\nsensor policy to control the camera, and a motor policy to control the agent.\nExperiments on visual locomotion tasks show that SENSOR can efficiently\nsimulate the expert's perspective and strategy, and outperforms most baseline\nmethods.\n","authors":["Kaichen Huang","Minghao Shao","Shenghua Wan","Hai-Hang Sun","Shuai Feng","Le Gan","De-Chuan Zhan"],"pdf_url":"https://arxiv.org/pdf/2404.03386v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03363v1","updated":"2024-04-04T11:04:44Z","published":"2024-04-04T11:04:44Z","title":"Space Physiology and Technology: Musculoskeletal Adaptations,\n Countermeasures, and the Opportunity for Wearable Robotics","summary":" Space poses significant challenges for human physiology, leading to\nphysiological adaptations in response to an environment vastly different from\nEarth. While these adaptations can be beneficial, they may not fully counteract\nthe adverse impact of space-related stressors. A comprehensive understanding of\nthese physiological adaptations is needed to devise effective countermeasures\nto support human life in space. This review focuses on the impact of the\nenvironment in space on the musculoskeletal system. It highlights the complex\ninterplay between bone and muscle adaptation, the underlying physiological\nmechanisms, and their implications on astronaut health. Furthermore, the review\ndelves into the deployed and current advances in countermeasures and proposes,\nas a perspective for future developments, wearable sensing and robotic\ntechnologies, such as exoskeletons, as a fitting alternative.\n","authors":["Shamas Ul Ebad Khan","Rejin John Varghese","Panagiotis Kassanos","Dario Farina","Etienne Burdet"],"pdf_url":"https://arxiv.org/pdf/2404.03363v1.pdf","comment":"23 pages (including references), 8 figures and 318 references"},{"id":"http://arxiv.org/abs/2404.03336v1","updated":"2024-04-04T10:04:44Z","published":"2024-04-04T10:04:44Z","title":"Scaling Population-Based Reinforcement Learning with GPU Accelerated\n Simulation","summary":" In recent years, deep reinforcement learning (RL) has shown its effectiveness\nin solving complex continuous control tasks like locomotion and dexterous\nmanipulation. However, this comes at the cost of an enormous amount of\nexperience required for training, exacerbated by the sensitivity of learning\nefficiency and the policy performance to hyperparameter selection, which often\nrequires numerous trials of time-consuming experiments. This work introduces a\nPopulation-Based Reinforcement Learning (PBRL) approach that exploits a\nGPU-accelerated physics simulator to enhance the exploration capabilities of RL\nby concurrently training multiple policies in parallel. The PBRL framework is\napplied to three state-of-the-art RL algorithms -- PPO, SAC, and DDPG --\ndynamically adjusting hyperparameters based on the performance of learning\nagents. 
The experiments are performed on four challenging tasks in Isaac Gym --\nAnymal Terrain, Shadow Hand, Humanoid, Franka Nut Pick -- by analyzing the\neffect of population size and mutation mechanisms for hyperparameters. The\nresults show that PBRL agents achieve superior performance, in terms of\ncumulative reward, compared to non-evolutionary baseline agents. The trained\nagents are finally deployed in the real world for a Franka Nut Pick} task,\ndemonstrating successful sim-to-real transfer. Code and videos of the learned\npolicies are available on our project website.\n","authors":["Asad Ali Shahid","Yashraj Narang","Vincenzo Petrone","Enrico Ferrentino","Ankur Handa","Dieter Fox","Marco Pavone","Loris Roveda"],"pdf_url":"https://arxiv.org/pdf/2404.03336v1.pdf","comment":"Submitted for publication to IEEE Robotics and Automation Letters\n (RA-L)"},{"id":"http://arxiv.org/abs/2404.02771v2","updated":"2024-04-04T09:58:37Z","published":"2024-04-03T14:37:00Z","title":"Forming Large Patterns with Local Robots in the OBLOT Model","summary":" In the arbitrary pattern formation problem, $n$ autonomous, mobile robots\nmust form an arbitrary pattern $P \\subseteq \\mathbb{R}^2$. The (deterministic)\nrobots are typically assumed to be indistinguishable, disoriented, and unable\nto communicate. An important distinction is whether robots have memory and/or a\nlimited viewing range. Previous work managed to form $P$ under a natural\nsymmetry condition if robots have no memory but an unlimited viewing range [22]\nor if robots have a limited viewing range but memory [25]. In the latter case,\n$P$ is only formed in a shrunk version that has constant diameter.\n Without memory and with limited viewing range, forming arbitrary patterns\nremains an open problem. We provide a partial solution by showing that $P$ can\nbe formed under the same symmetry condition if the robots' initial diameter is\n$\\leq 1$. Our protocol partitions $P$ into rotation-symmetric components and\nexploits the initial mutual visibility to form one cluster per component. Using\na careful placement of the clusters and their robots, we show that a cluster\ncan move in a coordinated way through its component while drawing $P$ by\ndropping one robot per pattern coordinate.\n","authors":["Christopher Hahn","Jonas Harbig","Peter Kling"],"pdf_url":"https://arxiv.org/pdf/2404.02771v2.pdf","comment":"24 pages, 3 figures, submitted for SAND 2024, version with extended\n appendix"},{"id":"http://arxiv.org/abs/2404.03325v1","updated":"2024-04-04T09:52:22Z","published":"2024-04-04T09:52:22Z","title":"Embodied Neuromorphic Artificial Intelligence for Robotics:\n Perspectives, Challenges, and Research Development Stack","summary":" Robotic technologies have been an indispensable part for improving human\nproductivity since they have been helping humans in completing diverse,\ncomplex, and intensive tasks in a fast yet accurate and efficient way.\nTherefore, robotic technologies have been deployed in a wide range of\napplications, ranging from personal to industrial use-cases. However, current\nrobotic technologies and their computing paradigm still lack embodied\nintelligence to efficiently interact with operational environments, respond\nwith correct/expected actions, and adapt to changes in the environments. 
Toward\nthis, recent advances in neuromorphic computing with Spiking Neural Networks\n(SNN) have demonstrated the potential to enable the embodied intelligence for\nrobotics through bio-plausible computing paradigm that mimics how the\nbiological brain works, known as \"neuromorphic artificial intelligence (AI)\".\nHowever, the field of neuromorphic AI-based robotics is still at an early\nstage, therefore its development and deployment for solving real-world problems\nexpose new challenges in different design aspects, such as accuracy,\nadaptability, efficiency, reliability, and security. To address these\nchallenges, this paper will discuss how we can enable embodied neuromorphic AI\nfor robotic systems through our perspectives: (P1) Embodied intelligence based\non effective learning rule, training mechanism, and adaptability; (P2)\nCross-layer optimizations for energy-efficient neuromorphic computing; (P3)\nRepresentative and fair benchmarks; (P4) Low-cost reliability and safety\nenhancements; (P5) Security and privacy for neuromorphic computing; and (P6) A\nsynergistic development for energy-efficient and robust neuromorphic-based\nrobotics. Furthermore, this paper identifies research challenges and\nopportunities, as well as elaborates our vision for future research development\ntoward embodied neuromorphic AI for robotics.\n","authors":["Rachmad Vidya Wicaksana Putra","Alberto Marchisio","Fakhreddine Zayer","Jorge Dias","Muhammad Shafique"],"pdf_url":"https://arxiv.org/pdf/2404.03325v1.pdf","comment":"8 pages, 9 figures, 1 table"},{"id":"http://arxiv.org/abs/2404.03307v1","updated":"2024-04-04T09:01:17Z","published":"2024-04-04T09:01:17Z","title":"Bi-level Trajectory Optimization on Uneven Terrains with Differentiable\n Wheel-Terrain Interaction Model","summary":" Navigation of wheeled vehicles on uneven terrain necessitates going beyond\nthe 2D approaches for trajectory planning. Specifically, it is essential to\nincorporate the full 6dof variation of vehicle pose and its associated\nstability cost in the planning process. To this end, most recent works aim to\nlearn a neural network model to predict the vehicle evolution. However, such\napproaches are data-intensive and fraught with generalization issues. In this\npaper, we present a purely model-based approach that just requires the digital\nelevation information of the terrain. Specifically, we express the\nwheel-terrain interaction and 6dof pose prediction as a non-linear least\nsquares (NLS) problem. As a result, trajectory planning can be viewed as a\nbi-level optimization. The inner optimization layer predicts the pose on the\nterrain along a given trajectory, while the outer layer deforms the trajectory\nitself to reduce the stability and kinematic costs of the pose. We improve the\nstate-of-the-art in the following respects. First, we show that our NLS based\npose prediction closely matches the output from a high-fidelity physics engine.\nThis result coupled with the fact that we can query gradients of the NLS\nsolver, makes our pose predictor, a differentiable wheel-terrain interaction\nmodel. We further leverage this differentiability to efficiently solve the\nproposed bi-level trajectory optimization problem. Finally, we perform\nextensive experiments, and comparison with a baseline to showcase the\neffectiveness of our approach in obtaining smooth, stable trajectories.\n","authors":["Amith Manoharan","Aditya Sharma","Himani Belsare","Kaustab Pal","K. 
Madhava Krishna","Arun Kumar Singh"],"pdf_url":"https://arxiv.org/pdf/2404.03307v1.pdf","comment":"8 pages, 7 figures, submitted to IEEE/RSJ International Conference on\n Intelligent Robots and Systems (IROS 2024)"},{"id":"http://arxiv.org/abs/2312.11843v2","updated":"2024-04-04T08:30:59Z","published":"2023-12-19T04:15:59Z","title":"Enhancing Social Decision-Making of Autonomous Vehicles: A\n Mixed-Strategy Game Approach With Interaction Orientation Identification","summary":" The integration of Autonomous Vehicles (AVs) into existing human-driven\ntraffic systems poses considerable challenges, especially within environments\nwhere human and machine interactions are frequent and complex, such as at\nunsignalized intersections. To deal with these challenges, we introduce a novel\nframework predicated on dynamic and socially-aware decision-making game theory\nto augment the social decision-making prowess of AVs in mixed driving\nenvironments. This comprehensive framework is delineated into three primary\nmodules: Interaction Orientation Identification, Mixed-Strategy Game Modeling,\nand Expert Mode Learning. We introduce 'Interaction Orientation' as a metric to\nevaluate the social decision-making tendencies of various agents, incorporating\nboth environmental factors and trajectory characteristics. The mixed-strategy\ngame model developed as part of this framework considers the evolution of\nfuture traffic scenarios and includes a utility function that balances safety,\noperational efficiency, and the unpredictability of environmental conditions.\nTo adapt to real-world driving complexities, our framework utilizes a dynamic\noptimization framework for assimilating and learning from expert human driving\nstrategies. These strategies are compiled into a comprehensive strategy\nlibrary, serving as a reference for future decision-making processes. The\nproposed approach is validated through extensive driving datasets and\nhuman-in-loop driving experiments, and the results demonstrate marked\nenhancements in decision timing and precision.\n","authors":["Jiaqi Liu","Xiao Qi","Peng Hang","Jian Sun"],"pdf_url":"https://arxiv.org/pdf/2312.11843v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.07459v2","updated":"2024-04-04T08:24:54Z","published":"2022-11-14T15:37:27Z","title":"Self-Aligning Depth-regularized Radiance Fields for Asynchronous RGB-D\n Sequences","summary":" It has been shown that learning radiance fields with depth rendering and\ndepth supervision can effectively promote the quality and convergence of view\nsynthesis. However, this paradigm requires input RGB-D sequences to be\nsynchronized, hindering its usage in the UAV city modeling scenario. As there\nexists asynchrony between RGB images and depth images due to high-speed flight,\nwe propose a novel time-pose function, which is an implicit network that maps\ntimestamps to $\\rm SE(3)$ elements. To simplify the training process, we also\ndesign a joint optimization scheme to jointly learn the large-scale\ndepth-regularized radiance fields and the time-pose function. Our algorithm\nconsists of three steps: (1) time-pose function fitting, (2) radiance field\nbootstrapping, (3) joint pose error compensation and radiance field refinement.\nIn addition, we propose a large synthetic dataset with diverse controlled\nmismatches and ground truth to evaluate this new problem setting\nsystematically. Through extensive experiments, we demonstrate that our method\noutperforms baselines without regularization. 
We also show qualitatively\nimproved results on a real-world asynchronous RGB-D sequence captured by drone.\nCodes, data, and models will be made publicly available.\n","authors":["Yuxin Huang","Andong Yang","Zirui Wu","Yuantao Chen","Runyi Yang","Zhenxin Zhu","Chao Hou","Hao Zhao","Guyue Zhou"],"pdf_url":"https://arxiv.org/pdf/2211.07459v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01836v2","updated":"2024-04-04T08:20:23Z","published":"2024-04-02T10:48:36Z","title":"CARLOS: An Open, Modular, and Scalable Simulation Framework for the\n Development and Testing of Software for C-ITS","summary":" Future mobility systems and their components are increasingly defined by\ntheir software. The complexity of these cooperative intelligent transport\nsystems (C-ITS) and the everchanging requirements posed at the software require\ncontinual software updates. The dynamic nature of the system and the\npractically innumerable scenarios in which different software components work\ntogether necessitate efficient and automated development and testing procedures\nthat use simulations as one core methodology. The availability of such\nsimulation architectures is a common interest among many stakeholders,\nespecially in the field of automated driving. That is why we propose CARLOS -\nan open, modular, and scalable simulation framework for the development and\ntesting of software in C-ITS that leverages the rich CARLA and ROS ecosystems.\nWe provide core building blocks for this framework and explain how it can be\nused and extended by the community. Its architecture builds upon modern\nmicroservice and DevOps principles such as containerization and continuous\nintegration. In our paper, we motivate the architecture by describing important\ndesign principles and showcasing three major use cases - software prototyping,\ndata-driven development, and automated testing. We make CARLOS and example\nimplementations of the three use cases publicly available at\ngithub.com/ika-rwth-aachen/carlos\n","authors":["Christian Geller","Benedikt Haas","Amarin Kloeker","Jona Hermens","Bastian Lampe","Lutz Eckstein"],"pdf_url":"https://arxiv.org/pdf/2404.01836v2.pdf","comment":"7 pages, 5 figures, 1 table; Accepted to be published as part of the\n 35th IEEE Intelligent Vehicles Symposium (IV), Jeju Island, Korea, June 2-5,\n 2024"},{"id":"http://arxiv.org/abs/2404.03275v1","updated":"2024-04-04T07:59:24Z","published":"2024-04-04T07:59:24Z","title":"DELTA: Decomposed Efficient Long-Term Robot Task Planning using Large\n Language Models","summary":" Recent advancements in Large Language Models (LLMs) have sparked a revolution\nacross various research fields. In particular, the integration of common-sense\nknowledge from LLMs into robot task and motion planning has been proven to be a\ngame-changer, elevating performance in terms of explainability and downstream\ntask efficiency to unprecedented heights. However, managing the vast knowledge\nencapsulated within these large models has posed challenges, often resulting in\ninfeasible plans generated by LLM-based planning systems due to hallucinations\nor missing domain information. To overcome these challenges and obtain even\ngreater planning feasibility and computational efficiency, we propose a novel\nLLM-driven task planning approach called DELTA. 
For achieving better grounding\nfrom environmental topology into actionable knowledge, DELTA leverages the\npower of scene graphs as environment representations within LLMs, enabling the\nfast generation of precise planning problem descriptions. For obtaining higher\nplanning performance, we use LLMs to decompose the long-term task goals into an\nautoregressive sequence of sub-goals for an automated task planner to solve.\nOur contribution enables a more efficient and fully automatic task planning\npipeline, achieving higher planning success rates and significantly shorter\nplanning times compared to the state of the art.\n","authors":["Yuchen Liu","Luigi Palmieri","Sebastian Koch","Ilche Georgievski","Marco Aiello"],"pdf_url":"https://arxiv.org/pdf/2404.03275v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03274v1","updated":"2024-04-04T07:59:18Z","published":"2024-04-04T07:59:18Z","title":"Traversability-aware Adaptive Optimization for Path Planning and Control\n in Mountainous Terrain","summary":" Autonomous navigation in extreme mountainous terrains poses challenges due to\nthe presence of mobility-stressing elements and undulating surfaces, making it\nparticularly difficult compared to conventional off-road driving scenarios. In\nsuch environments, estimating traversability solely based on exteroceptive\nsensors often leads to the inability to reach the goal due to a high prevalence\nof non-traversable areas. In this paper, we consider traversability as a\nrelative value that integrates the robot's internal state, such as speed and\ntorque to exhibit resilient behavior to reach its goal successfully. We\nseparate traversability into apparent traversability and relative\ntraversability, then incorporate these distinctions in the optimization process\nof sampling-based planning and motion predictive control. Our method enables\nthe robots to execute the desired behaviors more accurately while avoiding\nhazardous regions and getting stuck. Experiments conducted on simulation with\n27 diverse types of mountainous terrain and real-world demonstrate the\nrobustness of the proposed framework, with increasingly better performance\nobserved in more complex environments.\n","authors":["Se-Wook Yoo","E In Son","Seung-Woo Seo"],"pdf_url":"https://arxiv.org/pdf/2404.03274v1.pdf","comment":"8 pages, 7 figures, accepted 2024 RA-L"},{"id":"http://arxiv.org/abs/2404.03251v1","updated":"2024-04-04T07:14:12Z","published":"2024-04-04T07:14:12Z","title":"Real-time Noise Source Estimation of a Camera System from an Image and\n Metadata","summary":" Autonomous machines must self-maintain proper functionality to ensure the\nsafety of humans and themselves. This pertains particularly to its cameras as\npredominant sensors to perceive the environment and support actions. A\nfundamental camera problem addressed in this study is noise. Solutions often\nfocus on denoising images a posteriori, that is, fighting symptoms rather than\nroot causes. However, tackling root causes requires identifying the noise\nsources, considering the limitations of mobile platforms. This work\ninvestigates a real-time, memory-efficient and reliable noise source estimator\nthat combines data- and physically-based models. To this end, a DNN that\nexamines an image with camera metadata for major camera noise sources is built\nand trained. In addition, it quantifies unexpected factors that impact image\nnoise or metadata. 
This study investigates seven different estimators on six\ndatasets that include synthetic noise, real-world noise from two camera\nsystems, and real field campaigns. For these, only the model with most metadata\nis capable to accurately and robustly quantify all individual noise\ncontributions. This method outperforms total image noise estimators and can be\nplug-and-play deployed. It also serves as a basis to include more advanced\nnoise sources, or as part of an automatic countermeasure feedback-loop to\napproach fully reliable machines.\n","authors":["Maik Wischow","Patrick Irmisch","Anko Boerner","Guillermo Gallego"],"pdf_url":"https://arxiv.org/pdf/2404.03251v1.pdf","comment":"16 pages, 16 figures, 12 tables, Project page:\n https://github.com/MaikWischow/Noise-Source-Estimation"},{"id":"http://arxiv.org/abs/2303.05038v2","updated":"2024-04-04T05:37:52Z","published":"2023-03-09T05:11:30Z","title":"Exploiting Contextual Structure to Generate Useful Auxiliary Tasks","summary":" Reinforcement learning requires interaction with an environment, which is\nexpensive for robots. This constraint necessitates approaches that work with\nlimited environmental interaction by maximizing the reuse of previous\nexperiences. We propose an approach that maximizes experience reuse while\nlearning to solve a given task by generating and simultaneously learning useful\nauxiliary tasks. To generate these tasks, we construct an abstract temporal\nlogic representation of the given task and leverage large language models to\ngenerate context-aware object embeddings that facilitate object replacements.\nCounterfactual reasoning and off-policy methods allow us to simultaneously\nlearn these auxiliary tasks while solving the given target task. We combine\nthese insights into a novel framework for multitask reinforcement learning and\nexperimentally show that our generated auxiliary tasks share similar underlying\nexploration requirements as the given task, thereby maximizing the utility of\ndirected exploration. Our approach allows agents to automatically learn\nadditional useful policies without extra environment interaction.\n","authors":["Benedict Quartey","Ankit Shah","George Konidaris"],"pdf_url":"https://arxiv.org/pdf/2303.05038v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.09934v7","updated":"2024-04-04T04:52:43Z","published":"2022-07-20T14:20:35Z","title":"DeepIPC: Deeply Integrated Perception and Control for an Autonomous\n Vehicle in Real Environments","summary":" In this work, we introduce DeepIPC, a novel end-to-end model tailored for\nautonomous driving, which seamlessly integrates perception and control tasks.\nUnlike traditional models that handle these tasks separately, DeepIPC\ninnovatively combines a perception module, which processes RGBD images for\nsemantic segmentation and generates bird's eye view (BEV) mappings, with a\ncontroller module that utilizes these insights along with GNSS and angular\nspeed measurements to accurately predict navigational waypoints. This\nintegration allows DeepIPC to efficiently translate complex environmental data\ninto actionable driving commands. Our comprehensive evaluation demonstrates\nDeepIPC's superior performance in terms of drivability and multi-task\nefficiency across diverse real-world scenarios, setting a new benchmark for\nend-to-end autonomous driving systems with a leaner model architecture. 
The\nexperimental results underscore DeepIPC's potential to significantly enhance\nautonomous vehicular navigation, promising a step forward in the development of\nautonomous driving technologies. For further insights and replication, we will\nmake our code and datasets available at https://github.com/oskarnatan/DeepIPC.\n","authors":["Oskar Natan","Jun Miura"],"pdf_url":"https://arxiv.org/pdf/2207.09934v7.pdf","comment":"Accepted for Publication in IEEE Access"},{"id":"http://arxiv.org/abs/2404.03190v1","updated":"2024-04-04T04:22:25Z","published":"2024-04-04T04:22:25Z","title":"Adaptive Discrete Disparity Volume for Self-supervised Monocular Depth\n Estimation","summary":" In self-supervised monocular depth estimation tasks, discrete disparity\nprediction has been proven to attain higher quality depth maps than common\ncontinuous methods. However, current discretization strategies often divide\ndepth ranges of scenes into bins in a handcrafted and rigid manner, limiting\nmodel performance. In this paper, we propose a learnable module, Adaptive\nDiscrete Disparity Volume (ADDV), which is capable of dynamically sensing depth\ndistributions in different RGB images and generating adaptive bins for them.\nWithout any extra supervision, this module can be integrated into existing CNN\narchitectures, allowing networks to produce representative values for bins and\na probability volume over them. Furthermore, we introduce novel training\nstrategies - uniformizing and sharpening - through a loss term and temperature\nparameter, respectively, to provide regularizations under self-supervised\nconditions, preventing model degradation or collapse. Empirical results\ndemonstrate that ADDV effectively processes global information, generating\nappropriate bins for various scenes and producing higher quality depth maps\ncompared to handcrafted methods.\n","authors":["Jianwei Ren"],"pdf_url":"https://arxiv.org/pdf/2404.03190v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06647v3","updated":"2024-04-04T04:07:48Z","published":"2023-07-13T09:23:21Z","title":"DeepIPCv2: LiDAR-powered Robust Environmental Perception and\n Navigational Control for Autonomous Vehicle","summary":" We present DeepIPCv2, an autonomous driving model that perceives the\nenvironment using a LiDAR sensor for more robust drivability, especially when\ndriving under poor illumination conditions where everything is not clearly\nvisible. DeepIPCv2 takes a set of LiDAR point clouds as the main perception\ninput. Since point clouds are not affected by illumination changes, they can\nprovide a clear observation of the surroundings no matter what the condition\nis. This results in a better scene understanding and stable features provided\nby the perception module to support the controller module in estimating\nnavigational control properly. To evaluate its performance, we conduct several\ntests by deploying the model to predict a set of driving records and perform\nreal automated driving under three different conditions. We also conduct\nablation and comparative studies with some recent models to justify its\nperformance. 
Based on the experimental results, DeepIPCv2 shows a robust\nperformance by achieving the best drivability in all driving scenarios.\nFurthermore, to support future research, we will upload the codes and data to\nhttps://github.com/oskarnatan/DeepIPCv2.\n","authors":["Oskar Natan","Jun Miura"],"pdf_url":"https://arxiv.org/pdf/2307.06647v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03186v1","updated":"2024-04-04T04:02:05Z","published":"2024-04-04T04:02:05Z","title":"RAnGE: Reachability Analysis for Guaranteed Ergodicity","summary":" This paper investigates performance guarantees on coverage-based ergodic\nexploration methods in environments containing disturbances. Ergodic\nexploration methods generate trajectories for autonomous robots such that time\nspent in an area is proportional to the utility of exploring in the area.\nHowever, providing formal performance guarantees for ergodic exploration\nmethods is still an open challenge due to the complexities in the problem\nformulation. In this work, we propose to formulate ergodic search as a\ndifferential game, in which a controller and external disturbance force seek to\nminimize and maximize the ergodic metric, respectively. Through an\nextended-state Bolza-form transform of the ergodic problem, we demonstrate it\nis possible to use techniques from reachability analysis to solve for optimal\ncontrollers that guarantee coverage and are robust against disturbances. Our\napproach leverages neural-network based methods to obtain approximate value\nfunction solutions for reachability problems that mitigate the increased\ncomputational scaling due to the extended state. As a result, we are able to\ncompute continuous value functions for the ergodic exploration problem and\nprovide performance guarantees for coverage under disturbances. Simulated and\nexperimental results demonstrate the efficacy of our approach to generate\nrobust ergodic trajectories for search and exploration with external\ndisturbance force.\n","authors":["Henry Berger","Ian Abraham"],"pdf_url":"https://arxiv.org/pdf/2404.03186v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2310.00156v3","updated":"2024-04-04T02:03:20Z","published":"2023-09-29T21:32:42Z","title":"Learning Generalizable Tool-use Skills through Trajectory Generation","summary":" Autonomous systems that efficiently utilize tools can assist humans in\ncompleting many common tasks such as cooking and cleaning. However, current\nsystems fall short of matching human-level of intelligence in terms of adapting\nto novel tools. Prior works based on affordance often make strong assumptions\nabout the environments and cannot scale to more complex, contact-rich tasks. In\nthis work, we tackle this challenge and explore how agents can learn to use\npreviously unseen tools to manipulate deformable objects. We propose to learn a\ngenerative model of the tool-use trajectories as a sequence of tool point\nclouds, which generalizes to different tool shapes. Given any novel tool, we\nfirst generate a tool-use trajectory and then optimize the sequence of tool\nposes to align with the generated trajectory. We train a single model on four\ndifferent challenging deformable object manipulation tasks, using demonstration\ndata from only one tool per task. The model generalizes to various novel tools,\nsignificantly outperforming baselines. We further test our trained policy in\nthe real world with unseen tools, where it achieves the performance comparable\nto human. 
Additional materials can be found on our project website:\nhttps://sites.google.com/view/toolgen.\n","authors":["Carl Qi","Yilin Wu","Lifan Yu","Haoyue Liu","Bowen Jiang","Xingyu Lin","David Held"],"pdf_url":"https://arxiv.org/pdf/2310.00156v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03149v1","updated":"2024-04-04T01:46:31Z","published":"2024-04-04T01:46:31Z","title":"Design and Evaluation of a Compact 3D End-effector Assistive Robot for\n Adaptive Arm Support","summary":" We developed a 3D end-effector type of upper limb assistive robot, named as\nAssistive Robotic Arm Extender (ARAE), that provides transparency movement and\nadaptive arm support control to achieve home-based therapy and training in the\nreal environment. The proposed system composes five degrees of freedom,\nincluding three active motors and two passive joints at the end-effector\nmodule. The core structure of the system is based on a parallel mechanism. The\nkinematic and dynamic modeling are illustrated in detail. The proposed adaptive\narm support control framework calculates the compensated force based on the\nestimated human arm posture in 3D space. It firstly estimates human arm joint\nangles using two proposed methods: fixed torso and sagittal plane models\nwithout using external sensors such as IMUs, magnetic sensors, or depth\ncameras. The experiments were carried out to evaluate the performance of the\ntwo proposed angle estimation methods. Then, the estimated human joint angles\nwere input into the human upper limb dynamics model to derive the required\nsupport force generated by the robot. The muscular activities were measured to\nevaluate the effects of the proposed framework. The obvious reduction of\nmuscular activities was exhibited when participants were tested with the ARAE\nunder an adaptive arm gravity compensation control framework. The overall\nresults suggest that the ARAE system, when combined with the proposed control\nframework, has the potential to offer adaptive arm support. This integration\ncould enable effective training with Activities of Daily Living (ADLs) and\ninteraction with real environments.\n","authors":["Sibo Yang","Lincong Luo","Wei Chuan Law","Youlong Wang","Lei Li","Wei Tech Ang"],"pdf_url":"https://arxiv.org/pdf/2404.03149v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2311.05780v2","updated":"2024-04-04T01:43:42Z","published":"2023-11-09T22:57:21Z","title":"Real-time Control of Electric Autonomous Mobility-on-Demand Systems via\n Graph Reinforcement Learning","summary":" Operators of Electric Autonomous Mobility-on-Demand (E-AMoD) fleets need to\nmake several real-time decisions such as matching available vehicles to ride\nrequests, rebalancing idle vehicles to areas of high demand, and charging\nvehicles to ensure sufficient range. While this problem can be posed as a\nlinear program that optimizes flows over a space-charge-time graph, the size of\nthe resulting optimization problem does not allow for real-time implementation\nin realistic settings. In this work, we present the E-AMoD control problem\nthrough the lens of reinforcement learning and propose a graph network-based\nframework to achieve drastically improved scalability and superior performance\nover heuristics. Specifically, we adopt a bi-level formulation where we (1)\nleverage a graph network-based RL agent to specify a desired next state in the\nspace-charge graph, and (2) solve more tractable linear programs to best\nachieve the desired state while ensuring feasibility. 
Experiments using\nreal-world data from San Francisco and New York City show that our approach\nachieves up to 89% of the profits of the theoretically-optimal solution while\nachieving more than a 100x speedup in computational time. We further highlight\npromising zero-shot transfer capabilities of our learned policy on tasks such\nas inter-city generalization and service area expansion, thus showing the\nutility, scalability, and flexibility of our framework. Finally, our approach\noutperforms the best domain-specific heuristics with comparable runtimes, with\nan increase in profits by up to 3.2x.\n","authors":["Aaryan Singhal","Daniele Gammelli","Justin Luke","Karthik Gopalakrishnan","Dominik Helmreich","Marco Pavone"],"pdf_url":"https://arxiv.org/pdf/2311.05780v2.pdf","comment":"9 pages, revised SF travel data, includes additional experimental\n results, content and clarification revisions per reviewer feedback, and typo\n fixes"},{"id":"http://arxiv.org/abs/2404.03133v1","updated":"2024-04-04T00:58:19Z","published":"2024-04-04T00:58:19Z","title":"A Framework for Guided Motion Planning","summary":" Randomized sampling based algorithms are widely used in robot motion planning\ndue to the problem's intractability, and are experimentally effective on a wide\nrange of problem instances. Most variants bias their sampling using various\nheuristics related to the known underlying structure of the search space. In\nthis work, we formalize the intuitive notion of guided search by defining the\nconcept of a guiding space. This new language encapsulates many seemingly\ndistinct prior methods under the same framework, and allows us to reason about\nguidance, a previously obscured core contribution of different algorithms. We\nsuggest an information theoretic method to evaluate guidance, which\nexperimentally matches intuition when tested on known algorithms in a variety\nof environments. The language and evaluation of guidance suggests improvements\nto existing methods, and allows for simple hybrid algorithms that combine\nguidance from multiple sources.\n","authors":["Amnon Attali","Stav Ashur","Isaac Burton Love","Courtney McBeth","James Motes","Marco Morales","Nancy M. Amato"],"pdf_url":"https://arxiv.org/pdf/2404.03133v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02136v2","updated":"2024-04-04T00:47:48Z","published":"2023-08-04T04:58:06Z","title":"World-Model-Based Control for Industrial box-packing of Multiple Objects\n using NewtonianVAE","summary":" The process of industrial box-packing, which involves the accurate placement\nof multiple objects, requires high-accuracy positioning and sequential actions.\nWhen a robot is tasked with placing an object at a specific location with high\naccuracy, it is important not only to have information about the location of\nthe object to be placed, but also the posture of the object grasped by the\nrobotic hand. Often, industrial box-packing requires the sequential placement\nof identically shaped objects into a single box. The robot's action should be\ndetermined by the same learned model. In factories, new kinds of products often\nappear and there is a need for a model that can easily adapt to them.\nTherefore, it should be easy to collect data to train the model. In this study,\nwe designed a robotic system to automate real-world industrial tasks, employing\na vision-based learning control model. 
We propose in-hand-view-sensitive\nNewtonian variational autoencoder (ihVS-NVAE), which employs an RGB camera to\nobtain in-hand postures of objects. We demonstrate that our model, trained for\na single object-placement task, can handle sequential tasks without additional\ntraining. To evaluate efficacy of the proposed model, we employed a real robot\nto perform sequential industrial box-packing of multiple objects. Results\nshowed that the proposed model achieved a 100% success rate in industrial\nbox-packing tasks, thereby outperforming the state-of-the-art and conventional\napproaches, underscoring its superior effectiveness and potential in industrial\ntasks.\n","authors":["Yusuke Kato","Ryo Okumura","Tadahiro Taniguchi"],"pdf_url":"https://arxiv.org/pdf/2308.02136v2.pdf","comment":"7 pages, 8 figures"},{"id":"http://arxiv.org/abs/2109.07201v2","updated":"2024-04-04T12:41:01Z","published":"2021-09-15T10:40:28Z","title":"Expectable Motion Unit: Avoiding Hazards From Human Involuntary Motions\n in Human-Robot Interaction","summary":" In robotics, many control and planning schemes have been developed to ensure\nhuman physical safety in human- robot interaction. The human psychological\nstate and the ex- pectation towards the robot, however, are typically\nneglected. Even if the robot behaviour is regarded as biomechanically safe,\nhumans may still react with a rapid involuntary motion (IM) caused by a startle\nor surprise. Such sudden, uncontrolled motions can jeopardize safety and should\nbe prevented by any means. In this letter, we propose the Expectable Motion\nUnit (EMU), which ensures that a certain probability of IM occurrence is not\nexceeded in a typical HRI setting. Based on a model of IM occurrence generated\nthrough an experiment with 29 participants, we establish the mapping between\nrobot velocity, robot-human distance, and the relative frequency of IM\noccurrence. This mapping is processed towards a real-time capable robot motion\ngenerator that limits the robot velocity during task execution if necessary.\nThe EMU is combined in a holistic safety framework that integrates both the\nphysical and psychological safety knowledge. A validation experiment showed\nthat the EMU successfully avoids human IM in five out of six cases.\n","authors":["Robin Jeanne Kirschner","Henning Mayer","Lisa Burr","Nico Mansfeld","Saeed Abdolshah","Sami Haddadin"],"pdf_url":"https://arxiv.org/pdf/2109.07201v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03834v1","updated":"2024-04-04T23:31:14Z","published":"2024-04-04T23:31:14Z","title":"Fast k-connectivity Restoration in Multi-Robot Systems for Robust\n Communication Maintenance","summary":" Maintaining a robust communication network plays an important role in the\nsuccess of a multi-robot team jointly performing an optimization task. A key\ncharacteristic of a robust cooperative multi-robot system is the ability to\nrepair the communication topology in the case of robot failure. In this paper,\nwe focus on the Fast k-connectivity Restoration (FCR) problem, which aims to\nrepair a network to make it k-connected with minimum robot movement. We develop\na Quadratically Constrained Program (QCP) formulation of the FCR problem, which\nprovides a way to optimally solve the problem, but cannot handle large\ninstances due to high computational overhead. 
We therefore present a scalable\nalgorithm, called EA-SCR, for the FCR problem using graph theoretic concepts.\nBy conducting empirical studies, we demonstrate that the EA-SCR algorithm\nperforms within 10 percent of the optimal while being orders of magnitude\nfaster. We also show that EA-SCR outperforms existing solutions by 30 percent\nin terms of the FCR distance metric.\n","authors":["Md Ishat-E-Rabban","Guangyao Shi","Griffin Bonner","Pratap Tokekar"],"pdf_url":"https://arxiv.org/pdf/2404.03834v1.pdf","comment":"17 pages, 6 figures, 3 algorithms. arXiv admin note: text overlap\n with arXiv:2011.00685"},{"id":"http://arxiv.org/abs/2201.12900v2","updated":"2024-04-04T21:54:18Z","published":"2022-01-30T19:51:28Z","title":"Learning Optimal Topology for Ad-hoc Robot Networks","summary":" In this paper, we synthesize a data-driven method to predict the optimal\ntopology of an ad-hoc robot network. This problem is technically a multi-task\nclassification problem. However, we divide it into a class of multi-class\nclassification problems that can be more efficiently solved. For this purpose,\nwe first compose an algorithm to create ground-truth optimal topologies\nassociated with various configurations of a robot network. This algorithm\nincorporates a complex collection of optimality criteria that our learning\nmodel successfully manages to learn. This model is an stacked ensemble whose\noutput is the topology prediction for a particular robot. Each stacked ensemble\ninstance constitutes three low-level estimators whose outputs will be\naggregated by a high-level boosting blender. Applying our model to a network of\n10 robots displays over 80% accuracy in the prediction of optimal topologies\ncorresponding to various configurations of the cited network.\n","authors":["Matin Macktoobian","Zhan Shu","Qing Zhao"],"pdf_url":"https://arxiv.org/pdf/2201.12900v2.pdf","comment":"This version is the one published in IEEE Robotics and Automation\n Letters"},{"id":"http://arxiv.org/abs/2404.03816v1","updated":"2024-04-04T21:47:34Z","published":"2024-04-04T21:47:34Z","title":"Accounting for Hysteresis in the Forward Kinematics of\n Nonlinearly-Routed Tendon-Driven Continuum Robots via a Learned Deep Decoder\n Network","summary":" Tendon-driven continuum robots have been gaining popularity in medical\napplications due to their ability to curve around complex anatomical\nstructures, potentially reducing the invasiveness of surgery. However, accurate\nmodeling is required to plan and control the movements of these flexible\nrobots. Physics-based models have limitations due to unmodeled effects, leading\nto mismatches between model prediction and actual robot shape. Recently\nproposed learning-based methods have been shown to overcome some of these\nlimitations but do not account for hysteresis, a significant source of error\nfor these robots. To overcome these challenges, we propose a novel deep decoder\nneural network that predicts the complete shape of tendon-driven robots using\npoint clouds as the shape representation, conditioned on prior configurations\nto account for hysteresis. We evaluate our method on a physical tendon-driven\nrobot and show that our network model accurately predicts the robot's shape,\nsignificantly outperforming a state-of-the-art physics-based model and a\nlearning-based model that does not account for hysteresis.\n","authors":["Brian Y. Cho","Daniel S. Esser","Jordan Thompson","Bao Thach","Robert J. 
Webster III","Alan Kuntz"],"pdf_url":"https://arxiv.org/pdf/2404.03816v1.pdf","comment":"8 pages, 9 figures, Submitted to IEEE Robotics and Automation Letters"},{"id":"http://arxiv.org/abs/2310.05041v3","updated":"2024-04-04T21:33:35Z","published":"2023-10-08T06:51:05Z","title":"An Anomaly Behavior Analysis Framework for Securing Autonomous Vehicle\n Perception","summary":" As a rapidly growing cyber-physical platform, Autonomous Vehicles (AVs) are\nencountering more security challenges as their capabilities continue to expand.\nIn recent years, adversaries are actively targeting the perception sensors of\nautonomous vehicles with sophisticated attacks that are not easily detected by\nthe vehicles' control systems. This work proposes an Anomaly Behavior Analysis\napproach to detect a perception sensor attack against an autonomous vehicle.\nThe framework relies on temporal features extracted from a physics-based\nautonomous vehicle behavior model to capture the normal behavior of vehicular\nperception in autonomous driving. By employing a combination of model-based\ntechniques and machine learning algorithms, the proposed framework\ndistinguishes between normal and abnormal vehicular perception behavior. To\ndemonstrate the application of the framework in practice, we performed a depth\ncamera attack experiment on an autonomous vehicle testbed and generated an\nextensive dataset. We validated the effectiveness of the proposed framework\nusing this real-world data and released the dataset for public access. To our\nknowledge, this dataset is the first of its kind and will serve as a valuable\nresource for the research community in evaluating their intrusion detection\ntechniques effectively.\n","authors":["Murad Mehrab Abrar","Salim Hariri"],"pdf_url":"https://arxiv.org/pdf/2310.05041v3.pdf","comment":"20th ACS/IEEE International Conference on Computer Systems and\n Applications (IEEE AICCSA 2023)"},{"id":"http://arxiv.org/abs/2303.01440v4","updated":"2024-04-04T20:17:52Z","published":"2023-03-02T17:57:28Z","title":"Programmatic Imitation Learning from Unlabeled and Noisy Demonstrations","summary":" Imitation Learning (IL) is a promising paradigm for teaching robots to\nperform novel tasks using demonstrations. Most existing approaches for IL\nutilize neural networks (NN), however, these methods suffer from several\nwell-known limitations: they 1) require large amounts of training data, 2) are\nhard to interpret, and 3) are hard to repair and adapt. There is an emerging\ninterest in programmatic imitation learning (PIL), which offers significant\npromise in addressing the above limitations. In PIL, the learned policy is\nrepresented in a programming language, making it amenable to interpretation and\nrepair. However, state-of-the-art PIL algorithms assume access to action labels\nand struggle to learn from noisy real-world demonstrations. In this paper, we\npropose PLUNDER, a novel PIL algorithm that integrates a probabilistic program\nsynthesizer in an iterative Expectation-Maximization (EM) framework to address\nthese shortcomings. Unlike existing PIL approaches, PLUNDER synthesizes\nprobabilistic programmatic policies that are particularly well-suited for\nmodeling the uncertainties inherent in real-world demonstrations. Our approach\nleverages an EM loop to simultaneously infer the missing action labels and the\nmost likely probabilistic policy. 
We benchmark PLUNDER against several\nestablished IL techniques, and demonstrate its superiority across five\nchallenging imitation learning tasks under noise. PLUNDER policies achieve 95%\naccuracy in matching the given demonstrations, outperforming the next best\nbaseline by 19%. Additionally, policies generated by PLUNDER successfully\ncomplete the tasks 17% more frequently than the nearest baseline.\n","authors":["Jimmy Xin","Linus Zheng","Kia Rahmani","Jiayi Wei","Jarrett Holtz","Isil Dillig","Joydeep Biswas"],"pdf_url":"https://arxiv.org/pdf/2303.01440v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03790v1","updated":"2024-04-04T20:07:26Z","published":"2024-04-04T20:07:26Z","title":"A Bimanual Teleoperation Framework for Light Duty Underwater\n Vehicle-Manipulator Systems","summary":" In an effort to lower the barrier to entry in underwater manipulation, this\npaper presents an open-source, user-friendly framework for bimanual\nteleoperation of a light-duty underwater vehicle-manipulator system (UVMS).\nThis framework allows for the control of the vehicle along with two\nmanipulators and their end-effectors using two low-cost haptic devices.\n The UVMS kinematics are derived in order to create an independent resolved\nmotion rate controller for each manipulator, which optimally controls the joint\npositions to achieve a desired end-effector pose. This desired pose is computed\nin real-time using a teleoperation controller developed to process the dual\nhaptic device input from the user. A physics-based simulation environment is\nused to implement this framework for two example tasks as well as provide data\nfor error analysis of user commands. The first task illustrates the\nfunctionality of the framework through motion control of the vehicle and\nmanipulators using only the haptic devices. The second task is to grasp an\nobject using both manipulators simultaneously, demonstrating precision and\ncoordination using the framework. The framework code is available at\nhttps://github.com/stevens-armlab/uvms_bimanual_sim.\n","authors":["Justin Sitler","Srikarran Sowrirajan","Brendan Englot","Long Wang"],"pdf_url":"https://arxiv.org/pdf/2404.03790v1.pdf","comment":"8 pages, 21st International Conference on Ubiquitous Robots (UR\n 2024), accepted"},{"id":"http://arxiv.org/abs/2404.03741v1","updated":"2024-04-04T18:27:45Z","published":"2024-04-04T18:27:45Z","title":"A High-Fidelity Simulation Framework for Grasping Stability Analysis in\n Human Casualty Manipulation","summary":" Recently, there has been a growing interest in rescue robots due to their\nvital role in addressing emergency scenarios and providing crucial support in\nchallenging or hazardous situations where human intervention is difficult.\nHowever, very few of these robots are capable of actively engaging with humans\nand undertaking physical manipulation tasks. This limitation is largely\nattributed to the absence of tools that can realistically simulate physical\ninteractions, especially the contact mechanisms between a robotic gripper and a\nhuman body. In this letter, we aim to address key limitations in current\ndevelopments towards robotic casualty manipulation. Firstly, we present an\nintegrative simulation framework for casualty manipulation. We adapt a finite\nelement method (FEM) tool into the grasping and manipulation scenario, and the\ndeveloped framework can provide accurate biomechanical reactions resulting from\nmanipulation. 
Secondly, we conduct a detailed assessment of grasping stability\nduring casualty grasping and manipulation simulations. To validate the\nnecessity and superior performance of the proposed high-fidelity simulation\nframework, we conducted a qualitative and quantitative comparison of grasping\nstability analyses between the proposed framework and the state-of-the-art\nmulti-body physics simulations. Through these efforts, we have taken the first\nstep towards a feasible solution for robotic casualty manipulation.\n","authors":["Qianwen Zhao","Rajarshi Roy","Chad Spurlock","Kevin Lister","Long Wang"],"pdf_url":"https://arxiv.org/pdf/2404.03741v1.pdf","comment":"8 pages, revision submitted to IEEE RA-L, under review"},{"id":"http://arxiv.org/abs/2404.03734v1","updated":"2024-04-04T18:03:39Z","published":"2024-04-04T18:03:39Z","title":"Legible and Proactive Robot Planning for Prosocial Human-Robot\n Interactions","summary":" Humans have a remarkable ability to fluently engage in joint collision\navoidance in crowded navigation tasks despite the complexities and\nuncertainties inherent in human behavior. Underlying these interactions is a\nmutual understanding that (i) individuals are prosocial, that is, there is\nequitable responsibility in avoiding collisions, and (ii) individuals should\nbehave legibly, that is, move in a way that clearly conveys their intent to\nreduce ambiguity in how they intend to avoid others. Toward building robots\nthat can safely and seamlessly interact with humans, we propose a general robot\ntrajectory planning framework for synthesizing legible and proactive behaviors\nand demonstrate that our robot planner naturally leads to prosocial\ninteractions. Specifically, we introduce the notion of a markup factor to\nincentivize legible and proactive behaviors and an inconvenience budget\nconstraint to ensure equitable collision avoidance responsibility. We evaluate\nour approach against well-established multi-agent planning algorithms and show\nthat using our approach produces safe, fluent, and prosocial interactions. We\ndemonstrate the real-time feasibility of our approach with human-in-the-loop\nsimulations. Project page can be found at https://uw-ctrl.github.io/phri/.\n","authors":["Jasper Geldenbott","Karen Leung"],"pdf_url":"https://arxiv.org/pdf/2404.03734v1.pdf","comment":"Accepted to IEEE International Conference on Robotics and Automation\n 2024"},{"id":"http://arxiv.org/abs/2404.03729v1","updated":"2024-04-04T18:00:15Z","published":"2024-04-04T18:00:15Z","title":"JUICER: Data-Efficient Imitation Learning for Robotic Assembly","summary":" While learning from demonstrations is powerful for acquiring visuomotor\npolicies, high-performance imitation without large demonstration datasets\nremains challenging for tasks requiring precise, long-horizon manipulation.\nThis paper proposes a pipeline for improving imitation learning performance\nwith a small human demonstration budget. We apply our approach to assembly\ntasks that require precisely grasping, reorienting, and inserting multiple\nparts over long horizons and multiple task phases. Our pipeline combines\nexpressive policy architectures and various techniques for dataset expansion\nand simulation-based data augmentation. These help expand dataset support and\nsupervise the model with locally corrective actions near bottleneck regions\nrequiring high precision. 
We demonstrate our pipeline on four furniture\nassembly tasks in simulation, enabling a manipulator to assemble up to five\nparts over nearly 2500 time steps directly from RGB images, outperforming\nimitation and data augmentation baselines.\n","authors":["Lars Ankile","Anthony Simeonov","Idan Shenfeld","Pulkit Agrawal"],"pdf_url":"https://arxiv.org/pdf/2404.03729v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.03658v1","updated":"2024-04-04T17:59:59Z","published":"2024-04-04T17:59:59Z","title":"Know Your Neighbors: Improving Single-View Reconstruction via Spatial\n Vision-Language Reasoning","summary":" Recovering the 3D scene geometry from a single view is a fundamental yet\nill-posed problem in computer vision. While classical depth estimation methods\ninfer only a 2.5D scene representation limited to the image plane, recent\napproaches based on radiance fields reconstruct a full 3D representation.\nHowever, these methods still struggle with occluded regions since inferring\ngeometry without visual observation requires (i) semantic knowledge of the\nsurroundings, and (ii) reasoning about spatial context. We propose KYN, a novel\nmethod for single-view scene reconstruction that reasons about semantic and\nspatial context to predict each point's density. We introduce a vision-language\nmodulation module to enrich point features with fine-grained semantic\ninformation. We aggregate point representations across the scene through a\nlanguage-guided spatial attention mechanism to yield per-point density\npredictions aware of the 3D semantic context. We show that KYN improves 3D\nshape recovery compared to predicting density for each 3D point in isolation.\nWe achieve state-of-the-art results in scene and object reconstruction on\nKITTI-360, and show improved zero-shot generalization compared to prior work.\nProject page: https://ruili3.github.io/kyn.\n","authors":["Rui Li","Tobias Fischer","Mattia Segu","Marc Pollefeys","Luc Van Gool","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2404.03658v1.pdf","comment":"CVPR 2024. Project page: https://ruili3.github.io/kyn"},{"id":"http://arxiv.org/abs/2404.03657v1","updated":"2024-04-04T17:59:58Z","published":"2024-04-04T17:59:58Z","title":"OW-VISCap: Open-World Video Instance Segmentation and Captioning","summary":" Open-world video instance segmentation is an important video understanding\ntask. Yet most methods either operate in a closed-world setting, require an\nadditional user-input, or use classic region-based proposals to identify never\nbefore seen objects. Further, these methods only assign a one-word label to\ndetected objects, and don't generate rich object-centric descriptions. They\nalso often suffer from highly overlapping predictions. To address these issues,\nwe propose Open-World Video Instance Segmentation and Captioning (OW-VISCap),\nan approach to jointly segment, track, and caption previously seen or unseen\nobjects in a video. For this, we introduce open-world object queries to\ndiscover never before seen objects without additional user-input. We generate\nrich and descriptive object-centric captions for each detected object via a\nmasked attention augmented LLM input. We introduce an inter-query contrastive\nloss to ensure that the object queries differ from one another. 
Our generalized\napproach matches or surpasses state-of-the-art on three tasks: open-world video\ninstance segmentation on the BURST dataset, dense video object captioning on\nthe VidSTG dataset, and closed-world video instance segmentation on the OVIS\ndataset.\n","authors":["Anwesa Choudhuri","Girish Chowdhary","Alexander G. Schwing"],"pdf_url":"https://arxiv.org/pdf/2404.03657v1.pdf","comment":"Project page: https://anwesachoudhuri.github.io/OpenWorldVISCap/"},{"id":"http://arxiv.org/abs/2404.03656v1","updated":"2024-04-04T17:59:57Z","published":"2024-04-04T17:59:57Z","title":"MVD-Fusion: Single-view 3D via Depth-consistent Multi-view Generation","summary":" We present MVD-Fusion: a method for single-view 3D inference via generative\nmodeling of multi-view-consistent RGB-D images. While recent methods pursuing\n3D inference advocate learning novel-view generative models, these generations\nare not 3D-consistent and require a distillation process to generate a 3D\noutput. We instead cast the task of 3D inference as directly generating\nmutually-consistent multiple views and build on the insight that additionally\ninferring depth can provide a mechanism for enforcing this consistency.\nSpecifically, we train a denoising diffusion model to generate multi-view RGB-D\nimages given a single RGB input image and leverage the (intermediate noisy)\ndepth estimates to obtain reprojection-based conditioning to maintain\nmulti-view consistency. We train our model using large-scale synthetic dataset\nObajverse as well as the real-world CO3D dataset comprising of generic camera\nviewpoints. We demonstrate that our approach can yield more accurate synthesis\ncompared to recent state-of-the-art, including distillation-based 3D inference\nand prior multi-view generation methods. We also evaluate the geometry induced\nby our multi-view depth prediction and find that it yields a more accurate\nrepresentation than other direct 3D inference approaches.\n","authors":["Hanzhe Hu","Zhizhuo Zhou","Varun Jampani","Shubham Tulsiani"],"pdf_url":"https://arxiv.org/pdf/2404.03656v1.pdf","comment":"Project page: https://mvd-fusion.github.io/"},{"id":"http://arxiv.org/abs/2404.03654v1","updated":"2024-04-04T17:59:50Z","published":"2024-04-04T17:59:50Z","title":"RaFE: Generative Radiance Fields Restoration","summary":" NeRF (Neural Radiance Fields) has demonstrated tremendous potential in novel\nview synthesis and 3D reconstruction, but its performance is sensitive to input\nimage quality, which struggles to achieve high-fidelity rendering when provided\nwith low-quality sparse input viewpoints. Previous methods for NeRF restoration\nare tailored for specific degradation type, ignoring the generality of\nrestoration. To overcome this limitation, we propose a generic radiance fields\nrestoration pipeline, named RaFE, which applies to various types of\ndegradations, such as low resolution, blurriness, noise, compression artifacts,\nor their combinations. Our approach leverages the success of off-the-shelf 2D\nrestoration methods to recover the multi-view images individually. Instead of\nreconstructing a blurred NeRF by averaging inconsistencies, we introduce a\nnovel approach using Generative Adversarial Networks (GANs) for NeRF generation\nto better accommodate the geometric and appearance inconsistencies present in\nthe multi-view images. 
Specifically, we adopt a two-level tri-plane\narchitecture, where the coarse level remains fixed to represent the low-quality\nNeRF, and a fine-level residual tri-plane to be added to the coarse level is\nmodeled as a distribution with GAN to capture potential variations in\nrestoration. We validate RaFE on both synthetic and real cases for various\nrestoration tasks, demonstrating superior performance in both quantitative and\nqualitative evaluations, surpassing other 3D restoration methods specific to\nsingle task. Please see our project website\nhttps://zkaiwu.github.io/RaFE-Project/.\n","authors":["Zhongkai Wu","Ziyu Wan","Jing Zhang","Jing Liao","Dong Xu"],"pdf_url":"https://arxiv.org/pdf/2404.03654v1.pdf","comment":"Project Page: https://zkaiwu.github.io/RaFE-Project/"},{"id":"http://arxiv.org/abs/2404.03653v1","updated":"2024-04-04T17:59:46Z","published":"2024-04-04T17:59:46Z","title":"CoMat: Aligning Text-to-Image Diffusion Model with Image-to-Text Concept\n Matching","summary":" Diffusion models have demonstrated great success in the field of\ntext-to-image generation. However, alleviating the misalignment between the\ntext prompts and images is still challenging. The root reason behind the\nmisalignment has not been extensively investigated. We observe that the\nmisalignment is caused by inadequate token attention activation. We further\nattribute this phenomenon to the diffusion model's insufficient condition\nutilization, which is caused by its training paradigm. To address the issue, we\npropose CoMat, an end-to-end diffusion model fine-tuning strategy with an\nimage-to-text concept matching mechanism. We leverage an image captioning model\nto measure image-to-text alignment and guide the diffusion model to revisit\nignored tokens. A novel attribute concentration module is also proposed to\naddress the attribute binding problem. Without any image or human preference\ndata, we use only 20K text prompts to fine-tune SDXL to obtain CoMat-SDXL.\nExtensive experiments show that CoMat-SDXL significantly outperforms the\nbaseline model SDXL in two text-to-image alignment benchmarks and achieves\nstart-of-the-art performance.\n","authors":["Dongzhi Jiang","Guanglu Song","Xiaoshi Wu","Renrui Zhang","Dazhong Shen","Zhuofan Zong","Yu Liu","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2404.03653v1.pdf","comment":"Project Page: https://caraj7.github.io/comat"},{"id":"http://arxiv.org/abs/2404.03652v1","updated":"2024-04-04T17:59:40Z","published":"2024-04-04T17:59:40Z","title":"The More You See in 2D, the More You Perceive in 3D","summary":" Humans can infer 3D structure from 2D images of an object based on past\nexperience and improve their 3D understanding as they see more images. Inspired\nby this behavior, we introduce SAP3D, a system for 3D reconstruction and novel\nview synthesis from an arbitrary number of unposed images. Given a few unposed\nimages of an object, we adapt a pre-trained view-conditioned diffusion model\ntogether with the camera poses of the images via test-time fine-tuning. The\nadapted diffusion model and the obtained camera poses are then utilized as\ninstance-specific priors for 3D reconstruction and novel view synthesis. We\nshow that as the number of input images increases, the performance of our\napproach improves, bridging the gap between optimization-based prior-less 3D\nreconstruction methods and single-image-to-3D diffusion-based methods. 
We\ndemonstrate our system on real images as well as standard synthetic benchmarks.\nOur ablation studies confirm that this adaption behavior is key for more\naccurate 3D understanding.\n","authors":["Xinyang Han","Zelin Gao","Angjoo Kanazawa","Shubham Goel","Yossi Gandelsman"],"pdf_url":"https://arxiv.org/pdf/2404.03652v1.pdf","comment":"Project page: https://sap3d.github.io/"},{"id":"http://arxiv.org/abs/2404.03650v1","updated":"2024-04-04T17:59:08Z","published":"2024-04-04T17:59:08Z","title":"OpenNeRF: Open Set 3D Neural Scene Segmentation with Pixel-Wise Features\n and Rendered Novel Views","summary":" Large visual-language models (VLMs), like CLIP, enable open-set image\nsegmentation to segment arbitrary concepts from an image in a zero-shot manner.\nThis goes beyond the traditional closed-set assumption, i.e., where models can\nonly segment classes from a pre-defined training set. More recently, first\nworks on open-set segmentation in 3D scenes have appeared in the literature.\nThese methods are heavily influenced by closed-set 3D convolutional approaches\nthat process point clouds or polygon meshes. However, these 3D scene\nrepresentations do not align well with the image-based nature of the\nvisual-language models. Indeed, point cloud and 3D meshes typically have a\nlower resolution than images and the reconstructed 3D scene geometry might not\nproject well to the underlying 2D image sequences used to compute pixel-aligned\nCLIP features. To address these challenges, we propose OpenNeRF which naturally\noperates on posed images and directly encodes the VLM features within the NeRF.\nThis is similar in spirit to LERF, however our work shows that using pixel-wise\nVLM features (instead of global CLIP features) results in an overall less\ncomplex architecture without the need for additional DINO regularization. Our\nOpenNeRF further leverages NeRF's ability to render novel views and extract\nopen-set VLM features from areas that are not well observed in the initial\nposed images. For 3D point cloud segmentation on the Replica dataset, OpenNeRF\noutperforms recent open-vocabulary methods such as LERF and OpenScene by at\nleast +4.9 mIoU.\n","authors":["Francis Engelmann","Fabian Manhardt","Michael Niemeyer","Keisuke Tateno","Marc Pollefeys","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2404.03650v1.pdf","comment":"ICLR 2024, Project page: https://opennerf.github.io"},{"id":"http://arxiv.org/abs/2404.03645v1","updated":"2024-04-04T17:58:21Z","published":"2024-04-04T17:58:21Z","title":"Decoupling Static and Hierarchical Motion Perception for Referring Video\n Segmentation","summary":" Referring video segmentation relies on natural language expressions to\nidentify and segment objects, often emphasizing motion clues. Previous works\ntreat a sentence as a whole and directly perform identification at the\nvideo-level, mixing up static image-level cues with temporal motion cues.\nHowever, image-level features cannot well comprehend motion cues in sentences,\nand static cues are not crucial for temporal perception. In fact, static cues\ncan sometimes interfere with temporal perception by overshadowing motion cues.\nIn this work, we propose to decouple video-level referring expression\nunderstanding into static and motion perception, with a specific emphasis on\nenhancing temporal comprehension. Firstly, we introduce an\nexpression-decoupling module to make static cues and motion cues perform their\ndistinct role, alleviating the issue of sentence embeddings overlooking motion\ncues. 
Secondly, we propose a hierarchical motion perception module to capture\ntemporal information effectively across varying timescales. Furthermore, we\nemploy contrastive learning to distinguish the motions of visually similar\nobjects. These contributions yield state-of-the-art performance across five\ndatasets, including a remarkable $\\textbf{9.2%}$ $\\mathcal{J\\&F}$ improvement\non the challenging $\\textbf{MeViS}$ dataset. Code is available at\nhttps://github.com/heshuting555/DsHmp.\n","authors":["Shuting He","Henghui Ding"],"pdf_url":"https://arxiv.org/pdf/2404.03645v1.pdf","comment":"CVPR 2024, code: https://github.com/heshuting555/DsHmp"},{"id":"http://arxiv.org/abs/2404.03642v1","updated":"2024-04-04T17:57:25Z","published":"2024-04-04T17:57:25Z","title":"DiffBody: Human Body Restoration by Imagining with Generative Diffusion\n Prior","summary":" Human body restoration plays a vital role in various applications related to\nthe human body. Despite recent advances in general image restoration using\ngenerative models, their performance in human body restoration remains\nmediocre, often resulting in foreground and background blending, over-smoothing\nsurface textures, missing accessories, and distorted limbs. Addressing these\nchallenges, we propose a novel approach by constructing a human body-aware\ndiffusion model that leverages domain-specific knowledge to enhance\nperformance. Specifically, we employ a pretrained body attention module to\nguide the diffusion model's focus on the foreground, addressing issues caused\nby blending between the subject and background. We also demonstrate the value\nof revisiting the language modality of the diffusion model in restoration tasks\nby seamlessly incorporating text prompt to improve the quality of surface\ntexture and additional clothing and accessories details. Additionally, we\nintroduce a diffusion sampler tailored for fine-grained human body parts,\nutilizing local semantic information to rectify limb distortions. Lastly, we\ncollect a comprehensive dataset for benchmarking and advancing the field of\nhuman body restoration. Extensive experimental validation showcases the\nsuperiority of our approach, both quantitatively and qualitatively, over\nexisting methods.\n","authors":["Yiming Zhang","Zhe Wang","Xinjie Li","Yunchen Yuan","Chengsong Zhang","Xiao Sun","Zhihang Zhong","Jian Wang"],"pdf_url":"https://arxiv.org/pdf/2404.03642v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12790v3","updated":"2024-04-04T17:55:04Z","published":"2023-03-22T17:58:01Z","title":"$CrowdDiff$: Multi-hypothesis Crowd Density Estimation using Diffusion\n Models","summary":" Crowd counting is a fundamental problem in crowd analysis which is typically\naccomplished by estimating a crowd density map and summing over the density\nvalues. However, this approach suffers from background noise accumulation and\nloss of density due to the use of broad Gaussian kernels to create the ground\ntruth density maps. This issue can be overcome by narrowing the Gaussian\nkernel. However, existing approaches perform poorly when trained with ground\ntruth density maps with broad kernels. To deal with this limitation, we propose\nusing conditional diffusion models to predict density maps, as diffusion models\nshow high fidelity to training data during generation. With that, we present\n$CrowdDiff$ that generates the crowd density map as a reverse diffusion\nprocess. 
Furthermore, as the intermediate time steps of the diffusion process\nare noisy, we incorporate a regression branch for direct crowd estimation only\nduring training to improve the feature learning. In addition, owing to the\nstochastic nature of the diffusion model, we introduce producing multiple\ndensity maps to improve the counting performance contrary to the existing crowd\ncounting pipelines. We conduct extensive experiments on publicly available\ndatasets to validate the effectiveness of our method. $CrowdDiff$ outperforms\nexisting state-of-the-art crowd counting methods on several public crowd\nanalysis benchmarks with significant improvements.\n","authors":["Yasiru Ranasinghe","Nithin Gopalakrishnan Nair","Wele Gedara Chaminda Bandara","Vishal M. Patel"],"pdf_url":"https://arxiv.org/pdf/2303.12790v3.pdf","comment":"Accepted at CVPR'24. The project is available at\n https://dylran.github.io/crowddiff.github.io"},{"id":"http://arxiv.org/abs/2404.03635v1","updated":"2024-04-04T17:54:33Z","published":"2024-04-04T17:54:33Z","title":"WorDepth: Variational Language Prior for Monocular Depth Estimation","summary":" Three-dimensional (3D) reconstruction from a single image is an ill-posed\nproblem with inherent ambiguities, i.e. scale. Predicting a 3D scene from text\ndescription(s) is similarly ill-posed, i.e. spatial arrangements of objects\ndescribed. We investigate the question of whether two inherently ambiguous\nmodalities can be used in conjunction to produce metric-scaled reconstructions.\nTo test this, we focus on monocular depth estimation, the problem of predicting\na dense depth map from a single image, but with an additional text caption\ndescribing the scene. To this end, we begin by encoding the text caption as a\nmean and standard deviation; using a variational framework, we learn the\ndistribution of the plausible metric reconstructions of 3D scenes corresponding\nto the text captions as a prior. To \"select\" a specific reconstruction or depth\nmap, we encode the given image through a conditional sampler that samples from\nthe latent space of the variational text encoder, which is then decoded to the\noutput depth map. Our approach is trained alternatingly between the text and\nimage branches: in one optimization step, we predict the mean and standard\ndeviation from the text description and sample from a standard Gaussian, and in\nthe other, we sample using a (image) conditional sampler. Once trained, we\ndirectly predict depth from the encoded text using the conditional sampler. We\ndemonstrate our approach on indoor (NYUv2) and outdoor (KITTI) scenarios, where\nwe show that language can consistently improve performance in both.\n","authors":["Ziyao Zeng","Daniel Wang","Fengyu Yang","Hyoungseob Park","Yangchao Wu","Stefano Soatto","Byung-Woo Hong","Dong Lao","Alex Wong"],"pdf_url":"https://arxiv.org/pdf/2404.03635v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03634v1","updated":"2024-04-04T17:54:12Z","published":"2024-04-04T17:54:12Z","title":"PreAfford: Universal Affordance-Based Pre-Grasping for Diverse Objects\n and Environments","summary":" Robotic manipulation of ungraspable objects with two-finger grippers presents\nsignificant challenges due to the paucity of graspable features, while\ntraditional pre-grasping techniques, which rely on repositioning objects and\nleveraging external aids like table edges, lack the adaptability across object\ncategories and scenes. 
Addressing this, we introduce PreAfford, a novel\npre-grasping planning framework that utilizes a point-level affordance\nrepresentation and a relay training approach to enhance adaptability across a\nbroad range of environments and object types, including those previously\nunseen. Demonstrated on the ShapeNet-v2 dataset, PreAfford significantly\nimproves grasping success rates by 69% and validates its practicality through\nreal-world experiments. This work offers a robust and adaptable solution for\nmanipulating ungraspable objects.\n","authors":["Kairui Ding","Boyuan Chen","Ruihai Wu","Yuyang Li","Zongzheng Zhang","Huan-ang Gao","Siqi Li","Yixin Zhu","Guyue Zhou","Hao Dong","Hao Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.03634v1.pdf","comment":"Project Page: https://air-discover.github.io/PreAfford/"},{"id":"http://arxiv.org/abs/2404.03632v1","updated":"2024-04-04T17:53:33Z","published":"2024-04-04T17:53:33Z","title":"Reference-Based 3D-Aware Image Editing with Triplane","summary":" Generative Adversarial Networks (GANs) have emerged as powerful tools not\nonly for high-quality image generation but also for real image editing through\nmanipulation of their interpretable latent spaces. Recent advancements in GANs\ninclude the development of 3D-aware models such as EG3D, characterized by\nefficient triplane-based architectures enabling the reconstruction of 3D\ngeometry from single images. However, scant attention has been devoted to\nproviding an integrated framework for high-quality reference-based 3D-aware\nimage editing within this domain. This study addresses this gap by exploring\nand demonstrating the effectiveness of EG3D's triplane space for achieving\nadvanced reference-based edits, presenting a unique perspective on 3D-aware\nimage editing through our novel pipeline. Our approach integrates the encoding\nof triplane features, spatial disentanglement and automatic localization of\nfeatures in the triplane domain, and fusion learning for desired image editing.\nMoreover, our framework demonstrates versatility across domains, extending its\neffectiveness to animal face edits and partial stylization of cartoon\nportraits. The method shows significant improvements over relevant 3D-aware\nlatent editing and 2D reference-based editing methods, both qualitatively and\nquantitatively. Project page: https://three-bee.github.io/triplane_edit\n","authors":["Bahri Batuhan Bilecen","Yigit Yalin","Ning Yu","Aysegul Dundar"],"pdf_url":"https://arxiv.org/pdf/2404.03632v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03631v1","updated":"2024-04-04T17:52:13Z","published":"2024-04-04T17:52:13Z","title":"Robust Concept Erasure Using Task Vectors","summary":" With the rapid growth of text-to-image models, a variety of techniques have\nbeen suggested to prevent undesirable image generations. Yet, these methods\noften only protect against specific user prompts and have been shown to allow\nunsafe generations with other inputs. Here we focus on unconditionally erasing\na concept from a text-to-image model rather than conditioning the erasure on\nthe user's prompt. We first show that compared to input-dependent erasure\nmethods, concept erasure that uses Task Vectors (TV) is more robust to\nunexpected user inputs, not seen during training. However, TV-based erasure can\nalso affect the core performance of the edited model, particularly when the\nrequired edit strength is unknown. 
To this end, we propose a method called\nDiverse Inversion, which we use to estimate the required strength of the TV\nedit. Diverse Inversion finds within the model input space a large set of word\nembeddings, each of which induces the generation of the target concept. We find\nthat encouraging diversity in the set makes our estimation more robust to\nunexpected prompts. Finally, we show that Diverse Inversion enables us to apply\na TV edit only to a subset of the model weights, enhancing the erasure\ncapabilities while better maintaining the core functionality of the model.\n","authors":["Minh Pham","Kelly O. Marshall","Chinmay Hegde","Niv Cohen"],"pdf_url":"https://arxiv.org/pdf/2404.03631v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03620v1","updated":"2024-04-04T17:43:06Z","published":"2024-04-04T17:43:06Z","title":"LCM-Lookahead for Encoder-based Text-to-Image Personalization","summary":" Recent advancements in diffusion models have introduced fast sampling methods\nthat can effectively produce high-quality images in just one or a few denoising\nsteps. Interestingly, when these are distilled from existing diffusion models,\nthey often maintain alignment with the original model, retaining similar\noutputs for similar prompts and seeds. These properties present opportunities\nto leverage fast sampling methods as a shortcut-mechanism, using them to create\na preview of denoised outputs through which we can backpropagate image-space\nlosses. In this work, we explore the potential of using such\nshortcut-mechanisms to guide the personalization of text-to-image models to\nspecific facial identities. We focus on encoder-based personalization\napproaches, and demonstrate that by tuning them with a lookahead identity loss,\nwe can achieve higher identity fidelity, without sacrificing layout diversity\nor prompt alignment. We further explore the use of attention sharing mechanisms\nand consistent data generation for the task of personalization, and find that\nencoder training can benefit from both.\n","authors":["Rinon Gal","Or Lichter","Elad Richardson","Or Patashnik","Amit H. Bermano","Gal Chechik","Daniel Cohen-Or"],"pdf_url":"https://arxiv.org/pdf/2404.03620v1.pdf","comment":"Project page at https://lcm-lookahead.github.io/"},{"id":"http://arxiv.org/abs/2404.03618v1","updated":"2024-04-04T17:40:06Z","published":"2024-04-04T17:40:06Z","title":"DeViDe: Faceted medical knowledge for improved medical vision-language\n pre-training","summary":" Vision-language pre-training for chest X-rays has made significant strides,\nprimarily by utilizing paired radiographs and radiology reports. However,\nexisting approaches often face challenges in encoding medical knowledge\neffectively. While radiology reports provide insights into the current disease\nmanifestation, medical definitions (as used by contemporary methods) tend to be\noverly abstract, creating a gap in knowledge. To address this, we propose\nDeViDe, a novel transformer-based method that leverages radiographic\ndescriptions from the open web. These descriptions outline general visual\ncharacteristics of diseases in radiographs, and when combined with abstract\ndefinitions and radiology reports, provide a holistic snapshot of knowledge.\nDeViDe incorporates three key features for knowledge-augmented vision language\nalignment: First, a large-language model-based augmentation is employed to\nhomogenise medical knowledge from diverse sources. Second, this knowledge is\naligned with image information at various levels of granularity. 
Third, a novel\nprojection layer is proposed to handle the complexity of aligning each image\nwith multiple descriptions arising in a multi-label setting. In zero-shot\nsettings, DeViDe performs comparably to fully supervised models on external\ndatasets and achieves state-of-the-art results on three large-scale datasets.\nAdditionally, fine-tuning DeViDe on four downstream tasks and six segmentation\ntasks showcases its superior performance across data from diverse\ndistributions.\n","authors":["Haozhe Luo","Ziyu Zhou","Corentin Royer","Anjany Sekuboyina","Bjoern Menze"],"pdf_url":"https://arxiv.org/pdf/2404.03618v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2208.04060 by other authors"},{"id":"http://arxiv.org/abs/2404.03617v1","updated":"2024-04-04T17:39:41Z","published":"2024-04-04T17:39:41Z","title":"On the Efficiency of Convolutional Neural Networks","summary":" Since the breakthrough performance of AlexNet in 2012, convolutional neural\nnetworks (convnets) have grown into extremely powerful vision models. Deep\nlearning researchers have used convnets to produce accurate results that were\nunachievable a decade ago. Yet computer scientists make computational\nefficiency their primary objective. Accuracy with exorbitant cost is not\nacceptable; an algorithm must also minimize its computational requirements.\nConfronted with the daunting computation that convnets use, deep learning\nresearchers also became interested in efficiency. Researchers applied\ntremendous effort to find the convnet architectures that have the greatest\nefficiency. However, skepticism grew among researchers and engineers alike\nabout the relevance of arithmetic complexity. Contrary to the prevailing view\nthat latency and arithmetic complexity are irreconcilable, a simple formula\nrelates both through computational efficiency. This insight enabled us to\nco-optimize the separate factors that determine latency. We observed that the\ndegenerate conv2d layers that produce the best accuracy-complexity trade-off\nalso have low operational intensity. Therefore, kernels that implement these\nlayers use significant memory resources. We solved this optimization problem\nwith block-fusion kernels that implement all layers of a residual block,\nthereby creating temporal locality, avoiding communication, and reducing\nworkspace size. Our ConvFirst model with block-fusion kernels ran approximately\nfour times as fast as the ConvNeXt baseline with PyTorch Inductor, at equal\naccuracy on the ImageNet-1K classification task. Our unified approach to\nconvnet efficiency envisions a new era of models and kernels that achieve\ngreater accuracy at lower cost.\n","authors":["Andrew Lavin"],"pdf_url":"https://arxiv.org/pdf/2404.03617v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03613v1","updated":"2024-04-04T17:34:41Z","published":"2024-04-04T17:34:41Z","title":"Per-Gaussian Embedding-Based Deformation for Deformable 3D Gaussian\n Splatting","summary":" As 3D Gaussian Splatting (3DGS) provides fast and high-quality novel view\nsynthesis, it is a natural extension to deform a canonical 3DGS to multiple\nframes. However, previous works fail to accurately reconstruct dynamic scenes,\nespecially 1) static parts moving along nearby dynamic parts, and 2) some\ndynamic areas are blurry. We attribute the failure to the wrong design of the\ndeformation field, which is built as a coordinate-based function. 
This approach\nis problematic because 3DGS is a mixture of multiple fields centered at the\nGaussians, not just a single coordinate-based framework. To resolve this\nproblem, we define the deformation as a function of per-Gaussian embeddings and\ntemporal embeddings. Moreover, we decompose deformations as coarse and fine\ndeformations to model slow and fast movements, respectively. Also, we introduce\nan efficient training strategy for faster convergence and higher quality.\nProject page: https://jeongminb.github.io/e-d3dgs/\n","authors":["Jeongmin Bae","Seoha Kim","Youngsik Yun","Hahyun Lee","Gun Bang","Youngjung Uh"],"pdf_url":"https://arxiv.org/pdf/2404.03613v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2404.03611v1","updated":"2024-04-04T17:34:21Z","published":"2024-04-04T17:34:21Z","title":"InsectMamba: Insect Pest Classification with State Space Model","summary":" The classification of insect pests is a critical task in agricultural\ntechnology, vital for ensuring food security and environmental sustainability.\nHowever, the complexity of pest identification, due to factors like high\ncamouflage and species diversity, poses significant obstacles. Existing methods\nstruggle with the fine-grained feature extraction needed to distinguish between\nclosely related pest species. Although recent advancements have utilized\nmodified network structures and combined deep learning approaches to improve\naccuracy, challenges persist due to the similarity between pests and their\nsurroundings. To address this problem, we introduce InsectMamba, a novel\napproach that integrates State Space Models (SSMs), Convolutional Neural\nNetworks (CNNs), Multi-Head Self-Attention mechanism (MSA), and Multilayer\nPerceptrons (MLPs) within Mix-SSM blocks. This integration facilitates the\nextraction of comprehensive visual features by leveraging the strengths of each\nencoding strategy. A selective module is also proposed to adaptively aggregate\nthese features, enhancing the model's ability to discern pest characteristics.\nInsectMamba was evaluated against strong competitors across five insect pest\nclassification datasets. The results demonstrate its superior performance and\nverify the significance of each model component by an ablation study.\n","authors":["Qianning Wang","Chenglin Wang","Zhixin Lai","Yucheng Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.03611v1.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.03590v1","updated":"2024-04-04T16:58:26Z","published":"2024-04-04T16:58:26Z","title":"SemGrasp: Semantic Grasp Generation via Language Aligned Discretization","summary":" Generating natural human grasps necessitates consideration of not just object\ngeometry but also semantic information. Solely depending on object shape for\ngrasp generation confines the applications of prior methods in downstream\ntasks. This paper presents a novel semantic-based grasp generation method,\ntermed SemGrasp, which generates a static human grasp pose by incorporating\nsemantic information into the grasp representation. We introduce a discrete\nrepresentation that aligns the grasp space with semantic space, enabling the\ngeneration of grasp postures in accordance with language instructions. A\nMultimodal Large Language Model (MLLM) is subsequently fine-tuned, integrating\nobject, grasp, and language within a unified semantic space. 
To facilitate the\ntraining of SemGrasp, we have compiled a large-scale, grasp-text-aligned\ndataset named CapGrasp, featuring about 260k detailed captions and 50k diverse\ngrasps. Experimental findings demonstrate that SemGrasp efficiently generates\nnatural human grasps in alignment with linguistic intentions. Our code, models,\nand dataset are available publicly at: https://kailinli.github.io/SemGrasp.\n","authors":["Kailin Li","Jingbo Wang","Lixin Yang","Cewu Lu","Bo Dai"],"pdf_url":"https://arxiv.org/pdf/2404.03590v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03584v1","updated":"2024-04-04T16:48:40Z","published":"2024-04-04T16:48:40Z","title":"Towards more realistic human motion prediction with attention to motion\n coordination","summary":" Joint relation modeling is a curial component in human motion prediction.\nMost existing methods rely on skeletal-based graphs to build the joint\nrelations, where local interactive relations between joint pairs are well\nlearned. However, the motion coordination, a global joint relation reflecting\nthe simultaneous cooperation of all joints, is usually weakened because it is\nlearned from part to whole progressively and asynchronously. Thus, the final\npredicted motions usually appear unrealistic. To tackle this issue, we learn a\nmedium, called coordination attractor (CA), from the spatiotemporal features of\nmotion to characterize the global motion features, which is subsequently used\nto build new relative joint relations. Through the CA, all joints are related\nsimultaneously, and thus the motion coordination of all joints can be better\nlearned. Based on this, we further propose a novel joint relation modeling\nmodule, Comprehensive Joint Relation Extractor (CJRE), to combine this motion\ncoordination with the local interactions between joint pairs in a unified\nmanner. Additionally, we also present a Multi-timescale Dynamics Extractor\n(MTDE) to extract enriched dynamics from the raw position information for\neffective prediction. Extensive experiments show that the proposed framework\noutperforms state-of-the-art methods in both short- and long-term predictions\non H3.6M, CMU-Mocap, and 3DPW.\n","authors":["Pengxiang Ding","Jianqin Yin"],"pdf_url":"https://arxiv.org/pdf/2404.03584v1.pdf","comment":"Accepted by TCSVT"},{"id":"http://arxiv.org/abs/2312.11972v2","updated":"2024-04-04T16:41:22Z","published":"2023-12-19T09:09:46Z","title":"Expressive Forecasting of 3D Whole-body Human Motions","summary":" Human motion forecasting, with the goal of estimating future human behavior\nover a period of time, is a fundamental task in many real-world applications.\nHowever, existing works typically concentrate on predicting the major joints of\nthe human body without considering the delicate movements of the human hands.\nIn practical applications, hand gesture plays an important role in human\ncommunication with the real world, and expresses the primary intention of human\nbeings. In this work, we are the first to formulate a whole-body human pose\nforecasting task, which jointly predicts the future body and hand activities.\nCorrespondingly, we propose a novel Encoding-Alignment-Interaction (EAI)\nframework that aims to predict both coarse (body joints) and fine-grained\n(gestures) activities collaboratively, enabling expressive and\ncross-facilitated forecasting of 3D whole-body human motions. Specifically, our\nmodel involves two key constituents: cross-context alignment (XCA) and\ncross-context interaction (XCI). 
Considering the heterogeneous information\nwithin the whole-body, XCA aims to align the latent features of various human\ncomponents, while XCI focuses on effectively capturing the context interaction\namong the human components. We conduct extensive experiments on a\nnewly-introduced large-scale benchmark and achieve state-of-the-art\nperformance. The code is public for research purposes at\nhttps://github.com/Dingpx/EAI.\n","authors":["Pengxiang Ding","Qiongjie Cui","Min Zhang","Mengyuan Liu","Haofan Wang","Donglin Wang"],"pdf_url":"https://arxiv.org/pdf/2312.11972v2.pdf","comment":"Accepted by AAAI24"},{"id":"http://arxiv.org/abs/2404.03575v1","updated":"2024-04-04T16:38:57Z","published":"2024-04-04T16:38:57Z","title":"DreamScene: 3D Gaussian-based Text-to-3D Scene Generation via Formation\n Pattern Sampling","summary":" Text-to-3D scene generation holds immense potential for the gaming, film, and\narchitecture sectors. Despite significant progress, existing methods struggle\nwith maintaining high quality, consistency, and editing flexibility. In this\npaper, we propose DreamScene, a 3D Gaussian-based novel text-to-3D scene\ngeneration framework, to tackle the aforementioned three challenges mainly via\ntwo strategies. First, DreamScene employs Formation Pattern Sampling (FPS), a\nmulti-timestep sampling strategy guided by the formation patterns of 3D\nobjects, to form fast, semantically rich, and high-quality representations. FPS\nuses 3D Gaussian filtering for optimization stability, and leverages\nreconstruction techniques to generate plausible textures. Second, DreamScene\nemploys a progressive three-stage camera sampling strategy, specifically\ndesigned for both indoor and outdoor settings, to effectively ensure\nobject-environment integration and scene-wide 3D consistency. Last, DreamScene\nenhances scene editing flexibility by integrating objects and environments,\nenabling targeted adjustments. Extensive experiments validate DreamScene's\nsuperiority over current state-of-the-art techniques, heralding its\nwide-ranging potential for diverse applications. Code and demos will be\nreleased at https://dreamscene-project.github.io .\n","authors":["Haoran Li","Haolin Shi","Wenli Zhang","Wenjun Wu","Yong Liao","Lin Wang","Lik-hang Lee","Pengyuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.03575v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03574v1","updated":"2024-04-04T16:38:49Z","published":"2024-04-04T16:38:49Z","title":"TinyVQA: Compact Multimodal Deep Neural Network for Visual Question\n Answering on Resource-Constrained Devices","summary":" Traditional machine learning models often require powerful hardware, making\nthem unsuitable for deployment on resource-limited devices. Tiny Machine\nLearning (tinyML) has emerged as a promising approach for running machine\nlearning models on these devices, but integrating multiple data modalities into\ntinyML models still remains a challenge due to increased complexity, latency,\nand power consumption. This paper proposes TinyVQA, a novel multimodal deep\nneural network for visual question answering tasks that can be deployed on\nresource-constrained tinyML hardware. TinyVQA leverages a supervised\nattention-based model to learn how to answer questions about images using both\nvision and language modalities. 
Distilled knowledge from the supervised\nattention-based VQA model trains the memory aware compact TinyVQA model and low\nbit-width quantization technique is employed to further compress the model for\ndeployment on tinyML devices. The TinyVQA model was evaluated on the FloodNet\ndataset, which is used for post-disaster damage assessment. The compact model\nachieved an accuracy of 79.5%, demonstrating the effectiveness of TinyVQA for\nreal-world applications. Additionally, the model was deployed on a Crazyflie\n2.0 drone, equipped with an AI deck and GAP8 microprocessor. The TinyVQA model\nachieved low latencies of 56 ms and consumes 693 mW power while deployed on the\ntiny drone, showcasing its suitability for resource-constrained embedded\nsystems.\n","authors":["Hasib-Al Rashid","Argho Sarkar","Aryya Gangopadhyay","Maryam Rahnemoonfar","Tinoosh Mohsenin"],"pdf_url":"https://arxiv.org/pdf/2404.03574v1.pdf","comment":"Accepted as a full paper by the tinyML Research Symposium 2024"},{"id":"http://arxiv.org/abs/2404.03572v1","updated":"2024-04-04T16:37:42Z","published":"2024-04-04T16:37:42Z","title":"Terrain Point Cloud Inpainting via Signal Decomposition","summary":" The rapid development of 3D acquisition technology has made it possible to\nobtain point clouds of real-world terrains. However, due to limitations in\nsensor acquisition technology or specific requirements, point clouds often\ncontain defects such as holes with missing data. Inpainting algorithms are\nwidely used to patch these holes. However, existing traditional inpainting\nalgorithms rely on precise hole boundaries, which limits their ability to\nhandle cases where the boundaries are not well-defined. On the other hand,\nlearning-based completion methods often prioritize reconstructing the entire\npoint cloud instead of solely focusing on hole filling. Based on the fact that\nreal-world terrain exhibits both global smoothness and rich local detail, we\npropose a novel representation for terrain point clouds. This representation\ncan help to repair the holes without clear boundaries. Specifically, it\ndecomposes terrains into low-frequency and high-frequency components, which are\nrepresented by B-spline surfaces and relative height maps respectively. In this\nway, the terrain point cloud inpainting problem is transformed into a B-spline\nsurface fitting and 2D image inpainting problem. By solving the two problems,\nthe highly complex and irregular holes on the terrain point clouds can be\nwell-filled, which not only satisfies the global terrain undulation but also\nexhibits rich geometric details. The experimental results also demonstrate the\neffectiveness of our method.\n","authors":["Yizhou Xie","Xiangning Xie","Yuran Wang","Yanci Zhang","Zejun Lv"],"pdf_url":"https://arxiv.org/pdf/2404.03572v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.14817v3","updated":"2024-04-04T16:27:06Z","published":"2024-02-22T18:59:56Z","title":"Cameras as Rays: Pose Estimation via Ray Diffusion","summary":" Estimating camera poses is a fundamental task for 3D reconstruction and\nremains challenging given sparsely sampled views (<10). In contrast to existing\napproaches that pursue top-down prediction of global parametrizations of camera\nextrinsics, we propose a distributed representation of camera pose that treats\na camera as a bundle of rays. This representation allows for a tight coupling\nwith spatial image features improving pose precision. 
We observe that this\nrepresentation is naturally suited for set-level transformers and develop a\nregression-based approach that maps image patches to corresponding rays. To\ncapture the inherent uncertainties in sparse-view pose inference, we adapt this\napproach to learn a denoising diffusion model which allows us to sample\nplausible modes while improving performance. Our proposed methods, both\nregression- and diffusion-based, demonstrate state-of-the-art performance on\ncamera pose estimation on CO3D while generalizing to unseen object categories\nand in-the-wild captures.\n","authors":["Jason Y. Zhang","Amy Lin","Moneish Kumar","Tzu-Hsuan Yang","Deva Ramanan","Shubham Tulsiani"],"pdf_url":"https://arxiv.org/pdf/2402.14817v3.pdf","comment":"In ICLR 2024 (oral). v2-3: updated references. Project webpage:\n https://jasonyzhang.com/RayDiffusion"},{"id":"http://arxiv.org/abs/2404.03566v1","updated":"2024-04-04T16:24:32Z","published":"2024-04-04T16:24:32Z","title":"PointInfinity: Resolution-Invariant Point Diffusion Models","summary":" We present PointInfinity, an efficient family of point cloud diffusion\nmodels. Our core idea is to use a transformer-based architecture with a\nfixed-size, resolution-invariant latent representation. This enables efficient\ntraining with low-resolution point clouds, while allowing high-resolution point\nclouds to be generated during inference. More importantly, we show that scaling\nthe test-time resolution beyond the training resolution improves the fidelity\nof generated point clouds and surfaces. We analyze this phenomenon and draw a\nlink to classifier-free guidance commonly used in diffusion models,\ndemonstrating that both allow trading off fidelity and variability during\ninference. Experiments on CO3D show that PointInfinity can efficiently generate\nhigh-resolution point clouds (up to 131k points, 31 times more than Point-E)\nwith state-of-the-art quality.\n","authors":["Zixuan Huang","Justin Johnson","Shoubhik Debnath","James M. Rehg","Chao-Yuan Wu"],"pdf_url":"https://arxiv.org/pdf/2404.03566v1.pdf","comment":"Accepted to CVPR 2024, project website at\n https://zixuanh.com/projects/pointinfinity"},{"id":"http://arxiv.org/abs/2403.01598v2","updated":"2024-04-04T16:12:51Z","published":"2024-03-03T19:52:43Z","title":"APISR: Anime Production Inspired Real-World Anime Super-Resolution","summary":" While real-world anime super-resolution (SR) has gained increasing attention\nin the SR community, existing methods still adopt techniques from the\nphotorealistic domain. In this paper, we analyze the anime production workflow\nand rethink how to use characteristics of it for the sake of the real-world\nanime SR. First, we argue that video networks and datasets are not necessary\nfor anime SR due to the repetition use of hand-drawing frames. Instead, we\npropose an anime image collection pipeline by choosing the least compressed and\nthe most informative frames from the video sources. Based on this pipeline, we\nintroduce the Anime Production-oriented Image (API) dataset. In addition, we\nidentify two anime-specific challenges of distorted and faint hand-drawn lines\nand unwanted color artifacts. We address the first issue by introducing a\nprediction-oriented compression module in the image degradation model and a\npseudo-ground truth preparation with enhanced hand-drawn lines. 
In addition, we\nintroduce the balanced twin perceptual loss combining both anime and\nphotorealistic high-level features to mitigate unwanted color artifacts and\nincrease visual clarity. We evaluate our method through extensive experiments\non the public benchmark, showing our method outperforms state-of-the-art anime\ndataset-trained approaches.\n","authors":["Boyang Wang","Fengyu Yang","Xihang Yu","Chao Zhang","Hanbin Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.01598v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03541v1","updated":"2024-04-04T15:49:01Z","published":"2024-04-04T15:49:01Z","title":"Segmentation-Guided Knee Radiograph Generation using Conditional\n Diffusion Models","summary":" Deep learning-based medical image processing algorithms require\nrepresentative data during development. In particular, surgical data might be\ndifficult to obtain, and high-quality public datasets are limited. To overcome\nthis limitation and augment datasets, a widely adopted solution is the\ngeneration of synthetic images. In this work, we employ conditional diffusion\nmodels to generate knee radiographs from contour and bone segmentations.\nRemarkably, two distinct strategies are presented by incorporating the\nsegmentation as a condition into the sampling and training process, namely,\nconditional sampling and conditional training. The results demonstrate that\nboth methods can generate realistic images while adhering to the conditioning\nsegmentation. The conditional training method outperforms the conditional\nsampling method and the conventional U-Net.\n","authors":["Siyuan Mei","Fuxin Fan","Fabian Wagner","Mareike Thies","Mingxuan Gu","Yipeng Sun","Andreas Maier"],"pdf_url":"https://arxiv.org/pdf/2404.03541v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03539v1","updated":"2024-04-04T15:47:30Z","published":"2024-04-04T15:47:30Z","title":"Is CLIP the main roadblock for fine-grained open-world perception?","summary":" Modern applications increasingly demand flexible computer vision models that\nadapt to novel concepts not encountered during training. This necessity is\npivotal in emerging domains like extended reality, robotics, and autonomous\ndriving, which require the ability to respond to open-world stimuli. A key\ningredient is the ability to identify objects based on free-form textual\nqueries defined at inference time - a task known as open-vocabulary object\ndetection. Multimodal backbones like CLIP are the main enabling technology for\ncurrent open-world perception solutions. Despite performing well on generic\nqueries, recent studies highlighted limitations on the fine-grained recognition\ncapabilities in open-vocabulary settings - i.e., for distinguishing subtle\nobject features like color, shape, and material. In this paper, we perform a\ndetailed examination of these open-vocabulary object recognition limitations to\nfind the root cause. We evaluate the performance of CLIP, the most commonly\nused vision-language backbone, against a fine-grained object-matching\nbenchmark, revealing interesting analogies between the limitations of\nopen-vocabulary object detectors and their backbones. Experiments suggest that\nthe lack of fine-grained understanding is caused by the poor separability of\nobject characteristics in the CLIP latent space. 
Therefore, we try to\nunderstand whether fine-grained knowledge is present in CLIP embeddings but not\nexploited at inference time due, for example, to the unsuitability of the\ncosine similarity matching function, which may discard important object\ncharacteristics. Our preliminary experiments show that simple CLIP latent-space\nre-projections help separate fine-grained concepts, paving the way towards the\ndevelopment of backbones inherently able to process fine-grained details. The\ncode for reproducing these experiments is available at\nhttps://github.com/lorebianchi98/FG-CLIP.\n","authors":["Lorenzo Bianchi","Fabio Carrara","Nicola Messina","Fabrizio Falchi"],"pdf_url":"https://arxiv.org/pdf/2404.03539v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03537v1","updated":"2024-04-04T15:45:25Z","published":"2024-04-04T15:45:25Z","title":"If It's Not Enough, Make It So: Reducing Authentic Data Demand in Face\n Recognition through Synthetic Faces","summary":" Recent advances in deep face recognition have spurred a growing demand for\nlarge, diverse, and manually annotated face datasets. Acquiring authentic,\nhigh-quality data for face recognition has proven to be a challenge, primarily\ndue to privacy concerns. Large face datasets are primarily sourced from\nweb-based images, lacking explicit user consent. In this paper, we examine\nwhether and how synthetic face data can be used to train effective face\nrecognition models with reduced reliance on authentic images, thereby\nmitigating data collection concerns. First, we explored the performance gap\namong recent state-of-the-art face recognition models, trained with synthetic\ndata only and authentic (scarce) data only. Then, we deepened our analysis by\ntraining a state-of-the-art backbone with various combinations of synthetic and\nauthentic data, gaining insights into optimizing the limited use of the latter\nfor verification accuracy. Finally, we assessed the effectiveness of data\naugmentation approaches on synthetic and authentic data, with the same goal in\nmind. Our results highlighted the effectiveness of FR trained on combined\ndatasets, particularly when combined with appropriate augmentation techniques.\n","authors":["Andrea Atzori","Fadi Boutros","Naser Damer","Gianni Fenu","Mirko Marras"],"pdf_url":"https://arxiv.org/pdf/2404.03537v1.pdf","comment":"Accepted as a full paper at FG 2024 main track"},{"id":"http://arxiv.org/abs/2404.03531v1","updated":"2024-04-04T15:35:43Z","published":"2024-04-04T15:35:43Z","title":"COMO: Compact Mapping and Odometry","summary":" We present COMO, a real-time monocular mapping and odometry system that\nencodes dense geometry via a compact set of 3D anchor points. Decoding anchor\npoint projections into dense geometry via per-keyframe depth covariance\nfunctions guarantees that depth maps are joined together at visible anchor\npoints. The representation enables joint optimization of camera poses and dense\ngeometry, intrinsic 3D consistency, and efficient second-order inference. To\nmaintain a compact yet expressive map, we introduce a frontend that leverages\nthe covariance function for tracking and initializing potentially visually\nindistinct 3D points across frames. Altogether, we introduce a real-time system\ncapable of estimating accurate poses and consistent geometry.\n","authors":["Eric Dexheimer","Andrew J. 
Davison"],"pdf_url":"https://arxiv.org/pdf/2404.03531v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03527v1","updated":"2024-04-04T15:31:11Z","published":"2024-04-04T15:31:11Z","title":"HAPNet: Toward Superior RGB-Thermal Scene Parsing via Hybrid,\n Asymmetric, and Progressive Heterogeneous Feature Fusion","summary":" Data-fusion networks have shown significant promise for RGB-thermal scene\nparsing. However, the majority of existing studies have relied on symmetric\nduplex encoders for heterogeneous feature extraction and fusion, paying\ninadequate attention to the inherent differences between RGB and thermal\nmodalities. Recent progress in vision foundation models (VFMs) trained through\nself-supervision on vast amounts of unlabeled data has proven their ability to\nextract informative, general-purpose features. However, this potential has yet\nto be fully leveraged in the domain. In this study, we take one step toward\nthis new research area by exploring a feasible strategy to fully exploit VFM\nfeatures for RGB-thermal scene parsing. Specifically, we delve deeper into the\nunique characteristics of RGB and thermal modalities, thereby designing a\nhybrid, asymmetric encoder that incorporates both a VFM and a convolutional\nneural network. This design allows for more effective extraction of\ncomplementary heterogeneous features, which are subsequently fused in a\ndual-path, progressive manner. Moreover, we introduce an auxiliary task to\nfurther enrich the local semantics of the fused features, thereby improving the\noverall performance of RGB-thermal scene parsing. Our proposed HAPNet, equipped\nwith all these components, demonstrates superior performance compared to all\nother state-of-the-art RGB-thermal scene parsing networks, achieving top ranks\nacross three widely used public RGB-thermal scene parsing datasets. We believe\nthis new paradigm has opened up new opportunities for future developments in\ndata-fusion scene parsing approaches.\n","authors":["Jiahang Li","Peng Yun","Qijun Chen","Rui Fan"],"pdf_url":"https://arxiv.org/pdf/2404.03527v1.pdf","comment":"12 pages, 4figures"},{"id":"http://arxiv.org/abs/2404.03518v1","updated":"2024-04-04T15:23:14Z","published":"2024-04-04T15:23:14Z","title":"SDPose: Tokenized Pose Estimation via Circulation-Guide\n Self-Distillation","summary":" Recently, transformer-based methods have achieved state-of-the-art prediction\nquality on human pose estimation(HPE). Nonetheless, most of these\ntop-performing transformer-based models are too computation-consuming and\nstorage-demanding to deploy on edge computing platforms. Those\ntransformer-based models that require fewer resources are prone to\nunder-fitting due to their smaller scale and thus perform notably worse than\ntheir larger counterparts. Given this conundrum, we introduce SDPose, a new\nself-distillation method for improving the performance of small\ntransformer-based models. To mitigate the problem of under-fitting, we design a\ntransformer module named Multi-Cycled Transformer(MCT) based on multiple-cycled\nforwards to more fully exploit the potential of small model parameters.\nFurther, in order to prevent the additional inference compute-consuming brought\nby MCT, we introduce a self-distillation scheme, extracting the knowledge from\nthe MCT module to a naive forward model. 
Specifically, on the MSCOCO validation\ndataset, SDPose-T obtains 69.7% mAP with 4.4M parameters and 1.8 GFLOPs.\nFurthermore, SDPose-S-V2 obtains 73.5% mAP on the MSCOCO validation dataset\nwith 6.2M parameters and 4.7 GFLOPs, achieving a new state-of-the-art among\npredominant tiny neural network methods. Our code is available at\nhttps://github.com/MartyrPenink/SDPose.\n","authors":["Sichen Chen","Yingyi Zhang","Siming Huang","Ran Yi","Ke Fan","Ruixin Zhang","Peixian Chen","Jun Wang","Shouhong Ding","Lizhuang Ma"],"pdf_url":"https://arxiv.org/pdf/2404.03518v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.03507v1","updated":"2024-04-04T15:10:24Z","published":"2024-04-04T15:10:24Z","title":"DQ-DETR: DETR with Dynamic Query for Tiny Object Detection","summary":" Despite previous DETR-like methods having performed successfully in generic\nobject detection, tiny object detection is still a challenging task for them\nsince the positional information of object queries is not customized for\ndetecting tiny objects, whose scale is extraordinarily smaller than general\nobjects. Also, DETR-like methods using a fixed number of queries make them\nunsuitable for aerial datasets, which only contain tiny objects, and the\nnumbers of instances are imbalanced between different images. Thus, we present\na simple yet effective model, named DQ-DETR, which consists of three different\ncomponents: categorical counting module, counting-guided feature enhancement,\nand dynamic query selection to solve the above-mentioned problems. DQ-DETR uses\nthe prediction and density maps from the categorical counting module to\ndynamically adjust the number of object queries and improve the positional\ninformation of queries. Our model DQ-DETR outperforms previous CNN-based and\nDETR-like methods, achieving state-of-the-art mAP 30.2% on the AI-TOD-V2\ndataset, which mostly consists of tiny objects.\n","authors":["Yi-Xin Huang","Hou-I Liu","Hong-Han Shuai","Wen-Huang Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.03507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11963v2","updated":"2024-04-04T15:10:23Z","published":"2023-03-21T15:50:08Z","title":"NEMTO: Neural Environment Matting for Novel View and Relighting\n Synthesis of Transparent Objects","summary":" We propose NEMTO, the first end-to-end neural rendering pipeline to model 3D\ntransparent objects with complex geometry and unknown indices of refraction.\nCommonly used appearance modeling such as the Disney BSDF model cannot\naccurately address this challenging problem due to the complex light paths\nbending through refractions and the strong dependency of surface appearance on\nillumination. With 2D images of the transparent object as input, our method is\ncapable of high-quality novel view and relighting synthesis. We leverage\nimplicit Signed Distance Functions (SDF) to model the object geometry and\npropose a refraction-aware ray bending network to model the effects of light\nrefraction within the object. Our ray bending network is more tolerant to\ngeometric inaccuracies than traditional physically-based methods for rendering\ntransparent objects. 
We provide extensive evaluations on both synthetic and\nreal-world datasets to demonstrate our high-quality synthesis and the\napplicability of our method.\n","authors":["Dongqing Wang","Tong Zhang","Sabine Süsstrunk"],"pdf_url":"https://arxiv.org/pdf/2303.11963v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2312.09228v3","updated":"2024-04-04T15:06:02Z","published":"2023-12-14T18:54:32Z","title":"3DGS-Avatar: Animatable Avatars via Deformable 3D Gaussian Splatting","summary":" We introduce an approach that creates animatable human avatars from monocular\nvideos using 3D Gaussian Splatting (3DGS). Existing methods based on neural\nradiance fields (NeRFs) achieve high-quality novel-view/novel-pose image\nsynthesis but often require days of training, and are extremely slow at\ninference time. Recently, the community has explored fast grid structures for\nefficient training of clothed avatars. Albeit being extremely fast at training,\nthese methods can barely achieve an interactive rendering frame rate with\naround 15 FPS. In this paper, we use 3D Gaussian Splatting and learn a\nnon-rigid deformation network to reconstruct animatable clothed human avatars\nthat can be trained within 30 minutes and rendered at real-time frame rates\n(50+ FPS). Given the explicit nature of our representation, we further\nintroduce as-isometric-as-possible regularizations on both the Gaussian mean\nvectors and the covariance matrices, enhancing the generalization of our model\non highly articulated unseen poses. Experimental results show that our method\nachieves comparable and even better performance compared to state-of-the-art\napproaches on animatable avatar creation from a monocular input, while being\n400x and 250x faster in training and inference, respectively.\n","authors":["Zhiyin Qian","Shaofei Wang","Marko Mihajlovic","Andreas Geiger","Siyu Tang"],"pdf_url":"https://arxiv.org/pdf/2312.09228v3.pdf","comment":"Project page: https://neuralbodies.github.io/3DGS-Avatar"},{"id":"http://arxiv.org/abs/2403.19612v2","updated":"2024-04-04T14:44:23Z","published":"2024-03-28T17:32:01Z","title":"ILPO-NET: Network for the invariant recognition of arbitrary volumetric\n patterns in 3D","summary":" Effective recognition of spatial patterns and learning their hierarchy is\ncrucial in modern spatial data analysis. Volumetric data applications seek\ntechniques ensuring invariance not only to shifts but also to pattern\nrotations. While traditional methods can readily achieve translational\ninvariance, rotational invariance possesses multiple challenges and remains an\nactive area of research. Here, we present ILPO-Net (Invariant to Local Patterns\nOrientation Network), a novel approach that handles arbitrarily shaped patterns\nwith the convolutional operation inherently invariant to local spatial pattern\norientations using the Wigner matrix expansions. Our architecture seamlessly\nintegrates the new convolution operator and, when benchmarked on diverse\nvolumetric datasets such as MedMNIST and CATH, demonstrates superior\nperformance over the baselines with significantly reduced parameter counts - up\nto 1000 times fewer in the case of MedMNIST. Beyond these demonstrations,\nILPO-Net's rotational invariance paves the way for other applications across\nmultiple disciplines. 
Our code is publicly available at\nhttps://gricad-gitlab.univ-grenoble-alpes.fr/GruLab/ILPONet.\n","authors":["Dmitrii Zhemchuzhnikov","Sergei Grudinin"],"pdf_url":"https://arxiv.org/pdf/2403.19612v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01987v2","updated":"2024-04-04T14:40:21Z","published":"2023-12-04T16:04:41Z","title":"Bootstrapping SparseFormers from Vision Foundation Models","summary":" The recently proposed SparseFormer architecture provides an alternative\napproach to visual understanding by utilizing a significantly lower number of\nvisual tokens via adjusting RoIs, greatly reducing computational costs while\nstill achieving promising performance. However, training SparseFormers from\nscratch is still expensive, and scaling up the number of parameters can be\nchallenging. In this paper, we propose to bootstrap SparseFormers from\nViT-based vision foundation models in a simple and efficient way. Since the\nmajority of SparseFormer blocks are the standard transformer ones, we can\ninherit weights from large-scale pre-trained vision transformers and freeze\nthem as much as possible. Therefore, we only need to train the\nSparseFormer-specific lightweight focusing transformer to adjust token RoIs and\nfine-tune a few early pre-trained blocks to align the final token\nrepresentation. In such a way, we can bootstrap SparseFormer architectures from\nvarious large-scale pre-trained models (e.g., IN-21K pre-trained AugRegs or\nCLIPs) using a rather smaller amount of training samples (e.g., IN-1K) and\nwithout labels or captions within just a few hours. As a result, the\nbootstrapped unimodal SparseFormer (from AugReg-ViT-L/16-384) can reach 84.9%\naccuracy on IN-1K with only 49 tokens, and the multimodal SparseFormer from\nCLIPs also demonstrates notable zero-shot performance with highly reduced\ncomputational cost without seeing any caption during the bootstrapping\nprocedure. In addition, CLIP-bootstrapped SparseFormers, which align the output\nspace with language without seeing a word, can serve as efficient vision\nencoders in multimodal large language models. Code and models are available at\nhttps://github.com/showlab/sparseformer\n","authors":["Ziteng Gao","Zhan Tong","Kevin Qinghong Lin","Joya Chen","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2312.01987v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.03482v1","updated":"2024-04-04T14:35:49Z","published":"2024-04-04T14:35:49Z","title":"AdaGlimpse: Active Visual Exploration with Arbitrary Glimpse Position\n and Scale","summary":" Active Visual Exploration (AVE) is a task that involves dynamically selecting\nobservations (glimpses), which is critical to facilitate comprehension and\nnavigation within an environment. While modern AVE methods have demonstrated\nimpressive performance, they are constrained to fixed-scale glimpses from rigid\ngrids. In contrast, existing mobile platforms equipped with optical zoom\ncapabilities can capture glimpses of arbitrary positions and scales. To address\nthis gap between software and hardware capabilities, we introduce AdaGlimpse.\nIt uses Soft Actor-Critic, a reinforcement learning algorithm tailored for\nexploration tasks, to select glimpses of arbitrary position and scale. This\napproach enables our model to rapidly establish a general awareness of the\nenvironment before zooming in for detailed analysis. 
Experimental results\ndemonstrate that AdaGlimpse surpasses previous methods across various visual\ntasks while maintaining greater applicability in realistic AVE scenarios.\n","authors":["Adam Pardyl","Michał Wronka","Maciej Wołczyk","Kamil Adamczewski","Tomasz Trzciński","Bartosz Zieliński"],"pdf_url":"https://arxiv.org/pdf/2404.03482v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03477v1","updated":"2024-04-04T14:28:34Z","published":"2024-04-04T14:28:34Z","title":"Towards Automated Movie Trailer Generation","summary":" Movie trailers are an essential tool for promoting films and attracting\naudiences. However, the process of creating trailers can be time-consuming and\nexpensive. To streamline this process, we propose an automatic trailer\ngeneration framework that generates plausible trailers from a full movie by\nautomating shot selection and composition. Our approach draws inspiration from\nmachine translation techniques and models the movies and trailers as sequences\nof shots, thus formulating the trailer generation problem as a\nsequence-to-sequence task. We introduce Trailer Generation Transformer (TGT), a\ndeep-learning framework utilizing an encoder-decoder architecture. TGT movie\nencoder is tasked with contextualizing each movie shot representation via\nself-attention, while the autoregressive trailer decoder predicts the feature\nrepresentation of the next trailer shot, accounting for the relevance of shots'\ntemporal order in trailers. Our TGT significantly outperforms previous methods\non a comprehensive suite of metrics.\n","authors":["Dawit Mureja Argaw","Mattia Soldan","Alejandro Pardo","Chen Zhao","Fabian Caba Heilbron","Joon Son Chung","Bernard Ghanem"],"pdf_url":"https://arxiv.org/pdf/2404.03477v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.03474v1","updated":"2024-04-04T14:26:58Z","published":"2024-04-04T14:26:58Z","title":"Performance of computer vision algorithms for fine-grained\n classification using crowdsourced insect images","summary":" With fine-grained classification, we identify unique characteristics to\ndistinguish among classes of the same super-class. We are focusing on species\nrecognition in Insecta, as they are critical for biodiversity monitoring and at\nthe base of many ecosystems. With citizen science campaigns, billions of images\nare collected in the wild. Once these are labelled, experts can use them to\ncreate distribution maps. However, the labelling process is time-consuming,\nwhich is where computer vision comes in. The field of computer vision offers a\nwide range of algorithms, each with its strengths and weaknesses; how do we\nidentify the algorithm that is in line with our application? To answer this\nquestion, we provide a full and detailed evaluation of nine algorithms among\ndeep convolutional networks (CNN), vision transformers (ViT), and\nlocality-based vision transformers (LBVT) on 4 different aspects:\nclassification performance, embedding quality, computational cost, and gradient\nactivity. We offer insights that we haven't yet had in this domain proving to\nwhich extent these algorithms solve the fine-grained tasks in Insecta. We found\nthat the ViT performs the best on inference speed and computational cost while\nthe LBVT outperforms the others on performance and embedding quality; the CNN\nprovide a trade-off among the metrics.\n","authors":["Rita Pucci","Vincent J. 
Kalkman","Dan Stowell"],"pdf_url":"https://arxiv.org/pdf/2404.03474v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03462v1","updated":"2024-04-04T14:13:56Z","published":"2024-04-04T14:13:56Z","title":"You Only Scan Once: A Dynamic Scene Reconstruction Pipeline for 6-DoF\n Robotic Grasping of Novel Objects","summary":" In the realm of robotic grasping, achieving accurate and reliable\ninteractions with the environment is a pivotal challenge. Traditional methods\nof grasp planning methods utilizing partial point clouds derived from depth\nimage often suffer from reduced scene understanding due to occlusion,\nultimately impeding their grasping accuracy. Furthermore, scene reconstruction\nmethods have primarily relied upon static techniques, which are susceptible to\nenvironment change during manipulation process limits their efficacy in\nreal-time grasping tasks. To address these limitations, this paper introduces a\nnovel two-stage pipeline for dynamic scene reconstruction. In the first stage,\nour approach takes scene scanning as input to register each target object with\nmesh reconstruction and novel object pose tracking. In the second stage, pose\ntracking is still performed to provide object poses in real-time, enabling our\napproach to transform the reconstructed object point clouds back into the\nscene. Unlike conventional methodologies, which rely on static scene snapshots,\nour method continuously captures the evolving scene geometry, resulting in a\ncomprehensive and up-to-date point cloud representation. By circumventing the\nconstraints posed by occlusion, our method enhances the overall grasp planning\nprocess and empowers state-of-the-art 6-DoF robotic grasping algorithms to\nexhibit markedly improved accuracy.\n","authors":["Lei Zhou","Haozhe Wang","Zhengshen Zhang","Zhiyang Liu","Francis EH Tay","adn Marcelo H. Ang. Jr"],"pdf_url":"https://arxiv.org/pdf/2404.03462v1.pdf","comment":"ICRA 2024"},{"id":"http://arxiv.org/abs/2404.03451v1","updated":"2024-04-04T13:55:06Z","published":"2024-04-04T13:55:06Z","title":"How Much Data are Enough? Investigating Dataset Requirements for\n Patch-Based Brain MRI Segmentation Tasks","summary":" Training deep neural networks reliably requires access to large-scale\ndatasets. However, obtaining such datasets can be challenging, especially in\nthe context of neuroimaging analysis tasks, where the cost associated with\nimage acquisition and annotation can be prohibitive. To mitigate both the time\nand financial costs associated with model development, a clear understanding of\nthe amount of data required to train a satisfactory model is crucial. This\npaper focuses on an early stage phase of deep learning research, prior to model\ndevelopment, and proposes a strategic framework for estimating the amount of\nannotated data required to train patch-based segmentation networks. This\nframework includes the establishment of performance expectations using a novel\nMinor Boundary Adjustment for Threshold (MinBAT) method, and standardizing\npatch selection through the ROI-based Expanded Patch Selection (REPS) method.\nOur experiments demonstrate that tasks involving regions of interest (ROIs)\nwith different sizes or shapes may yield variably acceptable Dice Similarity\nCoefficient (DSC) scores. By setting an acceptable DSC as the target, the\nrequired amount of training data can be estimated and even predicted as data\naccumulates. 
This approach could assist researchers and engineers in estimating\nthe cost associated with data collection and annotation when defining a new\nsegmentation task based on deep neural networks, ultimately contributing to\ntheir efficient translation to real-world applications.\n","authors":["Dongang Wang","Peilin Liu","Hengrui Wang","Heidi Beadnall","Kain Kyle","Linda Ly","Mariano Cabezas","Geng Zhan","Ryan Sullivan","Weidong Cai","Wanli Ouyang","Fernando Calamante","Michael Barnett","Chenyu Wang"],"pdf_url":"https://arxiv.org/pdf/2404.03451v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.13785v2","updated":"2024-04-04T13:52:17Z","published":"2024-01-24T20:06:59Z","title":"Unified Spatio-Temporal Tri-Perspective View Representation for 3D\n Semantic Occupancy Prediction","summary":" Holistic understanding and reasoning in 3D scenes play a vital role in the\nsuccess of autonomous driving systems. The evolution of 3D semantic occupancy\nprediction as a pretraining task for autonomous driving and robotic downstream\ntasks capture finer 3D details compared to methods like 3D detection. Existing\napproaches predominantly focus on spatial cues such as tri-perspective view\nembeddings (TPV), often overlooking temporal cues. This study introduces a\nspatiotemporal transformer architecture S2TPVFormer for temporally coherent 3D\nsemantic occupancy prediction. We enrich the prior process by including\ntemporal cues using a novel temporal cross-view hybrid attention mechanism\n(TCVHA) and generate spatiotemporal TPV embeddings (i.e. S2TPV embeddings).\nExperimental evaluations on the nuScenes dataset demonstrate a substantial 4.1%\nimprovement in mean Intersection over Union (mIoU) for 3D Semantic Occupancy\ncompared to TPVFormer, confirming the effectiveness of the proposed S2TPVFormer\nin enhancing 3D scene perception.\n","authors":["Sathira Silva","Savindu Bhashitha Wannigama","Gihan Jayatilaka","Muhammad Haris Khan","Roshan Ragel"],"pdf_url":"https://arxiv.org/pdf/2401.13785v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03446v1","updated":"2024-04-04T13:46:52Z","published":"2024-04-04T13:46:52Z","title":"SP$^2$OT: Semantic-Regularized Progressive Partial Optimal Transport for\n Imbalanced Clustering","summary":" Deep clustering, which learns representation and semantic clustering without\nlabels information, poses a great challenge for deep learning-based approaches.\nDespite significant progress in recent years, most existing methods focus on\nuniformly distributed datasets, significantly limiting the practical\napplicability of their methods. In this paper, we propose a more practical\nproblem setting named deep imbalanced clustering, where the underlying classes\nexhibit an imbalance distribution. To address this challenge, we introduce a\nnovel optimal transport-based pseudo-label learning framework. Our framework\nformulates pseudo-label generation as a Semantic-regularized Progressive\nPartial Optimal Transport (SP$^2$OT) problem, which progressively transports\neach sample to imbalanced clusters under several prior distribution and\nsemantic relation constraints, thus generating high-quality and imbalance-aware\npseudo-labels. To solve SP$^2$OT, we develop a Majorization-Minimization-based\noptimization algorithm. 
To be more precise, we employ the strategy of\nmajorization to reformulate the SP$^2$OT problem into a Progressive Partial\nOptimal Transport problem, which can be transformed into an unbalanced optimal\ntransport problem with augmented constraints and can be solved efficiently by a\nfast matrix scaling algorithm. Experiments on various datasets, including a\nhuman-curated long-tailed CIFAR100, challenging ImageNet-R, and large-scale\nsubsets of fine-grained iNaturalist2018 datasets, demonstrate the superiority\nof our method.\n","authors":["Chuyu Zhang","Hui Ren","Xuming He"],"pdf_url":"https://arxiv.org/pdf/2404.03446v1.pdf","comment":"under review. arXiv admin note: substantial text overlap with\n arXiv:2401.09266"},{"id":"http://arxiv.org/abs/2404.03443v1","updated":"2024-04-04T13:43:11Z","published":"2024-04-04T13:43:11Z","title":"Part-Attention Based Model Make Occluded Person Re-Identification\n Stronger","summary":" The goal of occluded person re-identification (ReID) is to retrieve specific\npedestrians in occluded situations. However, occluded person ReID still suffers\nfrom background clutter and low-quality local feature representations, which\nlimits model performance. In our research, we introduce a new framework called\nPAB-ReID, which is a novel ReID model incorporating part-attention mechanisms\nto tackle the aforementioned issues effectively. Firstly, we introduce the\nhuman parsing label to guide the generation of more accurate human part\nattention maps. In addition, we propose a fine-grained feature focuser for\ngenerating fine-grained human local feature representations while suppressing\nbackground interference. Moreover, We also design a part triplet loss to\nsupervise the learning of human local features, which optimizes\nintra/inter-class distance. We conducted extensive experiments on specialized\nocclusion and regular ReID datasets, showcasing that our approach outperforms\nthe existing state-of-the-art methods.\n","authors":["Zhihao Chen","Yiyuan Ge"],"pdf_url":"https://arxiv.org/pdf/2404.03443v1.pdf","comment":"Accepted By International Joint Conference on Neural Networks"},{"id":"http://arxiv.org/abs/2312.12080v2","updated":"2024-04-04T13:36:21Z","published":"2023-12-19T11:57:54Z","title":"Learning Subject-Aware Cropping by Outpainting Professional Photos","summary":" How to frame (or crop) a photo often depends on the image subject and its\ncontext; e.g., a human portrait. Recent works have defined the subject-aware\nimage cropping task as a nuanced and practical version of image cropping. We\npropose a weakly-supervised approach (GenCrop) to learn what makes a\nhigh-quality, subject-aware crop from professional stock images. Unlike\nsupervised prior work, GenCrop requires no new manual annotations beyond the\nexisting stock image collection. The key challenge in learning from this data,\nhowever, is that the images are already cropped and we do not know what regions\nwere removed. Our insight is to combine a library of stock images with a\nmodern, pre-trained text-to-image diffusion model. The stock image collection\nprovides diversity and its images serve as pseudo-labels for a good crop, while\nthe text-image diffusion model is used to out-paint (i.e., outward inpainting)\nrealistic uncropped images. Using this procedure, we are able to automatically\ngenerate a large dataset of cropped-uncropped training pairs to train a\ncropping model. 
Despite being weakly-supervised, GenCrop is competitive with\nstate-of-the-art supervised methods and significantly better than comparable\nweakly-supervised baselines on quantitative and qualitative evaluation metrics.\n","authors":["James Hong","Lu Yuan","Michaël Gharbi","Matthew Fisher","Kayvon Fatahalian"],"pdf_url":"https://arxiv.org/pdf/2312.12080v2.pdf","comment":"AAAI 24. Extended version with supplemental materials"},{"id":"http://arxiv.org/abs/2404.02656v2","updated":"2024-04-04T13:30:59Z","published":"2024-04-03T11:37:03Z","title":"Non-negative Subspace Feature Representation for Few-shot Learning in\n Medical Imaging","summary":" Unlike typical visual scene recognition domains, in which massive datasets\nare accessible to deep neural networks, medical image interpretations are often\nobstructed by the paucity of data. In this paper, we investigate the\neffectiveness of data-based few-shot learning in medical imaging by exploring\ndifferent data attribute representations in a low-dimensional space. We\nintroduce different types of non-negative matrix factorization (NMF) in\nfew-shot learning, addressing the data scarcity issue in medical image\nclassification. Extensive empirical studies are conducted in terms of\nvalidating the effectiveness of NMF, especially its supervised variants (e.g.,\ndiscriminative NMF, and supervised and constrained NMF with sparseness), and\nthe comparison with principal component analysis (PCA), i.e., the collaborative\nrepresentation-based dimensionality reduction technique derived from\neigenvectors. With 14 different datasets covering 11 distinct illness\ncategories, thorough experimental results and comparison with related\ntechniques demonstrate that NMF is a competitive alternative to PCA for\nfew-shot learning in medical imaging, and the supervised NMF algorithms are\nmore discriminative in the subspace with greater effectiveness. Furthermore, we\nshow that the part-based representation of NMF, especially its supervised\nvariants, is dramatically impactful in detecting lesion areas in medical\nimaging with limited samples.\n","authors":["Keqiang Fan","Xiaohao Cai","Mahesan Niranjan"],"pdf_url":"https://arxiv.org/pdf/2404.02656v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.14162v3","updated":"2024-04-04T13:29:25Z","published":"2023-09-25T14:13:26Z","title":"Data Upcycling Knowledge Distillation for Image Super-Resolution","summary":" Knowledge distillation (KD) compresses deep neural networks by transferring\ntask-related knowledge from cumbersome pre-trained teacher models to compact\nstudent models. However, current KD methods for super-resolution (SR) networks\noverlook the nature of SR task that the outputs of the teacher model are noisy\napproximations to the ground-truth distribution of high-quality images (GT),\nwhich shades the teacher model's knowledge to result in limited KD effects. To\nutilize the teacher model beyond the GT upper-bound, we present the Data\nUpcycling Knowledge Distillation (DUKD), to transfer the teacher model's\nknowledge to the student model through the upcycled in-domain data derived from\ntraining data. Besides, we impose label consistency regularization to KD for SR\nby the paired invertible augmentations to improve the student model's\nperformance and robustness. 
Comprehensive experiments demonstrate that the DUKD\nmethod significantly outperforms previous arts on several SR tasks.\n","authors":["Yun Zhang","Wei Li","Simiao Li","Hanting Chen","Zhijun Tu","Wenjia Wang","Bingyi Jing","Shaohui Lin","Jie Hu"],"pdf_url":"https://arxiv.org/pdf/2309.14162v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03425v1","updated":"2024-04-04T13:06:25Z","published":"2024-04-04T13:06:25Z","title":"ChangeMamba: Remote Sensing Change Detection with Spatio-Temporal State\n Space Model","summary":" Convolutional neural networks (CNN) and Transformers have made impressive\nprogress in the field of remote sensing change detection (CD). However, both\narchitectures have their inherent shortcomings. Recently, the Mamba\narchitecture, based on spatial state models, has shown remarkable performance\nin a series of natural language processing tasks, which can effectively\ncompensate for the shortcomings of the above two architectures. In this paper,\nwe explore for the first time the potential of the Mamba architecture for\nremote sensing change detection tasks. We tailor the corresponding frameworks,\ncalled MambaBCD, MambaSCD, and MambaBDA, for binary change detection (BCD),\nsemantic change detection (SCD), and building damage assessment (BDA),\nrespectively. All three frameworks adopt the cutting-edge visual Mamba\narchitecture as the encoder, which allows full learning of global spatial\ncontextual information from the input images. For the change decoder, which is\navailable in all three architectures, we propose three spatio-temporal\nrelationship modeling mechanisms, which can be naturally combined with the\nMamba architecture and fully utilize its attribute to achieve spatio-temporal\ninteraction of multi-temporal features and obtain accurate change information.\nOn five benchmark datasets, our proposed frameworks outperform current CNN- and\nTransformer-based approaches without using any complex strategies or tricks,\nfully demonstrating the potential of the Mamba architecture. Specifically, we\nobtained 83.11%, 88.39% and 94.19% F1 scores on the three BCD datasets SYSU,\nLEVIR-CD+, and WHU-CD; on the SCD dataset SECOND, we obtained 24.04% SeK; and\non the xBD dataset, we obtained 81.41% overall F1 score. The source code will\nbe available in https://github.com/ChenHongruixuan/MambaCD\n","authors":["Hongruixuan Chen","Jian Song","Chengxi Han","Junshi Xia","Naoto Yokoya"],"pdf_url":"https://arxiv.org/pdf/2404.03425v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00778v2","updated":"2024-04-04T13:00:20Z","published":"2023-12-01T18:55:53Z","title":"MorpheuS: Neural Dynamic 360° Surface Reconstruction from Monocular\n RGB-D Video","summary":" Neural rendering has demonstrated remarkable success in dynamic scene\nreconstruction. Thanks to the expressiveness of neural representations, prior\nworks can accurately capture the motion and achieve high-fidelity\nreconstruction of the target object. Despite this, real-world video scenarios\noften feature large unobserved regions where neural representations struggle to\nachieve realistic completion. To tackle this challenge, we introduce MorpheuS,\na framework for dynamic 360{\\deg} surface reconstruction from a casually\ncaptured RGB-D video. Our approach models the target scene as a canonical field\nthat encodes its geometry and appearance, in conjunction with a deformation\nfield that warps points from the current frame to the canonical space. 
We\nleverage a view-dependent diffusion prior and distill knowledge from it to\nachieve realistic completion of unobserved regions. Experimental results on\nvarious real-world and synthetic datasets show that our method can achieve\nhigh-fidelity 360{\\deg} surface reconstruction of a deformable object from a\nmonocular RGB-D video.\n","authors":["Hengyi Wang","Jingwen Wang","Lourdes Agapito"],"pdf_url":"https://arxiv.org/pdf/2312.00778v2.pdf","comment":"CVPR2024. Project page:\n https://hengyiwang.github.io/projects/morpheus"},{"id":"http://arxiv.org/abs/2404.03421v1","updated":"2024-04-04T12:58:46Z","published":"2024-04-04T12:58:46Z","title":"Generalizable 3D Scene Reconstruction via Divide and Conquer from a\n Single View","summary":" Single-view 3D reconstruction is currently approached from two dominant\nperspectives: reconstruction of scenes with limited diversity using 3D data\nsupervision or reconstruction of diverse singular objects using large image\npriors. However, real-world scenarios are far more complex and exceed the\ncapabilities of these methods. We therefore propose a hybrid method following a\ndivide-and-conquer strategy. We first process the scene holistically,\nextracting depth and semantic information, and then leverage a single-shot\nobject-level method for the detailed reconstruction of individual components.\nBy following a compositional processing approach, the overall framework\nachieves full reconstruction of complex 3D scenes from a single image. We\npurposely design our pipeline to be highly modular by carefully integrating\nspecific procedures for each processing step, without requiring an end-to-end\ntraining of the whole system. This enables the pipeline to naturally improve as\nfuture methods can replace the individual modules. We demonstrate the\nreconstruction performance of our approach on both synthetic and real-world\nscenes, comparing favorable against prior works. Project page:\nhttps://andreeadogaru.github.io/Gen3DSR.\n","authors":["Andreea Dogaru","Mert Özer","Bernhard Egger"],"pdf_url":"https://arxiv.org/pdf/2404.03421v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03417v1","updated":"2024-04-04T12:50:51Z","published":"2024-04-04T12:50:51Z","title":"NMF-Based Analysis of Mobile Eye-Tracking Data","summary":" The depiction of scanpaths from mobile eye-tracking recordings by thumbnails\nfrom the stimulus allows the application of visual computing to detect areas of\ninterest in an unsupervised way. We suggest using nonnegative matrix\nfactorization (NMF) to identify such areas in stimuli. For a user-defined\ninteger k, NMF produces an explainable decomposition into k components, each\nconsisting of a spatial representation associated with a temporal indicator. In\nthe context of multiple eye-tracking recordings, this leads to k spatial\nrepresentations, where the temporal indicator highlights the appearance within\nrecordings. The choice of k provides an opportunity to control the refinement\nof the decomposition, i.e., the number of areas to detect. We combine our\nNMF-based approach with visualization techniques to enable an exploratory\nanalysis of multiple recordings. 
Finally, we demonstrate the usefulness of our\napproach with mobile eye-tracking data of an art gallery.\n","authors":["Daniel Klötzl","Tim Krake","Frank Heyen","Michael Becher","Maurice Koch","Daniel Weiskopf","Kuno Kurzhals"],"pdf_url":"https://arxiv.org/pdf/2404.03417v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03415v1","updated":"2024-04-04T12:49:42Z","published":"2024-04-04T12:49:42Z","title":"Future Predictive Success-or-Failure Classification for Long-Horizon\n Robotic Tasks","summary":" Automating long-horizon tasks with a robotic arm has been a central research\ntopic in robotics. Optimization-based action planning is an efficient approach\nfor creating an action plan to complete a given task. Construction of a\nreliable planning method requires a design process of conditions, e.g., to\navoid collision between objects. The design process, however, has two critical\nissues: 1) iterative trials--the design process is time-consuming due to the\ntrial-and-error process of modifying conditions, and 2) manual redesign--it is\ndifficult to cover all the necessary conditions manually. To tackle these\nissues, this paper proposes a future-predictive\nsuccess-or-failure-classification method to obtain conditions automatically.\nThe key idea behind the proposed method is an end-to-end approach for\ndetermining whether the action plan can complete a given task instead of\nmanually redesigning the conditions. The proposed method uses a long-horizon\nfuture-prediction method to enable success-or-failure classification without\nthe execution of an action plan. This paper also proposes a regularization term\ncalled transition consistency regularization to provide easy-to-predict feature\ndistribution. The regularization term improves future prediction and\nclassification performance. The effectiveness of our method is demonstrated\nthrough classification and robotic-manipulation experiments.\n","authors":["Naoya Sogi","Hiroyuki Oyama","Takashi Shibata","Makoto Terao"],"pdf_url":"https://arxiv.org/pdf/2404.03415v1.pdf","comment":"IJCNN 2024"},{"id":"http://arxiv.org/abs/2404.03413v1","updated":"2024-04-04T12:46:01Z","published":"2024-04-04T12:46:01Z","title":"MiniGPT4-Video: Advancing Multimodal LLMs for Video Understanding with\n Interleaved Visual-Textual Tokens","summary":" This paper introduces MiniGPT4-Video, a multimodal Large Language Model (LLM)\ndesigned specifically for video understanding. The model is capable of\nprocessing both temporal visual and textual data, making it adept at\nunderstanding the complexities of videos. Building upon the success of\nMiniGPT-v2, which excelled in translating visual features into the LLM space\nfor single images and achieved impressive results on various image-text\nbenchmarks, this paper extends the model's capabilities to process a sequence\nof frames, enabling it to comprehend videos. MiniGPT4-video does not only\nconsider visual content but also incorporates textual conversations, allowing\nthe model to effectively answer queries involving both visual and text\ncomponents. The proposed model outperforms existing state-of-the-art methods,\nregistering gains of 4.22%, 1.13%, 20.82%, and 13.1% on the MSVD, MSRVTT, TGIF,\nand TVQA benchmarks respectively. 
Our models and code have been made publicly\navailable here https://vision-cair.github.io/MiniGPT4-video/\n","authors":["Kirolos Ataallah","Xiaoqian Shen","Eslam Abdelrahman","Essam Sleiman","Deyao Zhu","Jian Ding","Mohamed Elhoseiny"],"pdf_url":"https://arxiv.org/pdf/2404.03413v1.pdf","comment":"6 pages,8 figures"},{"id":"http://arxiv.org/abs/2403.16612v2","updated":"2024-04-04T12:35:33Z","published":"2024-03-25T10:42:48Z","title":"Calibrating Bayesian UNet++ for Sub-Seasonal Forecasting","summary":" Seasonal forecasting is a crucial task when it comes to detecting the extreme\nheat and colds that occur due to climate change. Confidence in the predictions\nshould be reliable since a small increase in the temperatures in a year has a\nbig impact on the world. Calibration of the neural networks provides a way to\nensure our confidence in the predictions. However, calibrating regression\nmodels is an under-researched topic, especially in forecasters. We calibrate a\nUNet++ based architecture, which was shown to outperform physics-based models\nin temperature anomalies. We show that with a slight trade-off between\nprediction error and calibration error, it is possible to get more reliable and\nsharper forecasts. We believe that calibration should be an important part of\nsafety-critical machine learning applications such as weather forecasters.\n","authors":["Busra Asan","Abdullah Akgül","Alper Unal","Melih Kandemir","Gozde Unal"],"pdf_url":"https://arxiv.org/pdf/2403.16612v2.pdf","comment":"Accepted as a workshop paper at \"ICLR 2024 Tackling Climate Change\n with Machine Learning\""},{"id":"http://arxiv.org/abs/2404.03407v1","updated":"2024-04-04T12:12:24Z","published":"2024-04-04T12:12:24Z","title":"AIGIQA-20K: A Large Database for AI-Generated Image Quality Assessment","summary":" With the rapid advancements in AI-Generated Content (AIGC), AI-Generated\nImages (AIGIs) have been widely applied in entertainment, education, and social\nmedia. However, due to the significant variance in quality among different\nAIGIs, there is an urgent need for models that consistently match human\nsubjective ratings. To address this issue, we organized a challenge towards\nAIGC quality assessment on NTIRE 2024 that extensively considers 15 popular\ngenerative models, utilizing dynamic hyper-parameters (including\nclassifier-free guidance, iteration epochs, and output image resolution), and\ngather subjective scores that consider perceptual quality and text-to-image\nalignment altogether comprehensively involving 21 subjects. This approach\nculminates in the creation of the largest fine-grained AIGI subjective quality\ndatabase to date with 20,000 AIGIs and 420,000 subjective ratings, known as\nAIGIQA-20K. Furthermore, we conduct benchmark experiments on this database to\nassess the correspondence between 16 mainstream AIGI quality models and human\nperception. We anticipate that this large-scale quality database will inspire\nrobust quality indicators for AIGIs and propel the evolution of AIGC for\nvision. 
The database is released on\nhttps://www.modelscope.cn/datasets/lcysyzxdxc/AIGCQA-30K-Image.\n","authors":["Chunyi Li","Tengchuan Kou","Yixuan Gao","Yuqin Cao","Wei Sun","Zicheng Zhang","Yingjie Zhou","Zhichao Zhang","Weixia Zhang","Haoning Wu","Xiaohong Liu","Xiongkuo Min","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2404.03407v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03398v1","updated":"2024-04-04T11:59:06Z","published":"2024-04-04T11:59:06Z","title":"Scaling Up Video Summarization Pretraining with Large Language Models","summary":" Long-form video content constitutes a significant portion of internet\ntraffic, making automated video summarization an essential research problem.\nHowever, existing video summarization datasets are notably limited in their\nsize, constraining the effectiveness of state-of-the-art methods for\ngeneralization. Our work aims to overcome this limitation by capitalizing on\nthe abundance of long-form videos with dense speech-to-video alignment and the\nremarkable capabilities of recent large language models (LLMs) in summarizing\nlong text. We introduce an automated and scalable pipeline for generating a\nlarge-scale video summarization dataset using LLMs as Oracle summarizers. By\nleveraging the generated dataset, we analyze the limitations of existing\napproaches and propose a new video summarization model that effectively\naddresses them. To facilitate further research in the field, our work also\npresents a new benchmark dataset that contains 1200 long videos each with\nhigh-quality summaries annotated by professionals. Extensive experiments\nclearly indicate that our proposed approach sets a new state-of-the-art in\nvideo summarization across several benchmarks.\n","authors":["Dawit Mureja Argaw","Seunghyun Yoon","Fabian Caba Heilbron","Hanieh Deilamsalehy","Trung Bui","Zhaowen Wang","Franck Dernoncourt","Joon Son Chung"],"pdf_url":"https://arxiv.org/pdf/2404.03398v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.03394v1","updated":"2024-04-04T11:53:37Z","published":"2024-04-04T11:53:37Z","title":"Background Noise Reduction of Attention Map for Weakly Supervised\n Semantic Segmentation","summary":" In weakly-supervised semantic segmentation (WSSS) using only image-level\nclass labels, a problem with CNN-based Class Activation Maps (CAM) is that they\ntend to activate the most discriminative local regions of objects. On the other\nhand, methods based on Transformers learn global features but suffer from the\nissue of background noise contamination. This paper focuses on addressing the\nissue of background noise in attention weights within the existing WSSS method\nbased on Conformer, known as TransCAM. The proposed method successfully reduces\nbackground noise, leading to improved accuracy of pseudo labels. Experimental\nresults demonstrate that our model achieves segmentation performance of 70.5%\non the PASCAL VOC 2012 validation data, 71.1% on the test data, and 45.9% on MS\nCOCO 2014 data, outperforming TransCAM in terms of segmentation performance.\n","authors":["Izumi Fujimori","Masaki Oono","Masami Shishibori"],"pdf_url":"https://arxiv.org/pdf/2404.03394v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03392v1","updated":"2024-04-04T11:49:56Z","published":"2024-04-04T11:49:56Z","title":"Two Tricks to Improve Unsupervised Segmentation Learning","summary":" We present two practical improvement techniques for unsupervised segmentation\nlearning. 
These techniques address limitations in the resolution and accuracy\nof predicted segmentation maps of recent state-of-the-art methods. Firstly, we\nleverage image post-processing techniques such as guided filtering to refine\nthe output masks, improving accuracy while avoiding substantial computational\ncosts. Secondly, we introduce a multi-scale consistency criterion, based on a\nteacher-student training scheme. This criterion matches segmentation masks\npredicted from regions of the input image extracted at different resolutions to\neach other. Experimental results on several benchmarks used in unsupervised\nsegmentation learning demonstrate the effectiveness of our proposed techniques.\n","authors":["Alp Eren Sari","Francesco Locatello","Paolo Favar"],"pdf_url":"https://arxiv.org/pdf/2404.03392v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03384v1","updated":"2024-04-04T11:33:29Z","published":"2024-04-04T11:33:29Z","title":"LongVLM: Efficient Long Video Understanding via Large Language Models","summary":" Empowered by Large Language Models (LLMs), recent advancements in VideoLLMs\nhave driven progress in various video understanding tasks. These models encode\nvideo representations through pooling or query aggregation over a vast number\nof visual tokens, making computational and memory costs affordable. Despite\nsuccessfully providing an overall comprehension of video content, existing\nVideoLLMs still face challenges in achieving detailed understanding in videos\ndue to overlooking local information in long-term videos. To tackle this\nchallenge, we introduce LongVLM, a straightforward yet powerful VideoLLM for\nlong video understanding, building upon the observation that long videos often\nconsist of sequential key events, complex actions, and camera movements. Our\napproach proposes to decompose long videos into multiple short-term segments\nand encode local features for each local segment via a hierarchical token\nmerging module. These features are concatenated in temporal order to maintain\nthe storyline across sequential short-term segments. Additionally, we propose\nto integrate global semantics into each local feature to enhance context\nunderstanding. In this way, we encode video representations that incorporate\nboth local and global information, enabling the LLM to generate comprehensive\nresponses for long-term videos. Experimental results on the VideoChatGPT\nbenchmark and zero-shot video question-answering datasets demonstrate the\nsuperior capabilities of our model over the previous state-of-the-art methods.\nQualitative examples demonstrate that our model produces more precise responses\nfor long videos understanding. Code is available at\n\\url{https://github.com/ziplab/LongVLM}.\n","authors":["Yuetian Weng","Mingfei Han","Haoyu He","Xiaojun Chang","Bohan Zhuang"],"pdf_url":"https://arxiv.org/pdf/2404.03384v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03349v1","updated":"2024-04-04T10:30:28Z","published":"2024-04-04T10:30:28Z","title":"VF-NeRF: Viewshed Fields for Rigid NeRF Registration","summary":" 3D scene registration is a fundamental problem in computer vision that seeks\nthe best 6-DoF alignment between two scenes. This problem was extensively\ninvestigated in the case of point clouds and meshes, but there has been\nrelatively limited work regarding Neural Radiance Fields (NeRF). In this paper,\nwe consider the problem of rigid registration between two NeRFs when the\nposition of the original cameras is not given. 
Our key novelty is the\nintroduction of Viewshed Fields (VF), an implicit function that determines, for\neach 3D point, how likely it is to be viewed by the original cameras. We\ndemonstrate how VF can help in the various stages of NeRF registration, with an\nextensive evaluation showing that VF-NeRF achieves SOTA results on various\ndatasets with different capturing approaches such as LLFF and Objaverese.\n","authors":["Leo Segre","Shai Avidan"],"pdf_url":"https://arxiv.org/pdf/2404.03349v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03340v1","updated":"2024-04-04T10:10:38Z","published":"2024-04-04T10:10:38Z","title":"Meta Invariance Defense Towards Generalizable Robustness to Unknown\n Adversarial Attacks","summary":" Despite providing high-performance solutions for computer vision tasks, the\ndeep neural network (DNN) model has been proved to be extremely vulnerable to\nadversarial attacks. Current defense mainly focuses on the known attacks, but\nthe adversarial robustness to the unknown attacks is seriously overlooked.\nBesides, commonly used adaptive learning and fine-tuning technique is\nunsuitable for adversarial defense since it is essentially a zero-shot problem\nwhen deployed. Thus, to tackle this challenge, we propose an attack-agnostic\ndefense method named Meta Invariance Defense (MID). Specifically, various\ncombinations of adversarial attacks are randomly sampled from a manually\nconstructed Attacker Pool to constitute different defense tasks against unknown\nattacks, in which a student encoder is supervised by multi-consistency\ndistillation to learn the attack-invariant features via a meta principle. The\nproposed MID has two merits: 1) Full distillation from pixel-, feature- and\nprediction-level between benign and adversarial samples facilitates the\ndiscovery of attack-invariance. 2) The model simultaneously achieves robustness\nto the imperceptible adversarial perturbations in high-level image\nclassification and attack-suppression in low-level robust image regeneration.\nTheoretical and empirical studies on numerous benchmarks such as ImageNet\nverify the generalizable robustness and superiority of MID under various\nattacks.\n","authors":["Lei Zhang","Yuhang Zhou","Yi Yang","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2404.03340v1.pdf","comment":"Accepted by IEEE TPAMI in 2024"},{"id":"http://arxiv.org/abs/2404.03327v1","updated":"2024-04-04T09:53:00Z","published":"2024-04-04T09:53:00Z","title":"DI-Retinex: Digital-Imaging Retinex Theory for Low-Light Image\n Enhancement","summary":" Many existing methods for low-light image enhancement (LLIE) based on Retinex\ntheory ignore important factors that affect the validity of this theory in\ndigital imaging, such as noise, quantization error, non-linearity, and dynamic\nrange overflow. In this paper, we propose a new expression called\nDigital-Imaging Retinex theory (DI-Retinex) through theoretical and\nexperimental analysis of Retinex theory in digital imaging. Our new expression\nincludes an offset term in the enhancement model, which allows for pixel-wise\nbrightness contrast adjustment with a non-linear mapping function. In addition,\nto solve the lowlight enhancement problem in an unsupervised manner, we propose\nan image-adaptive masked reverse degradation loss in Gamma space. 
We also\ndesign a variance suppression loss for regulating the additional offset term.\nExtensive experiments show that our proposed method outperforms all existing\nunsupervised methods in terms of visual quality, model size, and speed. Our\nalgorithm can also assist downstream face detectors in low-light, as it shows\nthe most performance gain after the low-light enhancement compared to other\nmethods.\n","authors":["Shangquan Sun","Wenqi Ren","Jingyang Peng","Fenglong Song","Xiaochun Cao"],"pdf_url":"https://arxiv.org/pdf/2404.03327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01064v2","updated":"2024-04-04T09:48:30Z","published":"2024-04-01T11:57:34Z","title":"Roadside Monocular 3D Detection via 2D Detection Prompting","summary":" The problem of roadside monocular 3D detection requires detecting objects of\nclasses of interest in a 2D RGB frame and predicting their 3D information such\nas locations in bird's-eye-view (BEV). It has broad applications in traffic\ncontrol, vehicle-vehicle communication, and vehicle-infrastructure cooperative\nperception. To approach this problem, we present a novel and simple method by\nprompting the 3D detector using 2D detections. Our method builds on a key\ninsight that, compared with 3D detectors, a 2D detector is much easier to train\nand performs significantly better w.r.t. detections on the 2D image plane. That\nsaid, one can exploit 2D detections of a well-trained 2D detector as prompts to\na 3D detector, which is trained to inflate such 2D detections to 3D\ntowards 3D detection. To construct better prompts using the 2D detector, we\nexplore three techniques: (a) concatenating both 2D and 3D detectors' features,\n(b) attentively fusing 2D and 3D detectors' features, and (c) encoding\npredicted 2D boxes (x, y, width, height, label) and attentively fusing such with\nthe 3D detector's features. Surprisingly, the third performs the best.\nMoreover, we present a yaw tuning tactic and a class-grouping strategy that\nmerges classes based on their functionality; these techniques improve 3D\ndetection performance further. Comprehensive ablation studies and extensive\nexperiments demonstrate that our method resoundingly outperforms prior works,\nachieving the state-of-the-art on two large-scale roadside 3D detection\nbenchmarks.\n","authors":["Yechi Ma","Shuoquan Wei","Churun Zhang","Wei Hua","Yanan Li","Shu Kong"],"pdf_url":"https://arxiv.org/pdf/2404.01064v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03323v1","updated":"2024-04-04T09:43:43Z","published":"2024-04-04T09:43:43Z","title":"Sparse Concept Bottleneck Models: Gumbel Tricks in Contrastive Learning","summary":" We propose a novel architecture and method of explainable classification with\nConcept Bottleneck Models (CBMs). While SOTA approaches to the Image Classification\ntask work as black boxes, there is a growing demand for models that would\nprovide interpretable results. Such models often learn to predict the\ndistribution over class labels using additional descriptions of the target\ninstances, called concepts. However, existing Bottleneck methods have a number\nof limitations: their accuracy is lower than that of a standard model and CBMs\nrequire an additional set of concepts to leverage. We provide a framework for\ncreating a Concept Bottleneck Model from a pre-trained multi-modal encoder and new\nCLIP-like architectures. 
By introducing a new type of layer, known as Concept\nBottleneck Layers, we outline three methods for training them: with an\n$\\ell_1$-loss, a contrastive loss, and a loss function based on the Gumbel-Softmax\ndistribution (Sparse-CBM), while the final FC layer is still trained with\nCross-Entropy. We show a significant increase in accuracy using sparse hidden\nlayers in CLIP-based bottleneck models, which means that a sparse representation\nof the concept activation vector is meaningful in Concept Bottleneck Models.\nMoreover, with our Concept Matrix Search algorithm we can improve CLIP\npredictions on complex datasets without any additional training or fine-tuning.\nThe code is available at: https://github.com/Andron00e/SparseCBM.\n","authors":["Andrei Semenov","Vladimir Ivanov","Aleksandr Beznosikov","Alexander Gasnikov"],"pdf_url":"https://arxiv.org/pdf/2404.03323v1.pdf","comment":"23 pages, 1 algorithm, 36 figures"},{"id":"http://arxiv.org/abs/2310.00615v3","updated":"2024-04-04T09:18:50Z","published":"2023-10-01T08:32:46Z","title":"Scene-aware Human Motion Forecasting via Mutual Distance Prediction","summary":" In this paper, we tackle the problem of scene-aware 3D human motion\nforecasting. A key challenge of this task is to predict future human motions\nthat are consistent with the scene by modeling the human-scene interactions.\nWhile recent works have demonstrated that explicit constraints on human-scene\ninteractions can prevent the occurrence of ghost motion, they only provide\nconstraints on partial human motion, e.g., the global motion of the human or a\nfew joints contacting the scene, leaving the rest of the motion unconstrained.\nTo address this limitation, we propose to model the human-scene interaction\nwith the mutual distance between the human body and the scene. Such mutual\ndistances constrain both the local and global human motion, resulting in a\nwhole-body motion constrained prediction. In particular, mutual distance\nconstraints consist of two components, the signed distance of each vertex on\nthe human mesh to the scene surface and the distance of basis scene points to\nthe human mesh. We further introduce a global scene representation learned from\na signed distance function (SDF) volume to ensure coherence between the global\nscene representation and the explicit constraint from the mutual distance. We\ndevelop a pipeline with two sequential steps: predicting the future mutual\ndistances first, followed by forecasting future human motion. During training,\nwe explicitly encourage consistency between predicted poses and mutual\ndistances. Extensive evaluations on the existing synthetic and real datasets\ndemonstrate that our approach consistently outperforms the state-of-the-art\nmethods.\n","authors":["Chaoyue Xing","Wei Mao","Miaomiao Liu"],"pdf_url":"https://arxiv.org/pdf/2310.00615v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19773v2","updated":"2024-04-04T09:05:49Z","published":"2024-03-28T18:50:19Z","title":"ShapeFusion: A 3D diffusion model for localized shape editing","summary":" In the realm of 3D computer vision, parametric models have emerged as a\nground-breaking methodology for the creation of realistic and expressive 3D\navatars. Traditionally, they rely on Principal Component Analysis (PCA), given\nits ability to decompose data to an orthonormal space that maximally captures\nshape variations. 
However, due to the orthogonality constraints and the global\nnature of PCA's decomposition, these models struggle to perform localized and\ndisentangled editing of 3D shapes, which severely affects their use in\napplications requiring fine control such as face sculpting. In this paper, we\nleverage diffusion models to enable diverse and fully localized edits on 3D\nmeshes, while completely preserving the un-edited regions. We propose an\neffective diffusion masking training strategy that, by design, facilitates\nlocalized manipulation of any shape region, without being limited to predefined\nregions or to sparse sets of predefined control vertices. Following our\nframework, a user can explicitly set their manipulation region of choice and\ndefine an arbitrary set of vertices as handles to edit a 3D mesh. Compared to\nthe current state-of-the-art our method leads to more interpretable shape\nmanipulations than methods relying on latent code state, greater localization\nand generation diversity while offering faster inference than optimization\nbased approaches. Project page: https://rolpotamias.github.io/Shapefusion/\n","authors":["Rolandos Alexandros Potamias","Michail Tarasiou","Stylianos Ploumpis","Stefanos Zafeiriou"],"pdf_url":"https://arxiv.org/pdf/2403.19773v2.pdf","comment":"Project Page: https://rolpotamias.github.io/Shapefusion/"},{"id":"http://arxiv.org/abs/2404.02614v2","updated":"2024-04-04T08:57:00Z","published":"2024-04-03T10:01:23Z","title":"Vestibular schwannoma growth prediction from longitudinal MRI by time\n conditioned neural fields","summary":" Vestibular schwannomas (VS) are benign tumors that are generally managed by\nactive surveillance with MRI examination. To further assist clinical\ndecision-making and avoid overtreatment, an accurate prediction of tumor growth\nbased on longitudinal imaging is highly desirable. In this paper, we introduce\nDeepGrowth, a deep learning method that incorporates neural fields and\nrecurrent neural networks for prospective tumor growth prediction. In the\nproposed method, each tumor is represented as a signed distance function (SDF)\nconditioned on a low-dimensional latent code. Unlike previous studies that\nperform tumor shape prediction directly in the image space, we predict the\nlatent codes instead and then reconstruct future shapes from it. To deal with\nirregular time intervals, we introduce a time-conditioned recurrent module\nbased on a ConvLSTM and a novel temporal encoding strategy, which enables the\nproposed model to output varying tumor shapes over time. The experiments on an\nin-house longitudinal VS dataset showed that the proposed model significantly\nimproved the performance ($\\ge 1.6\\%$ Dice score and $\\ge0.20$ mm 95\\%\nHausdorff distance), in particular for top 20\\% tumors that grow or shrink the\nmost ($\\ge 4.6\\%$ Dice score and $\\ge 0.73$ mm 95\\% Hausdorff distance). Our\ncode is available at ~\\burl{https://github.com/cyjdswx/DeepGrowth}\n","authors":["Yunjie Chen","Jelmer M. Wolterink","Olaf M. Neve","Stephan R. Romeijn","Berit M. Verbist","Erik F. 
Hensen","Qian Tao","Marius Staring"],"pdf_url":"https://arxiv.org/pdf/2404.02614v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02282v2","updated":"2024-04-04T08:38:17Z","published":"2024-04-02T20:15:43Z","title":"Smooth Deep Saliency","summary":" In this work, we investigate methods to reduce the noise in deep saliency\nmaps coming from convolutional downsampling, with the purpose of explaining how\na deep learning model detects tumors in scanned histological tissue samples.\nThose methods make the investigated models more interpretable for\ngradient-based saliency maps, computed in hidden layers. We test our approach\non different models trained for image classification on ImageNet1K, and models\ntrained for tumor detection on Camelyon16 and in-house real-world digital\npathology scans of stained tissue samples. Our results show that the\ncheckerboard noise in the gradient gets reduced, resulting in smoother and\ntherefore easier to interpret saliency maps.\n","authors":["Rudolf Herdt","Maximilian Schmidt","Daniel Otero Baguer","Peter Maaß"],"pdf_url":"https://arxiv.org/pdf/2404.02282v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03296v1","updated":"2024-04-04T08:37:27Z","published":"2024-04-04T08:37:27Z","title":"AdaBM: On-the-Fly Adaptive Bit Mapping for Image Super-Resolution","summary":" Although image super-resolution (SR) problem has experienced unprecedented\nrestoration accuracy with deep neural networks, it has yet limited versatile\napplications due to the substantial computational costs. Since different input\nimages for SR face different restoration difficulties, adapting computational\ncosts based on the input image, referred to as adaptive inference, has emerged\nas a promising solution to compress SR networks. Specifically, adapting the\nquantization bit-widths has successfully reduced the inference and memory cost\nwithout sacrificing the accuracy. However, despite the benefits of the\nresultant adaptive network, existing works rely on time-intensive\nquantization-aware training with full access to the original training pairs to\nlearn the appropriate bit allocation policies, which limits its ubiquitous\nusage. To this end, we introduce the first on-the-fly adaptive quantization\nframework that accelerates the processing time from hours to seconds. We\nformulate the bit allocation problem with only two bit mapping modules: one to\nmap the input image to the image-wise bit adaptation factor and one to obtain\nthe layer-wise adaptation factors. These bit mappings are calibrated and\nfine-tuned using only a small number of calibration images. We achieve\ncompetitive performance with the previous adaptive quantization methods, while\nthe processing time is accelerated by x2000. Codes are available at\nhttps://github.com/Cheeun/AdaBM.\n","authors":["Cheeun Hong","Kyoung Mu Lee"],"pdf_url":"https://arxiv.org/pdf/2404.03296v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2211.07459v2","updated":"2024-04-04T08:24:54Z","published":"2022-11-14T15:37:27Z","title":"Self-Aligning Depth-regularized Radiance Fields for Asynchronous RGB-D\n Sequences","summary":" It has been shown that learning radiance fields with depth rendering and\ndepth supervision can effectively promote the quality and convergence of view\nsynthesis. However, this paradigm requires input RGB-D sequences to be\nsynchronized, hindering its usage in the UAV city modeling scenario. 
As there\nexists asynchrony between RGB images and depth images due to high-speed flight,\nwe propose a novel time-pose function, which is an implicit network that maps\ntimestamps to $\\rm SE(3)$ elements. To simplify the training process, we also\ndesign a joint optimization scheme to jointly learn the large-scale\ndepth-regularized radiance fields and the time-pose function. Our algorithm\nconsists of three steps: (1) time-pose function fitting, (2) radiance field\nbootstrapping, (3) joint pose error compensation and radiance field refinement.\nIn addition, we propose a large synthetic dataset with diverse controlled\nmismatches and ground truth to evaluate this new problem setting\nsystematically. Through extensive experiments, we demonstrate that our method\noutperforms baselines without regularization. We also show qualitatively\nimproved results on a real-world asynchronous RGB-D sequence captured by drone.\nCodes, data, and models will be made publicly available.\n","authors":["Yuxin Huang","Andong Yang","Zirui Wu","Yuantao Chen","Runyi Yang","Zhenxin Zhu","Chao Hou","Hao Zhao","Guyue Zhou"],"pdf_url":"https://arxiv.org/pdf/2211.07459v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01692v2","updated":"2024-04-04T08:07:22Z","published":"2024-04-02T06:52:31Z","title":"Beyond Image Super-Resolution for Image Recognition with Task-Driven\n Perceptual Loss","summary":" In real-world scenarios, image recognition tasks, such as semantic\nsegmentation and object detection, often pose greater challenges due to the\nlack of information available within low-resolution (LR) content. Image\nsuper-resolution (SR) is one of the promising solutions for addressing the\nchallenges. However, due to the ill-posed property of SR, it is challenging for\ntypical SR methods to restore task-relevant high-frequency contents, which may\ndilute the advantage of utilizing the SR method. Therefore, in this paper, we\npropose Super-Resolution for Image Recognition (SR4IR) that effectively guides\nthe generation of SR images beneficial to achieving satisfactory image\nrecognition performance when processing LR images. The critical component of\nour SR4IR is the task-driven perceptual (TDP) loss that enables the SR network\nto acquire task-specific knowledge from a network tailored for a specific task.\nMoreover, we propose a cross-quality patch mix and an alternate training\nframework that significantly enhances the efficacy of the TDP loss by\naddressing potential problems when employing the TDP loss. Through extensive\nexperiments, we demonstrate that our SR4IR achieves outstanding task\nperformance by generating SR images useful for a specific image recognition\ntask, including semantic segmentation, object detection, and image\nclassification. The implementation code is available at\nhttps://github.com/JaehaKim97/SR4IR.\n","authors":["Jaeha Kim","Junghun Oh","Kyoung Mu Lee"],"pdf_url":"https://arxiv.org/pdf/2404.01692v2.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2403.17369v2","updated":"2024-04-04T08:05:06Z","published":"2024-03-26T04:09:08Z","title":"CoDA: Instructive Chain-of-Domain Adaptation with Severity-Aware Visual\n Prompt Tuning","summary":" Unsupervised Domain Adaptation (UDA) aims to adapt models from labeled source\ndomains to unlabeled target domains. When adapting to adverse scenes, existing\nUDA methods fail to perform well due to the lack of instructions, leading their\nmodels to overlook discrepancies within all adverse scenes. 
To tackle this, we\npropose CoDA which instructs models to distinguish, focus, and learn from these\ndiscrepancies at scene and image levels. Specifically, CoDA consists of a\nChain-of-Domain (CoD) strategy and a Severity-Aware Visual Prompt Tuning\n(SAVPT) mechanism. CoD focuses on scene-level instructions to divide all\nadverse scenes into easy and hard scenes, guiding models to adapt from source\nto easy domains with easy scene images, and then to hard domains with hard\nscene images, thereby laying a solid foundation for whole adaptations. Building\nupon this foundation, we employ SAVPT to dive into more detailed image-level\ninstructions to boost performance. SAVPT features a novel metric Severity that\ndivides all adverse scene images into low-severity and high-severity images.\nThen Severity directs visual prompts and adapters, instructing models to\nconcentrate on unified severity features instead of scene-specific features,\nwithout adding complexity to the model architecture. CoDA achieves SOTA\nperformances on widely-used benchmarks under all adverse scenes. Notably, CoDA\noutperforms the existing ones by 4.6%, and 10.3% mIoU on the Foggy Driving, and\nFoggy Zurich benchmarks, respectively. Our code is available at\nhttps://github.com/Cuzyoung/CoDA\n","authors":["Ziyang Gong","Fuhao Li","Yupeng Deng","Deblina Bhattacharjee","Xiangwei Zhu","Zhenming Ji"],"pdf_url":"https://arxiv.org/pdf/2403.17369v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03277v1","updated":"2024-04-04T08:04:00Z","published":"2024-04-04T08:04:00Z","title":"Design and Development of a Framework For Stroke-Based Handwritten\n Gujarati Font Generation","summary":" Handwritten font generation is important for preserving cultural heritage and\ncreating personalized designs. It adds an authentic and expressive touch to\nprinted materials, making them visually appealing and establishing a stronger\nconnection with the audience. This paper aims to design a framework for\ngenerating handwritten fonts in the Gujarati script, mimicking the variation of\nhuman handwriting. The proposed font generation model consists of a learning\nphase and a generation phase. In the learning phase, Gujarati scripts are\nanalyzed, and rules for designing each character are formulated. This ruleset\ninvolves the concatenation of strokes in a stroke-based manner, ensuring visual\nconsistency in the resulting glyphs. The generation phase involves the user\nproviding a small subset of characters, and the system automatically generates\nthe remaining character glyphs based on extracted strokes and learned rules,\nresulting in handwritten Gujarati fonts. The resulting character glyphs are\nconverted into an open-type font using the FontForge tool, making them\ncompatible with any Gujarati editor. Both subjective and objective evaluations\nare conducted to assess the synthesized images and fonts. Subjective evaluation\nthrough user studies provides feedback on quality and visual appeal, achieving\nan overall accuracy of 84.84%. Notably, eleven characters demonstrated a\nsuccess ratio above 90%. Objective evaluation using an existing recognition\nsystem achieves an overall accuracy of 84.28% in OCR evaluation. Notably,\nfifteen characters had a success ratio of 80% or higher.\n","authors":["Preeti P. Bhatt","Jitendra V. Nasriwala","Rakesh R. 
Savant"],"pdf_url":"https://arxiv.org/pdf/2404.03277v1.pdf","comment":"13 pages, 2 column, 12 figures"},{"id":"http://arxiv.org/abs/2404.01758v2","updated":"2024-04-04T08:03:04Z","published":"2024-04-02T09:18:52Z","title":"GEARS: Local Geometry-aware Hand-object Interaction Synthesis","summary":" Generating realistic hand motion sequences in interaction with objects has\ngained increasing attention with the growing interest in digital humans. Prior\nwork has illustrated the effectiveness of employing occupancy-based or\ndistance-based virtual sensors to extract hand-object interaction features.\nNonetheless, these methods show limited generalizability across object\ncategories, shapes and sizes. We hypothesize that this is due to two reasons:\n1) the limited expressiveness of employed virtual sensors, and 2) scarcity of\navailable training data. To tackle this challenge, we introduce a novel\njoint-centered sensor designed to reason about local object geometry near\npotential interaction regions. The sensor queries for object surface points in\nthe neighbourhood of each hand joint. As an important step towards mitigating\nthe learning complexity, we transform the points from global frame to hand\ntemplate frame and use a shared module to process sensor features of each\nindividual joint. This is followed by a spatio-temporal transformer network\naimed at capturing correlation among the joints in different dimensions.\nMoreover, we devise simple heuristic rules to augment the limited training\nsequences with vast static hand grasping samples. This leads to a broader\nspectrum of grasping types observed during training, in turn enhancing our\nmodel's generalization capability. We evaluate on two public datasets, GRAB and\nInterCap, where our method shows superiority over baselines both quantitatively\nand perceptually.\n","authors":["Keyang Zhou","Bharat Lal Bhatnagar","Jan Eric Lenssen","Gerard Pons-moll"],"pdf_url":"https://arxiv.org/pdf/2404.01758v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02935v3","updated":"2024-04-04T07:56:59Z","published":"2023-08-05T18:32:49Z","title":"Bias Behind the Wheel: Fairness Analysis of Autonomous Driving Systems","summary":" This paper analyzes fairness in automated pedestrian detection, a crucial but\nunder-explored issue in autonomous driving systems. We evaluate eight\nstate-of-the-art deep learning-based pedestrian detectors across demographic\ngroups on large-scale real-world datasets. To enable thorough fairness testing,\nwe provide extensive annotations for the datasets, resulting in 8,311 images\nwith 16,070 gender labels, 20,115 age labels, and 3,513 skin tone labels. Our\nfindings reveal significant fairness issues, particularly related to age. The\nundetected proportions for children are 20.14% higher compared to adults.\nFurthermore, we explore how various driving scenarios affect the fairness of\npedestrian detectors. We find that pedestrian detectors demonstrate significant\ngender biases during night time, potentially exacerbating the prevalent\nsocietal issue of female safety concerns during nighttime out. Moreover, we\nobserve that pedestrian detectors can demonstrate both enhanced fairness and\nsuperior performance under specific driving conditions, which challenges the\nfairness-performance trade-off theory widely acknowledged in the fairness\nliterature. We publicly release the code, data, and results to support future\nresearch on fairness in autonomous driving.\n","authors":["Xinyue Li","Zhenpeng Chen","Jie M. 
Zhang","Federica Sarro","Ying Zhang","Xuanzhe Liu"],"pdf_url":"https://arxiv.org/pdf/2308.02935v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03256v1","updated":"2024-04-04T07:26:26Z","published":"2024-04-04T07:26:26Z","title":"Multi Positive Contrastive Learning with Pose-Consistent Generated\n Images","summary":" Model pre-training has become essential in various recognition tasks.\nMeanwhile, with the remarkable advancements in image generation models,\npre-training methods utilizing generated images have also emerged given their\nability to produce unlimited training data. However, while existing methods\nutilizing generated images excel in classification, they fall short in more\npractical tasks, such as human pose estimation. In this paper, we have\nexperimentally demonstrated it and propose the generation of visually distinct\nimages with identical human poses. We then propose a novel multi-positive\ncontrastive learning, which optimally utilize the previously generated images\nto learn structural features of the human body. We term the entire learning\npipeline as GenPoCCL. Despite using only less than 1% amount of data compared\nto current state-of-the-art method, GenPoCCL captures structural features of\nthe human body more effectively, surpassing existing methods in a variety of\nhuman-centric perception tasks.\n","authors":["Sho Inayoshi","Aji Resindra Widya","Satoshi Ozaki","Junji Otsuka","Takeshi Ohashi"],"pdf_url":"https://arxiv.org/pdf/2404.03256v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03253v1","updated":"2024-04-04T07:19:31Z","published":"2024-04-04T07:19:31Z","title":"A dataset of primary nasopharyngeal carcinoma MRI with multi-modalities\n segmentation","summary":" Multi-modality magnetic resonance imaging data with various sequences\nfacilitate the early diagnosis, tumor segmentation, and disease staging in the\nmanagement of nasopharyngeal carcinoma (NPC). The lack of publicly available,\ncomprehensive datasets limits advancements in diagnosis, treatment planning,\nand the development of machine learning algorithms for NPC. Addressing this\ncritical need, we introduce the first comprehensive NPC MRI dataset,\nencompassing MR axial imaging of 277 primary NPC patients. This dataset\nincludes T1-weighted, T2-weighted, and contrast-enhanced T1-weighted sequences,\ntotaling 831 scans. In addition to the corresponding clinical data, manually\nannotated and labeled segmentations by experienced radiologists offer\nhigh-quality data resources from untreated primary NPC.\n","authors":["Yin Li","Qi Chen","Kai Wang","Meige Li","Liping Si","Yingwei Guo","Yu Xiong","Qixing Wang","Yang Qin","Ling Xu","Patrick van der Smagt","Jun Tang","Nutan Chen"],"pdf_url":"https://arxiv.org/pdf/2404.03253v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03251v1","updated":"2024-04-04T07:14:12Z","published":"2024-04-04T07:14:12Z","title":"Real-time Noise Source Estimation of a Camera System from an Image and\n Metadata","summary":" Autonomous machines must self-maintain proper functionality to ensure the\nsafety of humans and themselves. This pertains particularly to its cameras as\npredominant sensors to perceive the environment and support actions. A\nfundamental camera problem addressed in this study is noise. Solutions often\nfocus on denoising images a posteriori, that is, fighting symptoms rather than\nroot causes. However, tackling root causes requires identifying the noise\nsources, considering the limitations of mobile platforms. 
This work\ninvestigates a real-time, memory-efficient and reliable noise source estimator\nthat combines data- and physically-based models. To this end, a DNN that\nexamines an image with camera metadata for major camera noise sources is built\nand trained. In addition, it quantifies unexpected factors that impact image\nnoise or metadata. This study investigates seven different estimators on six\ndatasets that include synthetic noise, real-world noise from two camera\nsystems, and real field campaigns. For these, only the model with most metadata\nis capable to accurately and robustly quantify all individual noise\ncontributions. This method outperforms total image noise estimators and can be\nplug-and-play deployed. It also serves as a basis to include more advanced\nnoise sources, or as part of an automatic countermeasure feedback-loop to\napproach fully reliable machines.\n","authors":["Maik Wischow","Patrick Irmisch","Anko Boerner","Guillermo Gallego"],"pdf_url":"https://arxiv.org/pdf/2404.03251v1.pdf","comment":"16 pages, 16 figures, 12 tables, Project page:\n https://github.com/MaikWischow/Noise-Source-Estimation"},{"id":"http://arxiv.org/abs/2404.03248v1","updated":"2024-04-04T07:07:34Z","published":"2024-04-04T07:07:34Z","title":"Learning Transferable Negative Prompts for Out-of-Distribution Detection","summary":" Existing prompt learning methods have shown certain capabilities in\nOut-of-Distribution (OOD) detection, but the lack of OOD images in the target\ndataset in their training can lead to mismatches between OOD images and\nIn-Distribution (ID) categories, resulting in a high false positive rate. To\naddress this issue, we introduce a novel OOD detection method, named\n'NegPrompt', to learn a set of negative prompts, each representing a negative\nconnotation of a given class label, for delineating the boundaries between ID\nand OOD images. It learns such negative prompts with ID data only, without any\nreliance on external outlier data. Further, current methods assume the\navailability of samples of all ID classes, rendering them ineffective in\nopen-vocabulary learning scenarios where the inference stage can contain novel\nID classes not present during training. In contrast, our learned negative\nprompts are transferable to novel class labels. Experiments on various ImageNet\nbenchmarks show that NegPrompt surpasses state-of-the-art prompt-learning-based\nOOD detection methods and maintains a consistent lead in hard OOD detection in\nclosed- and open-vocabulary classification scenarios. Code is available at\nhttps://github.com/mala-lab/negprompt.\n","authors":["Tianqi Li","Guansong Pang","Xiao Bai","Wenjun Miao","Jin Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.03248v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2306.10482v2","updated":"2024-04-04T07:00:56Z","published":"2023-06-18T05:37:38Z","title":"Weighted structure tensor total variation for image denoising","summary":" For image denoising problems, the structure tensor total variation\n(STV)-based models show good performances when compared with other competing\nregularization approaches. However, the STV regularizer does not couple the\nlocal information of the image and may not maintain the image details.\nTherefore, we employ the anisotropic weighted matrix introduced in the\nanisotropic total variation (ATV) model to improve the STV model. 
By applying\nthe weighted matrix to the discrete gradient of the patch-based Jacobian\noperator in STV, our proposed weighted STV (WSTV) model can effectively capture\nlocal information from images and maintain their details during the denoising\nprocess. The optimization problem in the model is solved by a fast first-order\ngradient projection algorithm with a complexity result of $O(1 / i^2)$. For\nimages with different Gaussian noise levels, the experimental results\ndemonstrate that the WSTV model can effectively improve the quality of restored\nimages compared to other TV and STV-based models.\n","authors":["Xiuhan Sheng","Lijuan Yang","Jingya Chang"],"pdf_url":"https://arxiv.org/pdf/2306.10482v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03242v1","updated":"2024-04-04T06:58:39Z","published":"2024-04-04T06:58:39Z","title":"Would Deep Generative Models Amplify Bias in Future Models?","summary":" We investigate the impact of deep generative models on potential social\nbiases in upcoming computer vision models. As the internet witnesses an\nincreasing influx of AI-generated images, concerns arise regarding inherent\nbiases that may accompany them, potentially leading to the dissemination of\nharmful content. This paper explores whether a detrimental feedback loop,\nresulting in bias amplification, would occur if generated images were used as\nthe training data for future models. We conduct simulations by progressively\nsubstituting original images in COCO and CC3M datasets with images generated\nthrough Stable Diffusion. The modified datasets are used to train OpenCLIP and\nimage captioning models, which we evaluate in terms of quality and bias.\nContrary to expectations, our findings indicate that introducing generated\nimages during training does not uniformly amplify bias. Instead, instances of\nbias mitigation across specific tasks are observed. We further explore the\nfactors that may influence these phenomena, such as artifacts in image\ngeneration (e.g., blurry faces) or pre-existing biases in the original\ndatasets.\n","authors":["Tianwei Chen","Yusuke Hirota","Mayu Otani","Noa Garcia","Yuta Nakashima"],"pdf_url":"https://arxiv.org/pdf/2404.03242v1.pdf","comment":"This paper has been accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2401.12433v2","updated":"2024-04-04T06:57:39Z","published":"2024-01-23T01:52:49Z","title":"A Novel Garment Transfer Method Supervised by Distilled Knowledge of\n Virtual Try-on Model","summary":" This paper proposes a novel garment transfer method supervised with knowledge\ndistillation from virtual try-on. Our method first reasons the transfer parsing\nto provide shape prior to downstream tasks. We employ a multi-phase teaching\nstrategy to supervise the training of the transfer parsing reasoning model,\nlearning the response and feature knowledge from the try-on parsing reasoning\nmodel. To correct the teaching error, it transfers the garment back to its\nowner to absorb the hard knowledge in the self-study phase. Guided by the\ntransfer parsing, we adjust the position of the transferred garment via STN to\nprevent distortion. Afterward, we estimate a progressive flow to precisely warp\nthe garment with shape and content correspondences. To ensure warping\nrationality, we supervise the training of the garment warping model using\ntarget shape and warping knowledge from virtual try-on. 
To better preserve body\nfeatures in the transfer result, we propose a well-designed training strategy\nfor the arm regrowth task to infer new exposure skin. Experiments demonstrate\nthat our method has state-of-the-art performance compared with other virtual\ntry-on and garment transfer methods in garment transfer, especially for\npreserving garment texture and body features.\n","authors":["Naiyu Fang","Lemiao Qiu","Shuyou Zhang","Zili Wang","Kerui Hu","Jianrong Tan"],"pdf_url":"https://arxiv.org/pdf/2401.12433v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.06395v2","updated":"2024-04-04T06:46:42Z","published":"2024-01-12T06:28:54Z","title":"ModaVerse: Efficiently Transforming Modalities with LLMs","summary":" Humans possess the capability to comprehend diverse modalities and seamlessly\ntransfer information between them. In this work, we introduce ModaVerse, a\nMulti-modal Large Language Model (MLLM) capable of comprehending and\ntransforming content across various modalities including images, videos, and\naudio. Predominant MLLM frameworks have largely relied on the alignment of\nlatent spaces of textual and non-textual features. This alignment process,\nwhich synchronizes a language model trained on textual data with encoders and\ndecoders trained on multi-modal data, often necessitates extensive training of\nseveral projection layers in multiple stages. Inspired by LLM-as-agent\nmethodologies, we propose a novel Input/Output (I/O) alignment mechanism that\noperates directly at the level of natural language. It aligns the LLM's output\nwith the input of generative models, avoiding the complexities associated with\nlatent feature alignments, and simplifying the multiple training stages of\nexisting MLLMs into a single, efficient process. This conceptual advancement\nleads to significant reductions in both data and computational costs. By\nconducting experiments on several benchmarks, we demonstrate that our approach\nattains comparable performance with the state of the art while achieving\nconsiderable efficiencies in data usage and training duration.\n","authors":["Xinyu Wang","Bohan Zhuang","Qi Wu"],"pdf_url":"https://arxiv.org/pdf/2401.06395v2.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2404.03225v1","updated":"2024-04-04T06:20:22Z","published":"2024-04-04T06:20:22Z","title":"FACTUAL: A Novel Framework for Contrastive Learning Based Robust SAR\n Image Classification","summary":" Deep Learning (DL) Models for Synthetic Aperture Radar (SAR) Automatic Target\nRecognition (ATR), while delivering improved performance, have been shown to be\nquite vulnerable to adversarial attacks. Existing works improve robustness by\ntraining models on adversarial samples. However, by focusing mostly on attacks\nthat manipulate images randomly, they neglect the real-world feasibility of\nsuch attacks. In this paper, we propose FACTUAL, a novel Contrastive Learning\nframework for Adversarial Training and robust SAR classification. FACTUAL\nconsists of two components: (1) Differing from existing works, a novel\nperturbation scheme that incorporates realistic physical adversarial attacks\n(such as OTSA) to build a supervised adversarial pre-training network. This\nnetwork utilizes class labels for clustering clean and perturbed images\ntogether into a more informative feature space. (2) A linear classifier\ncascaded after the encoder to use the computed representations to predict the\ntarget labels. 
By pre-training and fine-tuning our model on both clean and\nadversarial samples, we show that our model achieves high prediction accuracy\non both cases. Our model achieves 99.7% accuracy on clean samples, and 89.6% on\nperturbed samples, both outperforming previous state-of-the-art methods.\n","authors":["Xu Wang","Tian Ye","Rajgopal Kannan","Viktor Prasanna"],"pdf_url":"https://arxiv.org/pdf/2404.03225v1.pdf","comment":"2024 IEEE Radar Conference"},{"id":"http://arxiv.org/abs/2404.03219v1","updated":"2024-04-04T05:54:19Z","published":"2024-04-04T05:54:19Z","title":"iSeg: Interactive 3D Segmentation via Interactive Attention","summary":" We present iSeg, a new interactive technique for segmenting 3D shapes.\nPrevious works have focused mainly on leveraging pre-trained 2D foundation\nmodels for 3D segmentation based on text. However, text may be insufficient for\naccurately describing fine-grained spatial segmentations. Moreover, achieving a\nconsistent 3D segmentation using a 2D model is challenging since occluded areas\nof the same semantic region may not be visible together from any 2D view. Thus,\nwe design a segmentation method conditioned on fine user clicks, which operates\nentirely in 3D. Our system accepts user clicks directly on the shape's surface,\nindicating the inclusion or exclusion of regions from the desired shape\npartition. To accommodate various click settings, we propose a novel\ninteractive attention module capable of processing different numbers and types\nof clicks, enabling the training of a single unified interactive segmentation\nmodel. We apply iSeg to a myriad of shapes from different domains,\ndemonstrating its versatility and faithfulness to the user's specifications.\nOur project page is at https://threedle.github.io/iSeg/.\n","authors":["Itai Lang","Fei Xu","Dale Decatur","Sudarshan Babu","Rana Hanocka"],"pdf_url":"https://arxiv.org/pdf/2404.03219v1.pdf","comment":"Project page: https://threedle.github.io/iSeg/"},{"id":"http://arxiv.org/abs/2404.03214v1","updated":"2024-04-04T05:39:09Z","published":"2024-04-04T05:39:09Z","title":"LeGrad: An Explainability Method for Vision Transformers via Feature\n Formation Sensitivity","summary":" Vision Transformers (ViTs), with their ability to model long-range\ndependencies through self-attention mechanisms, have become a standard\narchitecture in computer vision. However, the interpretability of these models\nremains a challenge. To address this, we propose LeGrad, an explainability\nmethod specifically designed for ViTs. LeGrad computes the gradient with\nrespect to the attention maps of ViT layers, considering the gradient itself as\nthe explainability signal. We aggregate the signal over all layers, combining\nthe activations of the last as well as intermediate tokens to produce the\nmerged explainability map. This makes LeGrad a conceptually simple and an\neasy-to-implement tool for enhancing the transparency of ViTs. We evaluate\nLeGrad in challenging segmentation, perturbation, and open-vocabulary settings,\nshowcasing its versatility compared to other SotA explainability methods\ndemonstrating its superior spatial fidelity and robustness to perturbations. 
A\ndemo and the code is available at https://github.com/WalBouss/LeGrad.\n","authors":["Walid Bousselham","Angie Boggust","Sofian Chaybouti","Hendrik Strobelt","Hilde Kuehne"],"pdf_url":"https://arxiv.org/pdf/2404.03214v1.pdf","comment":"Code available at https://github.com/WalBouss/LeGrad"},{"id":"http://arxiv.org/abs/2404.03210v1","updated":"2024-04-04T05:33:06Z","published":"2024-04-04T05:33:06Z","title":"HDR Imaging for Dynamic Scenes with Events","summary":" High dynamic range imaging (HDRI) for real-world dynamic scenes is\nchallenging because moving objects may lead to hybrid degradation of low\ndynamic range and motion blur. Existing event-based approaches only focus on a\nseparate task, while cascading HDRI and motion deblurring would lead to\nsub-optimal solutions, and unavailable ground-truth sharp HDR images aggravate\nthe predicament. To address these challenges, we propose an Event-based HDRI\nframework within a Self-supervised learning paradigm, i.e., Self-EHDRI, which\ngeneralizes HDRI performance in real-world dynamic scenarios. Specifically, a\nself-supervised learning strategy is carried out by learning cross-domain\nconversions from blurry LDR images to sharp LDR images, which enables sharp HDR\nimages to be accessible in the intermediate process even though ground-truth\nsharp HDR images are missing. Then, we formulate the event-based HDRI and\nmotion deblurring model and conduct a unified network to recover the\nintermediate sharp HDR results, where both the high dynamic range and high\ntemporal resolution of events are leveraged simultaneously for compensation. We\nconstruct large-scale synthetic and real-world datasets to evaluate the\neffectiveness of our method. Comprehensive experiments demonstrate that the\nproposed Self-EHDRI outperforms state-of-the-art approaches by a large margin.\nThe codes, datasets, and results are available at\nhttps://lxp-whu.github.io/Self-EHDRI.\n","authors":["Li Xiaopeng","Zeng Zhaoyuan","Fan Cien","Zhao Chen","Deng Lei","Yu Lei"],"pdf_url":"https://arxiv.org/pdf/2404.03210v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03202v1","updated":"2024-04-04T05:10:26Z","published":"2024-04-04T05:10:26Z","title":"OmniGS: Omnidirectional Gaussian Splatting for Fast Radiance Field\n Reconstruction using Omnidirectional Images","summary":" Photorealistic reconstruction relying on 3D Gaussian Splatting has shown\npromising potential in robotics. However, the current 3D Gaussian Splatting\nsystem only supports radiance field reconstruction using undistorted\nperspective images. In this paper, we present OmniGS, a novel omnidirectional\nGaussian splatting system, to take advantage of omnidirectional images for fast\nradiance field reconstruction. Specifically, we conduct a theoretical analysis\nof spherical camera model derivatives in 3D Gaussian Splatting. According to\nthe derivatives, we then implement a new GPU-accelerated omnidirectional\nrasterizer that directly splats 3D Gaussians onto the equirectangular screen\nspace for omnidirectional image rendering. As a result, we realize\ndifferentiable optimization of the radiance field without the requirement of\ncube-map rectification or tangent-plane approximation. Extensive experiments\nconducted in egocentric and roaming scenarios demonstrate that our method\nachieves state-of-the-art reconstruction quality and high rendering speed using\nomnidirectional images. 
To benefit the research community, the code will be\nmade publicly available once the paper is published.\n","authors":["Longwei Li","Huajian Huang","Sai-Kit Yeung","Hui Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.03202v1.pdf","comment":"IROS 2024 submission, 7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.03200v1","updated":"2024-04-04T05:08:51Z","published":"2024-04-04T05:08:51Z","title":"Future-Proofing Class Incremental Learning","summary":" Exemplar-Free Class Incremental Learning is a highly challenging setting\nwhere replay memory is unavailable. Methods relying on frozen feature\nextractors have drawn attention recently in this setting due to their\nimpressive performances and lower computational costs. However, those methods\nare highly dependent on the data used to train the feature extractor and may\nstruggle when an insufficient amount of classes are available during the first\nincremental step. To overcome this limitation, we propose to use a pre-trained\ntext-to-image diffusion model in order to generate synthetic images of future\nclasses and use them to train the feature extractor. Experiments on the\nstandard benchmarks CIFAR100 and ImageNet-Subset demonstrate that our proposed\nmethod can be used to improve state-of-the-art methods for exemplar-free class\nincremental learning, especially in the most difficult settings where the first\nincremental step only contains few classes. Moreover, we show that using\nsynthetic samples of future classes achieves higher performance than using real\ndata from different classes, paving the way for better and less costly\npre-training methods for incremental learning.\n","authors":["Quentin Jodelet","Xin Liu","Yin Jun Phua","Tsuyoshi Murata"],"pdf_url":"https://arxiv.org/pdf/2404.03200v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.09934v7","updated":"2024-04-04T04:52:43Z","published":"2022-07-20T14:20:35Z","title":"DeepIPC: Deeply Integrated Perception and Control for an Autonomous\n Vehicle in Real Environments","summary":" In this work, we introduce DeepIPC, a novel end-to-end model tailored for\nautonomous driving, which seamlessly integrates perception and control tasks.\nUnlike traditional models that handle these tasks separately, DeepIPC\ninnovatively combines a perception module, which processes RGBD images for\nsemantic segmentation and generates bird's eye view (BEV) mappings, with a\ncontroller module that utilizes these insights along with GNSS and angular\nspeed measurements to accurately predict navigational waypoints. This\nintegration allows DeepIPC to efficiently translate complex environmental data\ninto actionable driving commands. Our comprehensive evaluation demonstrates\nDeepIPC's superior performance in terms of drivability and multi-task\nefficiency across diverse real-world scenarios, setting a new benchmark for\nend-to-end autonomous driving systems with a leaner model architecture. The\nexperimental results underscore DeepIPC's potential to significantly enhance\nautonomous vehicular navigation, promising a step forward in the development of\nautonomous driving technologies. 
For further insights and replication, we will\nmake our code and datasets available at https://github.com/oskarnatan/DeepIPC.\n","authors":["Oskar Natan","Jun Miura"],"pdf_url":"https://arxiv.org/pdf/2207.09934v7.pdf","comment":"Accepted for Publication in IEEE Access"},{"id":"http://arxiv.org/abs/2404.02388v2","updated":"2024-04-04T04:23:10Z","published":"2024-04-03T01:13:05Z","title":"CAPE: CAM as a Probabilistic Ensemble for Enhanced DNN Interpretation","summary":" Deep Neural Networks (DNNs) are widely used for visual classification tasks,\nbut their complex computation process and black-box nature hinder decision\ntransparency and interpretability. Class activation maps (CAMs) and recent\nvariants provide ways to visually explain the DNN decision-making process by\ndisplaying 'attention' heatmaps of the DNNs. Nevertheless, the CAM explanation\nonly offers relative attention information, that is, on an attention heatmap,\nwe can interpret which image region is more or less important than the others.\nHowever, these regions cannot be meaningfully compared across classes, and the\ncontribution of each region to the model's class prediction is not revealed. To\naddress these challenges that ultimately lead to better DNN Interpretation, in\nthis paper, we propose CAPE, a novel reformulation of CAM that provides a\nunified and probabilistically meaningful assessment of the contributions of\nimage regions. We quantitatively and qualitatively compare CAPE with\nstate-of-the-art CAM methods on CUB and ImageNet benchmark datasets to\ndemonstrate enhanced interpretability. We also test on a cytology imaging\ndataset depicting a challenging Chronic Myelomonocytic Leukemia (CMML)\ndiagnosis problem. Code is available at: https://github.com/AIML-MED/CAPE.\n","authors":["Townim Faisal Chowdhury","Kewen Liao","Vu Minh Hieu Phan","Minh-Son To","Yutong Xie","Kevin Hung","David Ross","Anton van den Hengel","Johan W. Verjans","Zhibin Liao"],"pdf_url":"https://arxiv.org/pdf/2404.02388v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03191v1","updated":"2024-04-04T04:22:50Z","published":"2024-04-04T04:22:50Z","title":"CORP: A Multi-Modal Dataset for Campus-Oriented Roadside Perception\n Tasks","summary":" Numerous roadside perception datasets have been introduced to propel\nadvancements in autonomous driving and intelligent transportation systems\nresearch and development. However, it has been observed that the majority of\ntheir concentrates is on urban arterial roads, inadvertently overlooking\nresidential areas such as parks and campuses that exhibit entirely distinct\ncharacteristics. In light of this gap, we propose CORP, which stands as the\nfirst public benchmark dataset tailored for multi-modal roadside perception\ntasks under campus scenarios. Collected in a university campus, CORP consists\nof over 205k images plus 102k point clouds captured from 18 cameras and 9 LiDAR\nsensors. These sensors with different configurations are mounted on roadside\nutility poles to provide diverse viewpoints within the campus region. The\nannotations of CORP encompass multi-dimensional information beyond 2D and 3D\nbounding boxes, providing extra support for 3D seamless tracking and instance\nsegmentation with unique IDs and pixel masks for identifying targets, to\nenhance the understanding of objects and their behaviors distributed across the\ncampus premises. 
Unlike other roadside datasets about urban traffic, CORP\nextends the spectrum to highlight the challenges for multi-modal perception in\ncampuses and other residential areas.\n","authors":["Beibei Wang","Lu Zhang","Shuang Meng","Chenjie Wang","Jingjing Huang","Yao Li","Haojie Ren","Yuxuan Xiao","Yuru Peng","Jianmin Ji","Yu Zhang","Yanyong Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.03191v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03190v1","updated":"2024-04-04T04:22:25Z","published":"2024-04-04T04:22:25Z","title":"Adaptive Discrete Disparity Volume for Self-supervised Monocular Depth\n Estimation","summary":" In self-supervised monocular depth estimation tasks, discrete disparity\nprediction has been proven to attain higher quality depth maps than common\ncontinuous methods. However, current discretization strategies often divide\ndepth ranges of scenes into bins in a handcrafted and rigid manner, limiting\nmodel performance. In this paper, we propose a learnable module, Adaptive\nDiscrete Disparity Volume (ADDV), which is capable of dynamically sensing depth\ndistributions in different RGB images and generating adaptive bins for them.\nWithout any extra supervision, this module can be integrated into existing CNN\narchitectures, allowing networks to produce representative values for bins and\na probability volume over them. Furthermore, we introduce novel training\nstrategies - uniformizing and sharpening - through a loss term and temperature\nparameter, respectively, to provide regularizations under self-supervised\nconditions, preventing model degradation or collapse. Empirical results\ndemonstrate that ADDV effectively processes global information, generating\nappropriate bins for various scenes and producing higher quality depth maps\ncompared to handcrafted methods.\n","authors":["Jianwei Ren"],"pdf_url":"https://arxiv.org/pdf/2404.03190v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03188v1","updated":"2024-04-04T04:16:31Z","published":"2024-04-04T04:16:31Z","title":"Classification of Nasopharyngeal Cases using DenseNet Deep Learning\n Architecture","summary":" Nasopharyngeal carcinoma (NPC) is one of the understudied yet deadliest\ncancers in South East Asia. In Malaysia, the prevalence is identified mainly in\nSarawak, among the ethnic of Bidayuh. NPC is often late-diagnosed because it is\nasymptomatic at the early stage. There are several tissue representations from\nthe nasopharynx biopsy, such as nasopharyngeal inflammation (NPI), lymphoid\nhyperplasia (LHP), nasopharyngeal carcinoma (NPC) and normal tissue. This paper\nis our first initiative to identify the difference between NPC, NPI and normal\ncases. Seven whole slide images (WSIs) with gigapixel resolutions from seven\ndifferent patients and two hospitals were experimented with using two test\nsetups, consisting of a different set of images. The tissue regions are patched\ninto smaller blocks and classified using DenseNet architecture with 21 dense\nlayers. Two tests are carried out, each for proof of concept (Test 1) and\nreal-test scenario (Test 2). The accuracy achieved for NPC class is 94.8% for\nTest 1 and 67.0% for Test 2.\n","authors":["W. S. H. M. W. Ahmad","M. F. A. Fauzi","M. K. Abdullahi","Jenny T. H. Lee","N. S. A. Basry","A Yahaya","A. M. Ismail","A. Adam","Elaine W. L. Chan","F. S. 
Abas"],"pdf_url":"https://arxiv.org/pdf/2404.03188v1.pdf","comment":"This article has been accepted in the Journal of Engineering Science\n and Technology (JESTEC) and awaiting publication"},{"id":"http://arxiv.org/abs/2404.03187v1","updated":"2024-04-04T04:12:30Z","published":"2024-04-04T04:12:30Z","title":"AGL-NET: Aerial-Ground Cross-Modal Global Localization with Varying\n Scales","summary":" We present AGL-NET, a novel learning-based method for global localization\nusing LiDAR point clouds and satellite maps. AGL-NET tackles two critical\nchallenges: bridging the representation gap between image and points modalities\nfor robust feature matching, and handling inherent scale discrepancies between\nglobal view and local view. To address these challenges, AGL-NET leverages a\nunified network architecture with a novel two-stage matching design. The first\nstage extracts informative neural features directly from raw sensor data and\nperforms initial feature matching. The second stage refines this matching\nprocess by extracting informative skeleton features and incorporating a novel\nscale alignment step to rectify scale variations between LiDAR and map data.\nFurthermore, a novel scale and skeleton loss function guides the network toward\nlearning scale-invariant feature representations, eliminating the need for\npre-processing satellite maps. This significantly improves real-world\napplicability in scenarios with unknown map scales. To facilitate rigorous\nperformance evaluation, we introduce a meticulously designed dataset within the\nCARLA simulator specifically tailored for metric localization training and\nassessment. The code and dataset will be made publicly available.\n","authors":["Tianrui Guan","Ruiqi Xian","Xijun Wang","Xiyang Wu","Mohamed Elnoor","Daeun Song","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2404.03187v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12301v3","updated":"2024-04-04T04:11:05Z","published":"2023-07-23T11:50:27Z","title":"Image Outlier Detection Without Training using RANSAC","summary":" Image outlier detection (OD) is an essential tool to ensure the quality of\nimages used in computer vision tasks. Existing algorithms often involve\ntraining a model to represent the inlier distribution, and outliers are\ndetermined by some deviation measure. Although existing methods proved\neffective when trained on strictly inlier samples, their performance remains\nquestionable when undesired outliers are included during training. As a result\nof this limitation, it is necessary to carefully examine the data when\ndeveloping OD models for new domains. In this work, we present a novel image OD\nalgorithm called RANSAC-NN that eliminates the need of data examination and\nmodel training altogether. Unlike existing approaches, RANSAC-NN can be\ndirectly applied on datasets containing outliers by sampling and comparing\nsubsets of the data. Our algorithm maintains favorable performance compared to\nexisting methods on a range of benchmarks. 
Furthermore, we show that RANSAC-NN\ncan enhance the robustness of existing methods by incorporating our algorithm\nas part of the data preparation process.\n","authors":["Chen-Han Tsai","Yu-Shao Peng"],"pdf_url":"https://arxiv.org/pdf/2307.12301v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06647v3","updated":"2024-04-04T04:07:48Z","published":"2023-07-13T09:23:21Z","title":"DeepIPCv2: LiDAR-powered Robust Environmental Perception and\n Navigational Control for Autonomous Vehicle","summary":" We present DeepIPCv2, an autonomous driving model that perceives the\nenvironment using a LiDAR sensor for more robust drivability, especially when\ndriving under poor illumination conditions where everything is not clearly\nvisible. DeepIPCv2 takes a set of LiDAR point clouds as the main perception\ninput. Since point clouds are not affected by illumination changes, they can\nprovide a clear observation of the surroundings no matter what the condition\nis. This results in a better scene understanding and stable features provided\nby the perception module to support the controller module in estimating\nnavigational control properly. To evaluate its performance, we conduct several\ntests by deploying the model to predict a set of driving records and perform\nreal automated driving under three different conditions. We also conduct\nablation and comparative studies with some recent models to justify its\nperformance. Based on the experimental results, DeepIPCv2 shows a robust\nperformance by achieving the best drivability in all driving scenarios.\nFurthermore, to support future research, we will upload the codes and data to\nhttps://github.com/oskarnatan/DeepIPCv2.\n","authors":["Oskar Natan","Jun Miura"],"pdf_url":"https://arxiv.org/pdf/2307.06647v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03183v1","updated":"2024-04-04T03:45:17Z","published":"2024-04-04T03:45:17Z","title":"BodyMAP -- Jointly Predicting Body Mesh and 3D Applied Pressure Map for\n People in Bed","summary":" Accurately predicting the 3D human posture and the pressure exerted on the\nbody for people resting in bed, visualized as a body mesh (3D pose & shape)\nwith a 3D pressure map, holds significant promise for healthcare applications,\nparticularly, in the prevention of pressure ulcers. Current methods focus on\nsingular facets of the problem -- predicting only 2D/3D poses, generating 2D\npressure images, predicting pressure only for certain body regions instead of\nthe full body, or forming indirect approximations to the 3D pressure map. In\ncontrast, we introduce BodyMAP, which jointly predicts the human body mesh and\n3D applied pressure map across the entire human body. Our network leverages\nmultiple visual modalities, incorporating both a depth image of a person in bed\nand its corresponding 2D pressure image acquired from a pressure-sensing\nmattress. The 3D pressure map is represented as a pressure value at each mesh\nvertex and thus allows for precise localization of high-pressure regions on the\nbody. Additionally, we present BodyMAP-WS, a new formulation of pressure\nprediction in which we implicitly learn pressure in 3D by aligning sensed 2D\npressure images with a differentiable 2D projection of the predicted 3D\npressure maps. In evaluations with real-world human data, our method\noutperforms the current state-of-the-art technique by 25% on both body mesh and\n3D applied pressure map prediction tasks for people in bed.\n","authors":["Abhishek Tandon","Anujraaj Goyal","Henry M. 
Clever","Zackory Erickson"],"pdf_url":"https://arxiv.org/pdf/2404.03183v1.pdf","comment":"Accepted at CVPR 2024 Project Website: https://bodymap3d.github.io/\n Code: https://github.com/RCHI-Lab/BodyMAP"},{"id":"http://arxiv.org/abs/2404.03181v1","updated":"2024-04-04T03:30:49Z","published":"2024-04-04T03:30:49Z","title":"MonoCD: Monocular 3D Object Detection with Complementary Depths","summary":" Monocular 3D object detection has attracted widespread attention due to its\npotential to accurately obtain object 3D localization from a single image at a\nlow cost. Depth estimation is an essential but challenging subtask of monocular\n3D object detection due to the ill-posedness of 2D to 3D mapping. Many methods\nexplore multiple local depth clues such as object heights and keypoints and\nthen formulate the object depth estimation as an ensemble of multiple depth\npredictions to mitigate the insufficiency of single-depth information. However,\nthe errors of existing multiple depths tend to have the same sign, which\nhinders them from neutralizing each other and limits the overall accuracy of\ncombined depth. To alleviate this problem, we propose to increase the\ncomplementarity of depths with two novel designs. First, we add a new depth\nprediction branch named complementary depth that utilizes global and efficient\ndepth clues from the entire image rather than the local clues to reduce the\ncorrelation of depth predictions. Second, we propose to fully exploit the\ngeometric relations between multiple depth clues to achieve complementarity in\nform. Benefiting from these designs, our method achieves higher\ncomplementarity. Experiments on the KITTI benchmark demonstrate that our method\nachieves state-of-the-art performance without introducing extra data. In\naddition, complementary depth can also be a lightweight and plug-and-play\nmodule to boost multiple existing monocular 3d object detectors. Code is\navailable at https://github.com/elvintanhust/MonoCD.\n","authors":["Longfei Yan","Pei Yan","Shengzhou Xiong","Xuanyu Xiang","Yihua Tan"],"pdf_url":"https://arxiv.org/pdf/2404.03181v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.03179v1","updated":"2024-04-04T03:28:57Z","published":"2024-04-04T03:28:57Z","title":"UniAV: Unified Audio-Visual Perception for Multi-Task Video Localization","summary":" Video localization tasks aim to temporally locate specific instances in\nvideos, including temporal action localization (TAL), sound event detection\n(SED) and audio-visual event localization (AVEL). Existing methods\nover-specialize on each task, overlooking the fact that these instances often\noccur in the same video to form the complete video content. In this work, we\npresent UniAV, a Unified Audio-Visual perception network, to achieve joint\nlearning of TAL, SED and AVEL tasks for the first time. UniAV can leverage\ndiverse data available in task-specific datasets, allowing the model to learn\nand share mutually beneficial knowledge across tasks and modalities. To tackle\nthe challenges posed by substantial variations in datasets\n(size/domain/duration) and distinct task characteristics, we propose to\nuniformly encode visual and audio modalities of all videos to derive generic\nrepresentations, while also designing task-specific experts to capture unique\nknowledge for each task. 
Besides, we develop a unified language-aware\nclassifier by utilizing a pre-trained text encoder, enabling the model to\nflexibly detect various types of instances and previously unseen ones by simply\nchanging prompts during inference. UniAV outperforms its single-task\ncounterparts by a large margin with fewer parameters, achieving on-par or\nsuperior performances compared to state-of-the-art task-specific methods across\nActivityNet 1.3, DESED and UnAV-100 benchmarks.\n","authors":["Tiantian Geng","Teng Wang","Yanfu Zhang","Jinming Duan","Weili Guan","Feng Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.03179v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02232v2","updated":"2024-04-04T03:10:58Z","published":"2023-12-04T06:37:11Z","title":"HumanNeRF-SE: A Simple yet Effective Approach to Animate HumanNeRF with\n Diverse Poses","summary":" We present HumanNeRF-SE, a simple yet effective method that synthesizes\ndiverse novel pose images with simple input. Previous HumanNeRF works require a\nlarge number of optimizable parameters to fit the human images. Instead, we\nreload these approaches by combining explicit and implicit human\nrepresentations to design both generalized rigid deformation and specific\nnon-rigid deformation. Our key insight is that explicit shape can reduce the\nsampling points used to fit implicit representation, and frozen blending\nweights from SMPL constructing a generalized rigid deformation can effectively\navoid overfitting and improve pose generalization performance. Our architecture\ninvolving both explicit and implicit representation is simple yet effective.\nExperiments demonstrate our model can synthesize images under arbitrary poses\nwith few-shot input and increase the speed of synthesizing images by 15 times\nthrough a reduction in computational complexity without using any existing\nacceleration modules. Compared to the state-of-the-art HumanNeRF studies,\nHumanNeRF-SE achieves better performance with fewer learnable parameters and\nless training time.\n","authors":["Caoyuan Ma","Yu-Lun Liu","Zhixiang Wang","Wu Liu","Xinchen Liu","Zheng Wang"],"pdf_url":"https://arxiv.org/pdf/2312.02232v2.pdf","comment":"16pages, 17 figures, 10 tables"},{"id":"http://arxiv.org/abs/2404.02405v2","updated":"2024-04-04T02:56:00Z","published":"2024-04-03T02:16:30Z","title":"TE-TAD: Towards Full End-to-End Temporal Action Detection via\n Time-Aligned Coordinate Expression","summary":" In this paper, we investigate that the normalized coordinate expression is a\nkey factor as reliance on hand-crafted components in query-based detectors for\ntemporal action detection (TAD). Despite significant advancements towards an\nend-to-end framework in object detection, query-based detectors have been\nlimited in achieving full end-to-end modeling in TAD. To address this issue, we\npropose \\modelname{}, a full end-to-end temporal action detection transformer\nthat integrates time-aligned coordinate expression. We reformulate coordinate\nexpression utilizing actual timeline values, ensuring length-invariant\nrepresentations from the extremely diverse video duration environment.\nFurthermore, our proposed adaptive query selection dynamically adjusts the\nnumber of queries based on video length, providing a suitable solution for\nvarying video durations compared to a fixed query set. Our approach not only\nsimplifies the TAD process by eliminating the need for hand-crafted components\nbut also significantly improves the performance of query-based detectors. 
Our\nTE-TAD outperforms the previous query-based detectors and achieves competitive\nperformance compared to state-of-the-art methods on popular benchmark datasets.\nCode is available at: https://github.com/Dotori-HJ/TE-TAD\n","authors":["Ho-Joong Kim","Jung-Ho Hong","Heejo Kong","Seong-Whan Lee"],"pdf_url":"https://arxiv.org/pdf/2404.02405v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2010.13187v2","updated":"2024-04-04T02:47:09Z","published":"2020-10-25T18:51:15Z","title":"Improving the Reconstruction of Disentangled Representation Learners via\n Multi-Stage Modeling","summary":" Current autoencoder-based disentangled representation learning methods\nachieve disentanglement by penalizing the (aggregate) posterior to encourage\nstatistical independence of the latent factors. This approach introduces a\ntrade-off between disentangled representation learning and reconstruction\nquality since the model does not have enough capacity to learn correlated\nlatent variables that capture detail information present in most image data. To\novercome this trade-off, we present a novel multi-stage modeling approach where\nthe disentangled factors are first learned using a penalty-based disentangled\nrepresentation learning method; then, the low-quality reconstruction is\nimproved with another deep generative model that is trained to model the\nmissing correlated latent variables, adding detail information while\nmaintaining conditioning on the previously learned disentangled factors. Taken\ntogether, our multi-stage modelling approach results in a single, coherent\nprobabilistic model that is theoretically justified by the principal of\nD-separation and can be realized with a variety of model classes including\nlikelihood-based models such as variational autoencoders, implicit models such\nas generative adversarial networks, and tractable models like normalizing flows\nor mixtures of Gaussians. We demonstrate that our multi-stage model has higher\nreconstruction quality than current state-of-the-art methods with equivalent\ndisentanglement performance across multiple standard benchmarks. In addition,\nwe apply the multi-stage model to generate synthetic tabular datasets,\nshowcasing an enhanced performance over benchmark models across a variety of\nmetrics. The interpretability analysis further indicates that the multi-stage\nmodel can effectively uncover distinct and meaningful features of variations\nfrom which the original distribution can be recovered.\n","authors":["Akash Srivastava","Yamini Bansal","Yukun Ding","Cole Lincoln Hurwitz","Kai Xu","Bernhard Egger","Prasanna Sattigeri","Joshua B. Tenenbaum","Phuong Le","Arun Prakash R","Nengfeng Zhou","Joel Vaughan","Yaquan Wang","Anwesha Bhattacharyya","Kristjan Greenewald","David D. Cox","Dan Gutfreund"],"pdf_url":"https://arxiv.org/pdf/2010.13187v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.13739v2","updated":"2024-04-04T02:36:44Z","published":"2023-03-24T01:46:25Z","title":"WM-MoE: Weather-aware Multi-scale Mixture-of-Experts for Blind Adverse\n Weather Removal","summary":" Adverse weather removal tasks like deraining, desnowing, and dehazing are\nusually treated as separate tasks. 
However, in practical autonomous driving\nscenarios, the type, intensity,and mixing degree of weather are unknown, so\nhandling each task separately cannot deal with the complex practical scenarios.\nIn this paper, we study the blind adverse weather removal problem.\nMixture-of-Experts (MoE) is a popular model that adopts a learnable gate to\nroute the input to different expert networks. The principle of MoE involves\nusing adaptive networks to process different types of unknown inputs.\nTherefore, MoE has great potential for blind adverse weather removal. However,\nthe original MoE module is inadequate for coupled multiple weather types and\nfails to utilize multi-scale features for better performance. To this end, we\npropose a method called Weather-aware Multi-scale MoE (WM-MoE) based on\nTransformer for blind weather removal. WM-MoE includes two key designs:\nWEather-Aware Router (WEAR) and Multi-Scale Experts (MSE). WEAR assigns experts\nfor each image token based on decoupled content and weather features, which\nenhances the model's capability to process multiple adverse weathers. To obtain\ndiscriminative weather features from images, we propose Weather Guidance\nFine-grained Contrastive Learning (WGF-CL), which utilizes weather cluster\ninformation to guide the assignment of positive and negative samples for each\nimage token. Since processing different weather types requires different\nreceptive fields, MSE leverages multi-scale features to enhance the spatial\nrelationship modeling capability, facilitating the high-quality restoration of\ndiverse weather types and intensities. Our method achieves state-of-the-art\nperformance in blind adverse weather removal on two public datasets and our\ndataset. We also demonstrate the advantage of our method on downstream\nsegmentation tasks.\n","authors":["Yulin Luo","Rui Zhao","Xiaobao Wei","Jinwei Chen","Yijie Lu","Shenghao Xie","Tianyu Wang","Ruiqin Xiong","Ming Lu","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2303.13739v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03161v1","updated":"2024-04-04T02:22:37Z","published":"2024-04-04T02:22:37Z","title":"BioVL-QR: Egocentric Biochemical Video-and-Language Dataset Using Micro\n QR Codes","summary":" This paper introduces a biochemical vision-and-language dataset, which\nconsists of 24 egocentric experiment videos, corresponding protocols, and\nvideo-and-language alignments. The key challenge in the wet-lab domain is\ndetecting equipment, reagents, and containers is difficult because the lab\nenvironment is scattered by filling objects on the table and some objects are\nindistinguishable. Therefore, previous studies assume that objects are manually\nannotated and given for downstream tasks, but this is costly and\ntime-consuming. To address this issue, this study focuses on Micro QR Codes to\ndetect objects automatically. From our preliminary study, we found that\ndetecting objects only using Micro QR Codes is still difficult because the\nresearchers manipulate objects, causing blur and occlusion frequently. To\naddress this, we also propose a novel object labeling method by combining a\nMicro QR Code detector and an off-the-shelf hand object detector. 
As one of the\napplications of our dataset, we conduct the task of generating protocols from\nexperiment videos and find that our approach can generate accurate protocols.\n","authors":["Taichi Nishimura","Koki Yamamoto","Yuto Haneji","Keiya Kajimura","Chihiro Nishiwaki","Eriko Daikoku","Natsuko Okuda","Fumihito Ono","Hirotaka Kameko","Shinsuke Mori"],"pdf_url":"https://arxiv.org/pdf/2404.03161v1.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2404.03159v1","updated":"2024-04-04T02:15:16Z","published":"2024-04-04T02:15:16Z","title":"HandDiff: 3D Hand Pose Estimation with Diffusion on Image-Point Cloud","summary":" Extracting keypoint locations from input hand frames, known as 3D hand pose\nestimation, is a critical task in various human-computer interaction\napplications. Essentially, the 3D hand pose estimation can be regarded as a 3D\npoint subset generative problem conditioned on input frames. Thanks to the\nrecent significant progress on diffusion-based generative models, hand pose\nestimation can also benefit from the diffusion model to estimate keypoint\nlocations with high quality. However, directly deploying the existing diffusion\nmodels to solve hand pose estimation is non-trivial, since they cannot achieve\nthe complex permutation mapping and precise localization. Based on this\nmotivation, this paper proposes HandDiff, a diffusion-based hand pose\nestimation model that iteratively denoises accurate hand pose conditioned on\nhand-shaped image-point clouds. In order to recover keypoint permutation and\naccurate location, we further introduce joint-wise condition and local detail\ncondition. Experimental results demonstrate that the proposed HandDiff\nsignificantly outperforms the existing approaches on four challenging hand pose\nbenchmark datasets. Codes and pre-trained models are publicly available at\nhttps://github.com/cwc1260/HandDiff.\n","authors":["Wencan Cheng","Hao Tang","Luc Van Gool","Jong Hwan Ko"],"pdf_url":"https://arxiv.org/pdf/2404.03159v1.pdf","comment":"Accepted as a conference paper to the Conference on Computer Vision\n and Pattern Recognition (2024)"},{"id":"http://arxiv.org/abs/2404.01518v2","updated":"2024-04-04T02:06:15Z","published":"2024-04-01T22:53:47Z","title":"Temporally Consistent Unbalanced Optimal Transport for Unsupervised\n Action Segmentation","summary":" We propose a novel approach to the action segmentation task for long,\nuntrimmed videos, based on solving an optimal transport problem. By encoding a\ntemporal consistency prior into a Gromov-Wasserstein problem, we are able to\ndecode a temporally consistent segmentation from a noisy affinity/matching cost\nmatrix between video frames and action classes. Unlike previous approaches, our\nmethod does not require knowing the action order for a video to attain temporal\nconsistency. Furthermore, our resulting (fused) Gromov-Wasserstein problem can\nbe efficiently solved on GPUs using a few iterations of projected mirror\ndescent. We demonstrate the effectiveness of our method in an unsupervised\nlearning setting, where our method is used to generate pseudo-labels for\nself-training. 
We evaluate our segmentation approach and unsupervised learning\npipeline on the Breakfast, 50-Salads, YouTube Instructions and Desktop Assembly\ndatasets, yielding state-of-the-art results for the unsupervised video action\nsegmentation task.\n","authors":["Ming Xu","Stephen Gould"],"pdf_url":"https://arxiv.org/pdf/2404.01518v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.03145v1","updated":"2024-04-04T01:39:01Z","published":"2024-04-04T01:39:01Z","title":"DreamWalk: Style Space Exploration using Diffusion Guidance","summary":" Text-conditioned diffusion models can generate impressive images, but fall\nshort when it comes to fine-grained control. Unlike direct-editing tools like\nPhotoshop, text conditioned models require the artist to perform \"prompt\nengineering,\" constructing special text sentences to control the style or\namount of a particular subject present in the output image. Our goal is to\nprovide fine-grained control over the style and substance specified by the\nprompt, for example to adjust the intensity of styles in different regions of\nthe image (Figure 1). Our approach is to decompose the text prompt into\nconceptual elements, and apply a separate guidance term for each element in a\nsingle diffusion process. We introduce guidance scale functions to control when\nin the diffusion process and \\emph{where} in the image to intervene. Since the\nmethod is based solely on adjusting diffusion guidance, it does not require\nfine-tuning or manipulating the internal layers of the diffusion model's neural\nnetwork, and can be used in conjunction with LoRA- or DreamBooth-trained models\n(Figure2). Project page: https://mshu1.github.io/dreamwalk.github.io/\n","authors":["Michelle Shu","Charles Herrmann","Richard Strong Bowen","Forrester Cole","Ramin Zabih"],"pdf_url":"https://arxiv.org/pdf/2404.03145v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03144v1","updated":"2024-04-04T01:34:36Z","published":"2024-04-04T01:34:36Z","title":"Diverse and Tailored Image Generation for Zero-shot Multi-label\n Classification","summary":" Recently, zero-shot multi-label classification has garnered considerable\nattention for its capacity to operate predictions on unseen labels without\nhuman annotations. Nevertheless, prevailing approaches often use seen classes\nas imperfect proxies for unseen ones, resulting in suboptimal performance.\nDrawing inspiration from the success of text-to-image generation models in\nproducing realistic images, we propose an innovative solution: generating\nsynthetic data to construct a training set explicitly tailored for proxyless\ntraining on unseen labels. Our approach introduces a novel image generation\nframework that produces multi-label synthetic images of unseen classes for\nclassifier training. To enhance diversity in the generated images, we leverage\na pre-trained large language model to generate diverse prompts. Employing a\npre-trained multi-modal CLIP model as a discriminator, we assess whether the\ngenerated images accurately represent the target classes. This enables\nautomatic filtering of inaccurately generated images, preserving classifier\naccuracy. To refine text prompts for more precise and effective multi-label\nobject generation, we introduce a CLIP score-based discriminative loss to\nfine-tune the text encoder in the diffusion model. 
Additionally, to enhance\nvisual features on the target task while maintaining the generalization of\noriginal features and mitigating catastrophic forgetting resulting from\nfine-tuning the entire visual encoder, we propose a feature fusion module\ninspired by transformer attention mechanisms. This module aids in capturing\nglobal dependencies between multiple objects more effectively. Extensive\nexperimental results validate the effectiveness of our approach, demonstrating\nsignificant improvements over state-of-the-art methods.\n","authors":["Kaixin Zhang","Zhixiang Yuan","Tao Huang"],"pdf_url":"https://arxiv.org/pdf/2404.03144v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03138v1","updated":"2024-04-04T01:22:23Z","published":"2024-04-04T01:22:23Z","title":"Discontinuity-preserving Normal Integration with Auxiliary Edges","summary":" Many surface reconstruction methods incorporate normal integration, which is\na process to obtain a depth map from surface gradients. In this process, the\ninput may represent a surface with discontinuities, e.g., due to\nself-occlusion. To reconstruct an accurate depth map from the input normal map,\nhidden surface gradients occurring from the jumps must be handled. To model\nthese jumps correctly, we design a novel discretization scheme for the domain\nof normal integration. Our key idea is to introduce auxiliary edges, which\nbridge between piecewise-smooth patches in the domain so that the magnitude of\nhidden jumps can be explicitly expressed. Using the auxiliary edges, we design\na novel algorithm to optimize the discontinuity and the depth map from the\ninput normal map. Our method optimizes discontinuities by using a combination\nof iterative re-weighted least squares and iterative filtering of the jump\nmagnitudes on auxiliary edges to provide strong sparsity regularization.\nCompared to previous discontinuity-preserving normal integration methods, which\nmodel the magnitudes of jumps only implicitly, our method reconstructs subtle\ndiscontinuities accurately thanks to our explicit representation of jumps\nallowing for strong sparsity regularization.\n","authors":["Hyomin Kim","Yucheol Jung","Seungyong Lee"],"pdf_url":"https://arxiv.org/pdf/2404.03138v1.pdf","comment":"To appear at CVPR 2024. For supplementary video, see\n https://youtu.be/MTTcW5kAOFE"},{"id":"http://arxiv.org/abs/2404.02072v2","updated":"2024-04-04T00:59:51Z","published":"2024-04-02T16:20:02Z","title":"EGTR: Extracting Graph from Transformer for Scene Graph Generation","summary":" Scene Graph Generation (SGG) is a challenging task of detecting objects and\npredicting relationships between objects. After DETR was developed, one-stage\nSGG models based on a one-stage object detector have been actively studied.\nHowever, complex modeling is used to predict the relationship between objects,\nand the inherent relationship between object queries learned in the multi-head\nself-attention of the object detector has been neglected. We propose a\nlightweight one-stage SGG model that extracts the relation graph from the\nvarious relationships learned in the multi-head self-attention layers of the\nDETR decoder. 
By fully utilizing the self-attention by-products, the relation\ngraph can be extracted effectively with a shallow relation extraction head.\nConsidering the dependency of the relation extraction task on the object\ndetection task, we propose a novel relation smoothing technique that adjusts\nthe relation label adaptively according to the quality of the detected objects.\nBy the relation smoothing, the model is trained according to the continuous\ncurriculum that focuses on object detection task at the beginning of training\nand performs multi-task learning as the object detection performance gradually\nimproves. Furthermore, we propose a connectivity prediction task that predicts\nwhether a relation exists between object pairs as an auxiliary task of the\nrelation extraction. We demonstrate the effectiveness and efficiency of our\nmethod for the Visual Genome and Open Image V6 datasets. Our code is publicly\navailable at https://github.com/naver-ai/egtr.\n","authors":["Jinbae Im","JeongYeon Nam","Nokyung Park","Hyungmin Lee","Seunghyun Park"],"pdf_url":"https://arxiv.org/pdf/2404.02072v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.03126v1","updated":"2024-04-04T00:28:50Z","published":"2024-04-04T00:28:50Z","title":"GaSpCT: Gaussian Splatting for Novel CT Projection View Synthesis","summary":" We present GaSpCT, a novel view synthesis and 3D scene representation method\nused to generate novel projection views for Computer Tomography (CT) scans. We\nadapt the Gaussian Splatting framework to enable novel view synthesis in CT\nbased on limited sets of 2D image projections and without the need for\nStructure from Motion (SfM) methodologies. Therefore, we reduce the total\nscanning duration and the amount of radiation dose the patient receives during\nthe scan. We adapted the loss function to our use-case by encouraging a\nstronger background and foreground distinction using two sparsity promoting\nregularizers: a beta loss and a total variation (TV) loss. Finally, we\ninitialize the Gaussian locations across the 3D space using a uniform prior\ndistribution of where the brain's positioning would be expected to be within\nthe field of view. We evaluate the performance of our model using brain CT\nscans from the Parkinson's Progression Markers Initiative (PPMI) dataset and\ndemonstrate that the rendered novel views closely match the original projection\nviews of the simulated scan, and have better performance than other implicit 3D\nscene representations methodologies. Furthermore, we empirically observe\nreduced training time compared to neural network based image synthesis for\nsparse-view CT image reconstruction. Finally, the memory requirements of the\nGaussian Splatting representations are reduced by 17% compared to the\nequivalent voxel grid image representations.\n","authors":["Emmanouil Nikolakakis","Utkarsh Gupta","Jonathan Vengosh","Justin Bui","Razvan Marinescu"],"pdf_url":"https://arxiv.org/pdf/2404.03126v1.pdf","comment":"Under Review Process for MICCAI 2024"},{"id":"http://arxiv.org/abs/2203.13856v2","updated":"2024-04-04T00:13:42Z","published":"2022-03-25T18:42:20Z","title":"Robust deep learning for eye fundus images: Bridging real and synthetic\n data for enhancing generalization","summary":" Deep learning applications for assessing medical images are limited because\nthe datasets are often small and imbalanced. 
The use of synthetic data has been\nproposed in the literature, but neither a robust comparison of the different\nmethods nor generalizability has been reported. Our approach integrates a\nretinal image quality assessment model and StyleGAN2 architecture to enhance\nAge-related Macular Degeneration (AMD) detection capabilities and improve\ngeneralizability. This work compares ten different Generative Adversarial\nNetwork (GAN) architectures to generate synthetic eye-fundus images with and\nwithout AMD. We combined subsets of three public databases (iChallenge-AMD,\nODIR-2019, and RIADD) to form a single training and test set. We employed the\nSTARE dataset for external validation, ensuring a comprehensive assessment of\nthe proposed approach. The results show that StyleGAN2 reached the lowest\nFrechet Inception Distance (166.17), and clinicians could not accurately\ndifferentiate between real and synthetic images. ResNet-18 architecture\nobtained the best performance with 85% accuracy and outperformed the two human\nexperts (80% and 75%) in detecting AMD fundus images. The accuracy rates were\n82.8% for the test set and 81.3% for the STARE dataset, demonstrating the\nmodel's generalizability. The proposed methodology for synthetic medical image\ngeneration has been validated for robustness and accuracy, with free access to\nits code for further research and development in this field.\n","authors":["Guilherme C. Oliveira","Gustavo H. Rosa","Daniel C. G. Pedronette","João P. Papa","Himeesh Kumar","Leandro A. Passos","Dinesh Kumar"],"pdf_url":"https://arxiv.org/pdf/2203.13856v2.pdf","comment":"Accepted by the Biomedical Signal Processing and Control"},{"id":"http://arxiv.org/abs/2009.04650v2","updated":"2024-04-04T15:25:22Z","published":"2020-09-10T02:55:27Z","title":"Towards Fine-grained Large Object Segmentation 1st Place Solution to 3D\n AI Challenge 2020 -- Instance Segmentation Track","summary":" This technical report introduces our solutions of Team 'FineGrainedSeg' for\nInstance Segmentation track in 3D AI Challenge 2020. In order to handle\nextremely large objects in 3D-FUTURE, we adopt PointRend as our basic\nframework, which outputs more fine-grained masks compared to HTC and SOLOv2.\nOur final submission is an ensemble of 5 PointRend models, which achieves the\n1st place on both validation and test leaderboards. The code is available at\nhttps://github.com/zehuichen123/3DFuture_ins_seg.\n","authors":["Zehui Chen","Qiaofei Li","Feng Zhao"],"pdf_url":"https://arxiv.org/pdf/2009.04650v2.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/1902.11122v5","updated":"2024-04-04T11:34:52Z","published":"2019-02-22T10:09:11Z","title":"Deep Learning in Cardiology","summary":" The medical field is creating large amount of data that physicians are unable\nto decipher and use efficiently. Moreover, rule-based expert systems are\ninefficient in solving complicated medical tasks or for creating insights using\nbig data. Deep learning has emerged as a more accurate and effective technology\nin a wide range of medical problems such as diagnosis, prediction and\nintervention. Deep learning is a representation learning method that consists\nof layers that transform the data non-linearly, thus, revealing hierarchical\nrelationships and structures. In this review we survey deep learning\napplication papers that use structured data, signal and imaging modalities from\ncardiology. 
We discuss the advantages and limitations of applying deep learning\nin cardiology that also apply in medicine in general, while proposing certain\ndirections as the most viable for clinical use.\n","authors":["Paschalis Bizopoulos","Dimitrios Koutsouris"],"pdf_url":"https://arxiv.org/pdf/1902.11122v5.pdf","comment":"27 pages, 2 figures, 10 tables"},{"id":"http://arxiv.org/abs/2404.03836v1","updated":"2024-04-04T23:38:45Z","published":"2024-04-04T23:38:45Z","title":"PARIS3D: Reasoning-based 3D Part Segmentation Using Large Multimodal\n Model","summary":" Recent advancements in 3D perception systems have significantly improved\ntheir ability to perform visual recognition tasks such as segmentation.\nHowever, these systems still heavily rely on explicit human instruction to\nidentify target objects or categories, lacking the capability to actively\nreason and comprehend implicit user intentions. We introduce a novel\nsegmentation task known as reasoning part segmentation for 3D objects, aiming\nto output a segmentation mask based on complex and implicit textual queries\nabout specific parts of a 3D object. To facilitate evaluation and benchmarking,\nwe present a large 3D dataset comprising over 60k instructions paired with\ncorresponding ground-truth part segmentation annotations specifically curated\nfor reasoning-based 3D part segmentation. We propose a model that is capable of\nsegmenting parts of 3D objects based on implicit textual queries and generating\nnatural language explanations corresponding to 3D object segmentation requests.\nExperiments show that our method achieves competitive performance to models\nthat use explicit queries, with the additional abilities to identify part\nconcepts, reason about them, and complement them with world knowledge. Our\nsource code, dataset, and trained models are available at\nhttps://github.com/AmrinKareem/PARIS3D.\n","authors":["Amrin Kareem","Jean Lahoud","Hisham Cholakkal"],"pdf_url":"https://arxiv.org/pdf/2404.03836v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2404.03831v1","updated":"2024-04-04T23:24:14Z","published":"2024-04-04T23:24:14Z","title":"SleepVST: Sleep Staging from Near-Infrared Video Signals using\n Pre-Trained Transformers","summary":" Advances in camera-based physiological monitoring have enabled the robust,\nnon-contact measurement of respiration and the cardiac pulse, which are known\nto be indicative of the sleep stage. This has led to research into camera-based\nsleep monitoring as a promising alternative to \"gold-standard\" polysomnography,\nwhich is cumbersome, expensive to administer, and hence unsuitable for\nlonger-term clinical studies. In this paper, we introduce SleepVST, a\ntransformer model which enables state-of-the-art performance in camera-based\nsleep stage classification (sleep staging). After pre-training on contact\nsensor data, SleepVST outperforms existing methods for cardio-respiratory sleep\nstaging on the SHHS and MESA datasets, achieving total Cohen's kappa scores of\n0.75 and 0.77 respectively. We then show that SleepVST can be successfully\ntransferred to cardio-respiratory waveforms extracted from video, enabling\nfully contact-free sleep staging. Using a video dataset of 50 nights, we\nachieve a total accuracy of 78.8\\% and a Cohen's $\\kappa$ of 0.71 in four-class\nvideo-based sleep staging, setting a new state-of-the-art in the domain.\n","authors":["Jonathan F. 
Carter","João Jorge","Oliver Gibson","Lionel Tarassenko"],"pdf_url":"https://arxiv.org/pdf/2404.03831v1.pdf","comment":"CVPR 2024 Highlight Paper"},{"id":"http://arxiv.org/abs/2305.05006v2","updated":"2024-04-04T22:51:42Z","published":"2023-05-08T19:25:50Z","title":"Synthesis of Annotated Colorectal Cancer Tissue Images from Gland Layout","summary":" Generating realistic tissue images with annotations is a challenging task\nthat is important in many computational histopathology applications.\nSynthetically generated images and annotations are valuable for training and\nevaluating algorithms in this domain. To address this, we propose an\ninteractive framework generating pairs of realistic colorectal cancer histology\nimages with corresponding glandular masks from glandular structure layouts. The\nframework accurately captures vital features like stroma, goblet cells, and\nglandular lumen. Users can control gland appearance by adjusting parameters\nsuch as the number of glands, their locations, and sizes. The generated images\nexhibit good Frechet Inception Distance (FID) scores compared to the\nstate-of-the-art image-to-image translation model. Additionally, we demonstrate\nthe utility of our synthetic annotations for evaluating gland segmentation\nalgorithms. Furthermore, we present a methodology for constructing glandular\nmasks using advanced deep generative models, such as latent diffusion models.\nThese masks enable tissue image generation through a residual encoder-decoder\nnetwork.\n","authors":["Srijay Deshpande","Fayyaz Minhas","Nasir Rajpoot"],"pdf_url":"https://arxiv.org/pdf/2305.05006v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16812v2","updated":"2024-04-04T22:31:18Z","published":"2023-12-28T04:14:55Z","title":"Spacetime Gaussian Feature Splatting for Real-Time Dynamic View\n Synthesis","summary":" Novel view synthesis of dynamic scenes has been an intriguing yet challenging\nproblem. Despite recent advancements, simultaneously achieving high-resolution\nphotorealistic results, real-time rendering, and compact storage remains a\nformidable task. To address these challenges, we propose Spacetime Gaussian\nFeature Splatting as a novel dynamic scene representation, composed of three\npivotal components. First, we formulate expressive Spacetime Gaussians by\nenhancing 3D Gaussians with temporal opacity and parametric motion/rotation.\nThis enables Spacetime Gaussians to capture static, dynamic, as well as\ntransient content within a scene. Second, we introduce splatted feature\nrendering, which replaces spherical harmonics with neural features. These\nfeatures facilitate the modeling of view- and time-dependent appearance while\nmaintaining small size. Third, we leverage the guidance of training error and\ncoarse depth to sample new Gaussians in areas that are challenging to converge\nwith existing pipelines. Experiments on several established real-world datasets\ndemonstrate that our method achieves state-of-the-art rendering quality and\nspeed, while retaining compact storage. At 8K resolution, our lite-version\nmodel can render at 60 FPS on an Nvidia RTX 4090 GPU. Our code is available at\nhttps://github.com/oppo-us-research/SpacetimeGaussians.\n","authors":["Zhan Li","Zhang Chen","Zhong Li","Yi Xu"],"pdf_url":"https://arxiv.org/pdf/2312.16812v2.pdf","comment":"Accepted to CVPR 2024. 
Project page:\n https://oppo-us-research.github.io/SpacetimeGaussians-website/"},{"id":"http://arxiv.org/abs/2404.03819v1","updated":"2024-04-04T22:31:15Z","published":"2024-04-04T22:31:15Z","title":"Effective Lymph Nodes Detection in CT Scans Using Location Debiased\n Query Selection and Contrastive Query Representation in Transformer","summary":" Lymph node (LN) assessment is a critical, indispensable yet very challenging\ntask in the routine clinical workflow of radiology and oncology. Accurate LN\nanalysis is essential for cancer diagnosis, staging, and treatment planning.\nFinding scatteredly distributed, low-contrast clinically relevant LNs in 3D CT\nis difficult even for experienced physicians under high inter-observer\nvariations. Previous automatic LN detection works typically yield limited\nrecall and high false positives (FPs) due to adjacent anatomies with similar\nimage intensities, shapes, or textures (vessels, muscles, esophagus, etc). In\nthis work, we propose a new LN DEtection TRansformer, named LN-DETR, to achieve\nmore accurate performance. By enhancing the 2D backbone with a multi-scale 2.5D\nfeature fusion to incorporate 3D context explicitly, more importantly, we make\ntwo main contributions to improve the representation quality of LN queries. 1)\nConsidering that LN boundaries are often unclear, an IoU prediction head and a\nlocation debiased query selection are proposed to select LN queries of higher\nlocalization accuracy as the decoder query's initialization. 2) To reduce FPs,\nquery contrastive learning is employed to explicitly reinforce LN queries\ntowards their best-matched ground-truth queries over unmatched query\npredictions. Trained and tested on 3D CT scans of 1067 patients (with 10,000+\nlabeled LNs) via combining seven LN datasets from different body parts (neck,\nchest, and abdomen) and pathologies/cancers, our method significantly improves\nthe performance of previous leading methods by > 4-5% average recall at the\nsame FP rates in both internal and external testing. We further evaluate on the\nuniversal lesion detection task using NIH DeepLesion benchmark, and our method\nachieves the top performance of 88.46% averaged recall across 0.5 to 4 FPs per\nimage, compared with other leading reported results.\n","authors":["Qinji Yu","Yirui Wang","Ke Yan","Haoshen Li","Dazhou Guo","Li Zhang","Le Lu","Na Shen","Qifeng Wang","Xiaowei Ding","Xianghua Ye","Dakai Jin"],"pdf_url":"https://arxiv.org/pdf/2404.03819v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2404.03799v1","updated":"2024-04-04T20:42:49Z","published":"2024-04-04T20:42:49Z","title":"Language-Guided Instance-Aware Domain-Adaptive Panoptic Segmentation","summary":" The increasing relevance of panoptic segmentation is tied to the advancements\nin autonomous driving and AR/VR applications. However, the deployment of such\nmodels has been limited due to the expensive nature of dense data annotation,\ngiving rise to unsupervised domain adaptation (UDA). A key challenge in\npanoptic UDA is reducing the domain gap between a labeled source and an\nunlabeled target domain while harmonizing the subtasks of semantic and instance\nsegmentation to limit catastrophic interference. While considerable progress\nhas been achieved, existing approaches mainly focus on the adaptation of\nsemantic segmentation. In this work, we focus on incorporating instance-level\nadaptation via a novel instance-aware cross-domain mixing strategy IMix. 
IMix\nsignificantly enhances the panoptic quality by improving instance segmentation\nperformance. Specifically, we propose inserting high-confidence predicted\ninstances from the target domain onto source images, retaining the\nexhaustiveness of the resulting pseudo-labels while reducing the injected\nconfirmation bias. Nevertheless, such an enhancement comes at the cost of\ndegraded semantic performance, attributed to catastrophic forgetting. To\nmitigate this issue, we regularize our semantic branch by employing CLIP-based\ndomain alignment (CDA), exploiting the domain-robustness of natural language\nprompts. Finally, we present an end-to-end model incorporating these two\nmechanisms called LIDAPS, achieving state-of-the-art results on all popular\npanoptic UDA benchmarks.\n","authors":["Elham Amin Mansour","Ozan Unal","Suman Saha","Benjamin Bejar","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2404.03799v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03789v1","updated":"2024-04-04T20:04:12Z","published":"2024-04-04T20:04:12Z","title":"Quantifying Uncertainty in Motion Prediction with Variational Bayesian\n Mixture","summary":" Safety and robustness are crucial factors in developing trustworthy\nautonomous vehicles. One essential aspect of addressing these factors is to\nequip vehicles with the capability to predict future trajectories for all\nmoving objects in the surroundings and quantify prediction uncertainties. In\nthis paper, we propose the Sequential Neural Variational Agent (SeNeVA), a\ngenerative model that describes the distribution of future trajectories for a\nsingle moving object. Our approach can distinguish Out-of-Distribution data\nwhile quantifying uncertainty and achieving competitive performance compared to\nstate-of-the-art methods on the Argoverse 2 and INTERACTION datasets.\nSpecifically, a 0.446 meters minimum Final Displacement Error, a 0.203 meters\nminimum Average Displacement Error, and a 5.35% Miss Rate are achieved on the\nINTERACTION test set. Extensive qualitative and quantitative analysis is also\nprovided to evaluate the proposed model. Our open-source code is available at\nhttps://github.com/PurdueDigitalTwin/seneva.\n","authors":["Juanwu Lu","Can Cui","Yunsheng Ma","Aniket Bera","Ziran Wang"],"pdf_url":"https://arxiv.org/pdf/2404.03789v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.03784v1","updated":"2024-04-04T19:55:11Z","published":"2024-04-04T19:55:11Z","title":"Layerwise Early Stopping for Test Time Adaptation","summary":" Test Time Adaptation (TTA) addresses the problem of distribution shift by\nenabling pretrained models to learn new features on an unseen domain at test\ntime. However, it poses a significant challenge to maintain a balance between\nlearning new features and retaining useful pretrained features. In this paper,\nwe propose Layerwise EArly STopping (LEAST) for TTA to address this problem.\nThe key idea is to stop adapting individual layers during TTA if the features\nbeing learned do not appear beneficial for the new domain. For that purpose, we\npropose using a novel gradient-based metric to measure the relevance of the\ncurrent learnt features to the new domain without the need for supervised\nlabels. More specifically, we propose to use this metric to determine\ndynamically when to stop updating each layer during TTA. This enables a more\nbalanced adaptation, restricted to layers benefiting from it, and only for a\ncertain number of steps. 
Such an approach also has the added effect of limiting\nthe forgetting of pretrained features useful for dealing with new domains.\nThrough extensive experiments, we demonstrate that Layerwise Early Stopping\nimproves the performance of existing TTA approaches across multiple datasets,\ndomain shifts, model architectures, and TTA losses.\n","authors":["Sabyasachi Sahoo","Mostafa ElAraby","Jonas Ngnawe","Yann Pequignot","Frederic Precioso","Christian Gagne"],"pdf_url":"https://arxiv.org/pdf/2404.03784v1.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.03778v1","updated":"2024-04-04T19:50:57Z","published":"2024-04-04T19:50:57Z","title":"Flattening the Parent Bias: Hierarchical Semantic Segmentation in the\n Poincaré Ball","summary":" Hierarchy is a natural representation of semantic taxonomies, including the\nones routinely used in image segmentation. Indeed, recent work on semantic\nsegmentation reports improved accuracy from supervised training leveraging\nhierarchical label structures. Encouraged by these results, we revisit the\nfundamental assumptions behind that work. We postulate and then empirically\nverify that the reasons for the observed improvement in segmentation accuracy\nmay be entirely unrelated to the use of the semantic hierarchy. To demonstrate\nthis, we design a range of cross-domain experiments with a representative\nhierarchical approach. We find that on the new testing domains, a flat\n(non-hierarchical) segmentation network, in which the parents are inferred from\nthe children, has superior segmentation accuracy to the hierarchical approach\nacross the board. Complementing these findings and inspired by the intrinsic\nproperties of hyperbolic spaces, we study a more principled approach to\nhierarchical segmentation using the Poincar\\'e ball model. The hyperbolic\nrepresentation largely outperforms the previous (Euclidean) hierarchical\napproach as well and is on par with our flat Euclidean baseline in terms of\nsegmentation accuracy. However, it additionally exhibits surprisingly strong\ncalibration quality of the parent nodes in the semantic hierarchy, especially\non the more challenging domains. Our combined analysis suggests that the\nestablished practice of hierarchical segmentation may be limited to in-domain\nsettings, whereas flat classifiers generalize substantially better, especially\nif they are modeled in the hyperbolic space.\n","authors":["Simon Weber","Barış Zöngür","Nikita Araslanov","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2404.03778v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02733v2","updated":"2024-04-04T19:42:32Z","published":"2024-04-03T13:34:09Z","title":"InstantStyle: Free Lunch towards Style-Preserving in Text-to-Image\n Generation","summary":" Tuning-free diffusion-based models have demonstrated significant potential in\nthe realm of image personalization and customization. However, despite this\nnotable progress, current models continue to grapple with several complex\nchallenges in producing style-consistent image generation. Firstly, the concept\nof style is inherently underdetermined, encompassing a multitude of elements\nsuch as color, material, atmosphere, design, and structure, among others.\nSecondly, inversion-based methods are prone to style degradation, often\nresulting in the loss of fine-grained details. Lastly, adapter-based approaches\nfrequently require meticulous weight tuning for each reference image to achieve\na balance between style intensity and text controllability. 
In this paper, we\ncommence by examining several compelling yet frequently overlooked\nobservations. We then proceed to introduce InstantStyle, a framework designed\nto address these issues through the implementation of two key strategies: 1) A\nstraightforward mechanism that decouples style and content from reference\nimages within the feature space, predicated on the assumption that features\nwithin the same space can be either added to or subtracted from one another. 2)\nThe injection of reference image features exclusively into style-specific\nblocks, thereby preventing style leaks and eschewing the need for cumbersome\nweight tuning, which often characterizes more parameter-heavy designs.Our work\ndemonstrates superior visual stylization outcomes, striking an optimal balance\nbetween the intensity of style and the controllability of textual elements. Our\ncodes will be available at https://github.com/InstantStyle/InstantStyle.\n","authors":["Haofan Wang","Matteo Spinelli","Qixun Wang","Xu Bai","Zekui Qin","Anthony Chen"],"pdf_url":"https://arxiv.org/pdf/2404.02733v2.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2307.00040v3","updated":"2024-04-04T19:41:09Z","published":"2023-06-30T17:37:48Z","title":"DisCo: Disentangled Control for Realistic Human Dance Generation","summary":" Generative AI has made significant strides in computer vision, particularly\nin text-driven image/video synthesis (T2I/T2V). Despite the notable\nadvancements, it remains challenging in human-centric content synthesis such as\nrealistic dance generation. Current methodologies, primarily tailored for human\nmotion transfer, encounter difficulties when confronted with real-world dance\nscenarios (e.g., social media dance), which require to generalize across a wide\nspectrum of poses and intricate human details. In this paper, we depart from\nthe traditional paradigm of human motion transfer and emphasize two additional\ncritical attributes for the synthesis of human dance content in social media\ncontexts: (i) Generalizability: the model should be able to generalize beyond\ngeneric human viewpoints as well as unseen human subjects, backgrounds, and\nposes; (ii) Compositionality: it should allow for the seamless composition of\nseen/unseen subjects, backgrounds, and poses from different sources. To address\nthese challenges, we introduce DISCO, which includes a novel model architecture\nwith disentangled control to improve the compositionality of dance synthesis,\nand an effective human attribute pre-training for better generalizability to\nunseen humans. Extensive qualitative and quantitative results demonstrate that\nDisCc can generate high-quality human dance images and videos with diverse\nappearances and flexible motions. 
Code is available at\nhttps://disco-dance.github.io/.\n","authors":["Tan Wang","Linjie Li","Kevin Lin","Yuanhao Zhai","Chung-Ching Lin","Zhengyuan Yang","Hanwang Zhang","Zicheng Liu","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2307.00040v3.pdf","comment":"Accepted by CVPR24"},{"id":"http://arxiv.org/abs/2312.12337v4","updated":"2024-04-04T19:04:55Z","published":"2023-12-19T17:03:50Z","title":"pixelSplat: 3D Gaussian Splats from Image Pairs for Scalable\n Generalizable 3D Reconstruction","summary":" We introduce pixelSplat, a feed-forward model that learns to reconstruct 3D\nradiance fields parameterized by 3D Gaussian primitives from pairs of images.\nOur model features real-time and memory-efficient rendering for scalable\ntraining as well as fast 3D reconstruction at inference time. To overcome local\nminima inherent to sparse and locally supported representations, we predict a\ndense probability distribution over 3D and sample Gaussian means from that\nprobability distribution. We make this sampling operation differentiable via a\nreparameterization trick, allowing us to back-propagate gradients through the\nGaussian splatting representation. We benchmark our method on wide-baseline\nnovel view synthesis on the real-world RealEstate10k and ACID datasets, where\nwe outperform state-of-the-art light field transformers and accelerate\nrendering by 2.5 orders of magnitude while reconstructing an interpretable and\neditable 3D radiance field.\n","authors":["David Charatan","Sizhe Li","Andrea Tagliasacchi","Vincent Sitzmann"],"pdf_url":"https://arxiv.org/pdf/2312.12337v4.pdf","comment":"Project page: https://dcharatan.github.io/pixelsplat"},{"id":"http://arxiv.org/abs/2305.07490v6","updated":"2024-04-04T18:55:18Z","published":"2023-05-12T14:04:30Z","title":"ArtGPT-4: Towards Artistic-understanding Large Vision-Language Models\n with Enhanced Adapter","summary":" The success of large language models (LLMs) has inspired an emerging research\nfield of multimodal learning. However, a grand challenge of exploiting LLMs for\nmultimodal learning is the size of pre-trained LLMs which are always with\nbillions of parameters. To tackle this challenge, models such as MiniGPT-4 and\nLLaVA have been developed to fine-tune the pre-trained models using fewer\nparameters. Despite their promising performance, these models remain limited in\ntheir understanding of artistic imagery. To facilitate better\nartistic-understanding, in this paper, we propose ArtGPT-4, a pioneering large\nvision-language model tailored to address the limitations of existing models in\nartistic comprehension. The key innovation of ArtGPT-4 lies in its craft for\nthe sophisticated challenge of artistic image comprehension, setting it apart\nfrom other models that overlook fine details for broader themes. Specifically,\nit works by integrating some specialized adapter layers into the LLM, enabling\nthe model to more efficiently and effectively parse and interpret complex\nvisual tokens, instead of fine-tuning the whole LLM as in the existing method.\nArtGPT-4 has demonstrated its outstanding performance on the efficiency:\nutilizing a Tesla A100 device, its training can be completed in mere 2 hours\nwith an image-text pair dataset comprising approximately 0.52M entries.\nAdditionally, ArtGPT-4 has also achieved state-of-the-art performance on the\nArtEmis and ArtEmis-v2.0 datasets as well as the benchmarks established in this\nwork, lagging behind professional artists' descriptions by a negligible 0.15\npoints on a 6-point scale. 
The outstanding performance of ArtGPT-4 shows that\nit can render images with an artistic-understanding and convey the emotions\nthey inspire, mirroring human interpretation. The code and the pre-trained\nmodel are accessible in \\url{https://github.com/DLYuanGod/ArtGPT-4}.\n","authors":["Zhengqing Yuan","Yunhong He","Kun Wang","Yanfang Ye","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2305.07490v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16862v2","updated":"2024-04-04T18:53:58Z","published":"2023-12-28T07:11:41Z","title":"TinyGPT-V: Efficient Multimodal Large Language Model via Small Backbones","summary":" In recent years, multimodal large language models (MLLMs) such as GPT-4V have\ndemonstrated remarkable advancements, excelling in a variety of vision-language\ntasks. Despite their prowess, the closed-source nature and computational\ndemands of such models limit their accessibility and applicability. This study\nintroduces TinyGPT-V, a novel open-source MLLM, designed for efficient training\nand inference across various vision-language tasks, including image captioning\n(IC) and visual question answering (VQA). Leveraging a compact yet powerful\narchitecture, TinyGPT-V integrates the Phi-2 language model with pre-trained\nvision encoders, utilizing a unique mapping module for visual and linguistic\ninformation fusion. With a training regimen optimized for small backbones and\nemploying a diverse dataset amalgam, TinyGPT-V requires significantly lower\ncomputational resources 24GB for training and as little as 8GB for inference\nwithout compromising on performance. Our experiments demonstrate that\nTinyGPT-V, with its language model 2.8 billion parameters, achieves comparable\nresults in VQA and image inference tasks to its larger counterparts while being\nuniquely suited for deployment on resource-constrained devices through\ninnovative quantization techniques. This work not only paves the way for more\naccessible and efficient MLLMs but also underscores the potential of smaller,\noptimized models in bridging the gap between high performance and computational\nefficiency in real-world applications. Additionally, this paper introduces a\nnew approach to multimodal large language models using smaller backbones. Our\ncode and training weights are available in\n\\url{https://github.com/DLYuanGod/TinyGPT-V}.\n","authors":["Zhengqing Yuan","Zhaoxu Li","Weiran Huang","Yanfang Ye","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2312.16862v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03754v1","updated":"2024-04-04T18:50:58Z","published":"2024-04-04T18:50:58Z","title":"Data Science for Geographic Information Systems","summary":" The integration of data science into Geographic Information Systems (GIS) has\nfacilitated the evolution of these tools into complete spatial analysis\nplatforms. The adoption of machine learning and big data techniques has\nequipped these platforms with the capacity to handle larger amounts of\nincreasingly complex data, transcending the limitations of more traditional\napproaches. This work traces the historical and technical evolution of data\nscience and GIS as fields of study, highlighting the critical points of\nconvergence between domains, and underlining the many sectors that rely on this\nintegration. A GIS application is presented as a case study in the disaster\nmanagement sector where we utilize aerial data from Tr\\'oia, Portugal, to\nemphasize the process of insight extraction from raw data. 
We conclude by\noutlining prospects for future research in integration of these fields in\ngeneral, and the developed application in particular.\n","authors":["Afonso Oliveira","Nuno Fachada","João P. Matos-Carvalho"],"pdf_url":"https://arxiv.org/pdf/2404.03754v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03743v1","updated":"2024-04-04T18:31:24Z","published":"2024-04-04T18:31:24Z","title":"Test Time Training for Industrial Anomaly Segmentation","summary":" Anomaly Detection and Segmentation (AD&S) is crucial for industrial quality\ncontrol. While existing methods excel in generating anomaly scores for each\npixel, practical applications require producing a binary segmentation to\nidentify anomalies. Due to the absence of labeled anomalies in many real\nscenarios, standard practices binarize these maps based on some statistics\nderived from a validation set containing only nominal samples, resulting in\npoor segmentation performance. This paper addresses this problem by proposing a\ntest time training strategy to improve the segmentation performance. Indeed, at\ntest time, we can extract rich features directly from anomalous samples to\ntrain a classifier that can discriminate defects effectively. Our general\napproach can work downstream to any AD&S method that provides an anomaly score\nmap as output, even in multimodal settings. We demonstrate the effectiveness of\nour approach over baselines through extensive experimentation and evaluation on\nMVTec AD and MVTec 3D-AD.\n","authors":["Alex Costanzino","Pierluigi Zama Ramirez","Mirko Del Moro","Agostino Aiezzo","Giuseppe Lisanti","Samuele Salti","Luigi Di Stefano"],"pdf_url":"https://arxiv.org/pdf/2404.03743v1.pdf","comment":"Accepted at VAND 2.0, CVPRW 2024"},{"id":"http://arxiv.org/abs/2404.01112v3","updated":"2024-04-04T18:31:05Z","published":"2024-04-01T13:38:16Z","title":"Few-shot point cloud reconstruction and denoising via learned Guassian\n splats renderings and fine-tuned diffusion features","summary":" Existing deep learning methods for the reconstruction and denoising of point\nclouds rely on small datasets of 3D shapes. We circumvent the problem by\nleveraging deep learning methods trained on billions of images. We propose a\nmethod to reconstruct point clouds from few images and to denoise point clouds\nfrom their rendering by exploiting prior knowledge distilled from image-based\ndeep learning models. To improve reconstruction in constraint settings, we\nregularize the training of a differentiable renderer with hybrid surface and\nappearance by introducing semantic consistency supervision. In addition, we\npropose a pipeline to finetune Stable Diffusion to denoise renderings of noisy\npoint clouds and we demonstrate how these learned filters can be used to remove\npoint cloud noise coming without 3D supervision. We compare our method with DSS\nand PointRadiance and achieved higher quality 3D reconstruction on the\nSketchfab Testset and SCUT Dataset.\n","authors":["Pietro Bonazzi"],"pdf_url":"https://arxiv.org/pdf/2404.01112v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01887v2","updated":"2024-04-04T18:29:00Z","published":"2024-04-02T12:26:17Z","title":"3D scene generation from scene graphs and self-attention","summary":" Synthesizing realistic and diverse indoor 3D scene layouts in a controllable\nfashion opens up applications in simulated navigation and virtual reality. 
As\nconcise and robust representations of a scene, scene graphs have proven to be\nwell-suited as the semantic control on the generated layout. We present a\nvariant of the conditional variational autoencoder (cVAE) model to synthesize\n3D scenes from scene graphs and floor plans. We exploit the properties of\nself-attention layers to capture high-level relationships between objects in a\nscene, and use these as the building blocks of our model. Our model, leverages\ngraph transformers to estimate the size, dimension and orientation of the\nobjects in a room while satisfying relationships in the given scene graph. Our\nexperiments shows self-attention layers leads to sparser (7.9x compared to\nGraphto3D) and more diverse scenes (16%).\n","authors":["Pietro Bonazzi"],"pdf_url":"https://arxiv.org/pdf/2404.01887v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03736v1","updated":"2024-04-04T18:05:18Z","published":"2024-04-04T18:05:18Z","title":"SC4D: Sparse-Controlled Video-to-4D Generation and Motion Transfer","summary":" Recent advances in 2D/3D generative models enable the generation of dynamic\n3D objects from a single-view video. Existing approaches utilize score\ndistillation sampling to form the dynamic scene as dynamic NeRF or dense 3D\nGaussians. However, these methods struggle to strike a balance among reference\nview alignment, spatio-temporal consistency, and motion fidelity under\nsingle-view conditions due to the implicit nature of NeRF or the intricate\ndense Gaussian motion prediction. To address these issues, this paper proposes\nan efficient, sparse-controlled video-to-4D framework named SC4D, that\ndecouples motion and appearance to achieve superior video-to-4D generation.\nMoreover, we introduce Adaptive Gaussian (AG) initialization and Gaussian\nAlignment (GA) loss to mitigate shape degeneration issue, ensuring the fidelity\nof the learned motion and shape. Comprehensive experimental results demonstrate\nthat our method surpasses existing methods in both quality and efficiency. In\naddition, facilitated by the disentangled modeling of motion and appearance of\nSC4D, we devise a novel application that seamlessly transfers the learned\nmotion onto a diverse array of 4D entities according to textual descriptions.\n","authors":["Zijie Wu","Chaohui Yu","Yanqin Jiang","Chenjie Cao","Fan Wang","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2404.03736v1.pdf","comment":"Project Page: https://sc4d.github.io/"},{"id":"http://arxiv.org/abs/2304.01834v4","updated":"2024-04-04T18:01:47Z","published":"2023-04-04T14:39:44Z","title":"Neural Field Convolutions by Repeated Differentiation","summary":" Neural fields are evolving towards a general-purpose continuous\nrepresentation for visual computing. Yet, despite their numerous appealing\nproperties, they are hardly amenable to signal processing. As a remedy, we\npresent a method to perform general continuous convolutions with general\ncontinuous signals such as neural fields. Observing that piecewise polynomial\nkernels reduce to a sparse set of Dirac deltas after repeated differentiation,\nwe leverage convolution identities and train a repeated integral field to\nefficiently execute large-scale convolutions. 
We demonstrate our approach on a\nvariety of data modalities and spatially-varying kernels.\n","authors":["Ntumba Elie Nsampi","Adarsh Djeacoumar","Hans-Peter Seidel","Tobias Ritschel","Thomas Leimkühler"],"pdf_url":"https://arxiv.org/pdf/2304.01834v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04125v1","updated":"2024-04-04T17:58:02Z","published":"2024-04-04T17:58:02Z","title":"No \"Zero-Shot\" Without Exponential Data: Pretraining Concept Frequency\n Determines Multimodal Model Performance","summary":" Web-crawled pretraining datasets underlie the impressive \"zero-shot\"\nevaluation performance of multimodal models, such as CLIP for\nclassification/retrieval and Stable-Diffusion for image generation. However, it\nis unclear how meaningful the notion of \"zero-shot\" generalization is for such\nmultimodal models, as it is not known to what extent their pretraining datasets\nencompass the downstream concepts targeted for during \"zero-shot\" evaluation.\nIn this work, we ask: How is the performance of multimodal models on downstream\nconcepts influenced by the frequency of these concepts in their pretraining\ndatasets? We comprehensively investigate this question across 34 models and\nfive standard pretraining datasets (CC-3M, CC-12M, YFCC-15M, LAION-400M,\nLAION-Aesthetics), generating over 300GB of data artifacts. We consistently\nfind that, far from exhibiting \"zero-shot\" generalization, multimodal models\nrequire exponentially more data to achieve linear improvements in downstream\n\"zero-shot\" performance, following a sample inefficient log-linear scaling\ntrend. This trend persists even when controlling for sample-level similarity\nbetween pretraining and downstream datasets, and testing on purely synthetic\ndata distributions. Furthermore, upon benchmarking models on long-tailed data\nsampled based on our analysis, we demonstrate that multimodal models across the\nboard perform poorly. We contribute this long-tail test set as the \"Let it\nWag!\" benchmark to further research in this direction. Taken together, our\nstudy reveals an exponential need for training data which implies that the key\nto \"zero-shot\" generalization capabilities under large-scale training paradigms\nremains to be found.\n","authors":["Vishaal Udandarao","Ameya Prabhu","Adhiraj Ghosh","Yash Sharma","Philip H. S. Torr","Adel Bibi","Samuel Albanie","Matthias Bethge"],"pdf_url":"https://arxiv.org/pdf/2404.04125v1.pdf","comment":"Extended version of the short paper accepted at DPFM, ICLR'24"},{"id":"http://arxiv.org/abs/2404.03713v1","updated":"2024-04-04T17:46:20Z","published":"2024-04-04T17:46:20Z","title":"Explaining Explainability: Understanding Concept Activation Vectors","summary":" Recent interpretability methods propose using concept-based explanations to\ntranslate the internal representations of deep learning models into a language\nthat humans are familiar with: concepts. This requires understanding which\nconcepts are present in the representation space of a neural network. One\npopular method for finding concepts is Concept Activation Vectors (CAVs), which\nare learnt using a probe dataset of concept exemplars. In this work, we\ninvestigate three properties of CAVs. CAVs may be: (1) inconsistent between\nlayers, (2) entangled with different concepts, and (3) spatially dependent.\nEach property provides both challenges and opportunities in interpreting\nmodels. 
We introduce tools designed to detect the presence of these properties,\nprovide insight into how they affect the derived explanations, and provide\nrecommendations to minimise their impact. Understanding these properties can be\nused to our advantage. For example, we introduce spatially dependent CAVs to\ntest if a model is translation invariant with respect to a specific concept and\nclass. Our experiments are performed on ImageNet and a new synthetic dataset,\nElements. Elements is designed to capture a known ground truth relationship\nbetween concepts and classes. We release this dataset to facilitate further\nresearch in understanding and evaluating interpretability methods.\n","authors":["Angus Nicolson","Lisa Schut","J. Alison Noble","Yarin Gal"],"pdf_url":"https://arxiv.org/pdf/2404.03713v1.pdf","comment":"(54 pages, 39 figures)"}]},"2024-04-05T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2404.04253v1","updated":"2024-04-05T17:58:37Z","published":"2024-04-05T17:58:37Z","title":"Growing Q-Networks: Solving Continuous Control Tasks with Adaptive\n Control Resolution","summary":" Recent reinforcement learning approaches have shown surprisingly strong\ncapabilities of bang-bang policies for solving continuous control benchmarks.\nThe underlying coarse action space discretizations often yield favourable\nexploration characteristics while final performance does not visibly suffer in\nthe absence of action penalization in line with optimal control theory. In\nrobotics applications, smooth control signals are commonly preferred to reduce\nsystem wear and energy efficiency, but action costs can be detrimental to\nexploration during early training. In this work, we aim to bridge this\nperformance gap by growing discrete action spaces from coarse to fine control\nresolution, taking advantage of recent results in decoupled Q-learning to scale\nour approach to high-dimensional action spaces up to dim(A) = 38. Our work\nindicates that an adaptive control resolution in combination with value\ndecomposition yields simple critic-only algorithms that yield surprisingly\nstrong performance on continuous control tasks.\n","authors":["Tim Seyde","Peter Werner","Wilko Schwarting","Markus Wulfmeier","Daniela Rus"],"pdf_url":"https://arxiv.org/pdf/2404.04253v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08240v2","updated":"2024-04-05T17:56:12Z","published":"2023-12-13T16:01:50Z","title":"CenterGrasp: Object-Aware Implicit Representation Learning for\n Simultaneous Shape Reconstruction and 6-DoF Grasp Estimation","summary":" Reliable object grasping is a crucial capability for autonomous robots.\nHowever, many existing grasping approaches focus on general clutter removal\nwithout explicitly modeling objects and thus only relying on the visible local\ngeometry. We introduce CenterGrasp, a novel framework that combines object\nawareness and holistic grasping. CenterGrasp learns a general object prior by\nencoding shapes and valid grasps in a continuous latent space. It consists of\nan RGB-D image encoder that leverages recent advances to detect objects and\ninfer their pose and latent code, and a decoder to predict shape and grasps for\neach object in the scene. We perform extensive experiments on simulated as well\nas real-world cluttered scenes and demonstrate strong scene reconstruction and\n6-DoF grasp-pose estimation performance. 
Compared to the state of the art,\nCenterGrasp achieves an improvement of 38.5 mm in shape reconstruction and 33\npercentage points on average in grasp success. We make the code and trained\nmodels publicly available at http://centergrasp.cs.uni-freiburg.de.\n","authors":["Eugenio Chisari","Nick Heppert","Tim Welschehold","Wolfram Burgard","Abhinav Valada"],"pdf_url":"https://arxiv.org/pdf/2312.08240v2.pdf","comment":"Accepted at RA-L. Video, code and models available at\n http://centergrasp.cs.uni-freiburg.de"},{"id":"http://arxiv.org/abs/2404.04249v1","updated":"2024-04-05T17:54:12Z","published":"2024-04-05T17:54:12Z","title":"Humanoid Robots at work: where are we ?","summary":" Launched by Elon Musk and its Optimus, we are witnessing a new race in which\nmany companies have already engaged. The objective it to put at work a new\ngeneration of humanoid robots in demanding industrial environments within 2 or\n3 years. Is this objective realistic ? The aim of this document and its main\ncontributions is to provide some hints by covering the following topics: First\nan analysis of 12 companies based on eight criteria that will help us to\ndistinguish companies based on their maturity and approach to the market;\nsecond as these humanoids are very complex systems we will provide an overview\nof the technological challenges to be addressed; third when humanoids are\ndeployed at scale, Operation and Maintenance become critical and the we will\nexplore what is new with these complex machines; Finally Pilots are the last\nstep to test the feasibility of a new system before mass deployment. This is an\nimportant step to test the maturity of a product and the strategy of the\nhumanoid supplier to address a market and two pragmatic approaches will be\ndiscussed.\n","authors":["Fabrice R. Noreils"],"pdf_url":"https://arxiv.org/pdf/2404.04249v1.pdf","comment":"30 pages 16 figures"},{"id":"http://arxiv.org/abs/2404.04241v1","updated":"2024-04-05T17:41:59Z","published":"2024-04-05T17:41:59Z","title":"Modeling Kinematic Uncertainty of Tendon-Driven Continuum Robots via\n Mixture Density Networks","summary":" Tendon-driven continuum robot kinematic models are frequently computationally\nexpensive, inaccurate due to unmodeled effects, or both. In particular,\nunmodeled effects produce uncertainties that arise during the robot's operation\nthat lead to variability in the resulting geometry. We propose a novel solution\nto these issues through the development of a Gaussian mixture kinematic model.\nWe train a mixture density network to output a Gaussian mixture model\nrepresentation of the robot geometry given the current tendon displacements.\nThis model computes a probability distribution that is more representative of\nthe true distribution of geometries at a given configuration than a model that\noutputs a single geometry, while also reducing the computation time. We\ndemonstrate one use of this model through a trajectory optimization method that\nexplicitly reasons about the workspace uncertainty to minimize the probability\nof collision.\n","authors":["Jordan Thompson","Brian Y. Cho","Daniel S. Brown","Alan Kuntz"],"pdf_url":"https://arxiv.org/pdf/2404.04241v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04220v1","updated":"2024-04-05T17:06:03Z","published":"2024-04-05T17:06:03Z","title":"Multi-modal perception for soft robotic interactions using generative\n models","summary":" Perception is essential for the active interaction of physical agents with\nthe external environment. 
The integration of multiple sensory modalities, such\nas touch and vision, enhances this perceptual process, creating a more\ncomprehensive and robust understanding of the world. Such fusion is\nparticularly useful for highly deformable bodies such as soft robots.\nDeveloping a compact, yet comprehensive state representation from multi-sensory\ninputs can pave the way for the development of complex control strategies. This\npaper introduces a perception model that harmonizes data from diverse\nmodalities to build a holistic state representation and assimilate essential\ninformation. The model relies on the causality between sensory input and\nrobotic actions, employing a generative model to efficiently compress fused\ninformation and predict the next observation. We present, for the first time, a\nstudy on how touch can be predicted from vision and proprioception on soft\nrobots, the importance of the cross-modal generation and why this is essential\nfor soft robotic interactions in unstructured environments.\n","authors":["Enrico Donato","Egidio Falotico","Thomas George Thuruthel"],"pdf_url":"https://arxiv.org/pdf/2404.04220v1.pdf","comment":"Accepted for presentation at IEEE RoboSoft 2024"},{"id":"http://arxiv.org/abs/2404.04219v1","updated":"2024-04-05T17:05:45Z","published":"2024-04-05T17:05:45Z","title":"Continual Policy Distillation of Reinforcement Learning-based\n Controllers for Soft Robotic In-Hand Manipulation","summary":" Dexterous manipulation, often facilitated by multi-fingered robotic hands,\nholds solid impact for real-world applications. Soft robotic hands, due to\ntheir compliant nature, offer flexibility and adaptability during object\ngrasping and manipulation. Yet, benefits come with challenges, particularly in\nthe control development for finger coordination. Reinforcement Learning (RL)\ncan be employed to train object-specific in-hand manipulation policies, but\nlimiting adaptability and generalizability. We introduce a Continual Policy\nDistillation (CPD) framework to acquire a versatile controller for in-hand\nmanipulation, to rotate different objects in shape and size within a\nfour-fingered soft gripper. The framework leverages Policy Distillation (PD) to\ntransfer knowledge from expert policies to a continually evolving student\npolicy network. Exemplar-based rehearsal methods are then integrated to\nmitigate catastrophic forgetting and enhance generalization. The performance of\nthe CPD framework over various replay strategies demonstrates its effectiveness\nin consolidating knowledge from multiple experts and achieving versatile and\nadaptive behaviours for in-hand manipulation tasks.\n","authors":["Lanpei Li","Enrico Donato","Vincenzo Lomonaco","Egidio Falotico"],"pdf_url":"https://arxiv.org/pdf/2404.04219v1.pdf","comment":"Accepted for presentation at IEEE RoboSoft 2024"},{"id":"http://arxiv.org/abs/2404.04197v1","updated":"2024-04-05T16:08:59Z","published":"2024-04-05T16:08:59Z","title":"Convex MPC and Thrust Allocation with Deadband for Spacecraft Rendezvous","summary":" This paper delves into a rendezvous scenario involving a chaser and a target\nspacecraft, focusing on the application of Model Predictive Control (MPC) to\ndesign a controller capable of guiding the chaser toward the target. 
The\noperational principle of spacecraft thrusters, requiring a minimum activation\ntime that leads to the existence of a control deadband, introduces\nmixed-integer constraints into the optimization, posing a considerable\ncomputational challenge due to the exponential complexity on the number of\ninteger constraints. We address this complexity by presenting two solver\nalgorithms that efficiently approximate the optimal solution in significantly\nless time than standard solvers, making them well-suited for real-time\napplications.\n","authors":["Pedro Taborda","Hugo Matias","Daniel Silvestre","Pedro Lourenço"],"pdf_url":"https://arxiv.org/pdf/2404.04197v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04193v1","updated":"2024-04-05T16:05:42Z","published":"2024-04-05T16:05:42Z","title":"ToolEENet: Tool Affordance 6D Pose Estimation","summary":" The exploration of robotic dexterous hands utilizing tools has recently\nattracted considerable attention. A significant challenge in this field is the\nprecise awareness of a tool's pose when grasped, as occlusion by the hand often\ndegrades the quality of the estimation. Additionally, the tool's overall pose\noften fails to accurately represent the contact interaction, thereby limiting\nthe effectiveness of vision-guided, contact-dependent activities. To overcome\nthis limitation, we present the innovative TOOLEE dataset, which, to the best\nof our knowledge, is the first to feature affordance segmentation of a tool's\nend-effector (EE) along with its defined 6D pose based on its usage.\nFurthermore, we propose the ToolEENet framework for accurate 6D pose estimation\nof the tool's EE. This framework begins by segmenting the tool's EE from raw\nRGBD data, then uses a diffusion model-based pose estimator for 6D pose\nestimation at a category-specific level. Addressing the issue of symmetry in\npose estimation, we introduce a symmetry-aware pose representation that\nenhances the consistency of pose estimation. Our approach excels in this field,\ndemonstrating high levels of precision and generalization. Furthermore, it\nshows great promise for application in contact-based manipulation scenarios.\nAll data and codes are available on the project website:\nhttps://yuyangtu.github.io/projectToolEENet.html\n","authors":["Yunlong Wang","Lei Zhang","Yuyang Tu","Hui Zhang","Kaixin Bai","Zhaopeng Chen","Jianwei Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.04193v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04186v1","updated":"2024-04-05T15:59:44Z","published":"2024-04-05T15:59:44Z","title":"Probabilistically Informed Robot Object Search with Multiple Regions","summary":" The increasing use of autonomous robot systems in hazardous environments\nunderscores the need for efficient search and rescue operations. Despite\nsignificant advancements, existing literature on object search often falls\nshort in overcoming the difficulty of long planning horizons and dealing with\nsensor limitations, such as noise. This study introduces a novel approach that\nformulates the search problem as a belief Markov decision processes with\noptions (BMDP-O) to make Monte Carlo tree search (MCTS) a viable tool for\novercoming these challenges in large scale environments. The proposed\nformulation incorporates sequences of actions (options) to move between regions\nof interest, enabling the algorithm to efficiently scale to large environments.\nThis approach also enables the use of customizable fields of view, for use with\nmultiple types of sensors. 
Experimental results demonstrate the superiority of\nthis approach in large environments when compared to the problem without\noptions and alternative tools such as receding horizon planners. Given compute\ntime for the proposed formulation is relatively high, a further approximated\n\"lite\" formulation is proposed. The lite formulation finds objects in a\ncomparable number of steps with faster computation.\n","authors":["Matthew Collins","Jared J. Beard","Nicholas Ohi","Yu Gu"],"pdf_url":"https://arxiv.org/pdf/2404.04186v1.pdf","comment":"6 pages, 7 figures. Submitted to the 2024 IEEE/RSJ International\n Conference on Intelligent Robots and Systems in Abu Dhabi, UAE (Oct 14-18,\n 2024)"},{"id":"http://arxiv.org/abs/2404.04123v1","updated":"2024-04-05T14:22:56Z","published":"2024-04-05T14:22:56Z","title":"Designing Robots to Help Women","summary":" Robots are being designed to help people in an increasing variety of\nsettings--but seemingly little attention has been given so far to the specific\nneeds of women, who represent roughly half of the world's population but are\nhighly underrepresented in robotics. Here we used a speculative prototyping\napproach to explore this expansive design space: First, we identified some\npotential challenges of interest, including crimes and illnesses that\ndisproportionately affect women, as well as potential opportunities for\ndesigners, which were visualized in five sketches. Then, one of the sketched\nscenarios was further explored by developing a prototype, of a robotic helper\ndrone equipped with computer vision to detect hidden cameras that could be used\nto spy on women. While object detection introduced some errors, hidden cameras\nwere identified with a reasonable accuracy of 80\\% (Intersection over Union\n(IoU) score: 0.40). Our aim is that the identified challenges and opportunities\ncould help spark discussion and inspire designers, toward realizing a safer,\nmore inclusive future through responsible use of technology.\n","authors":["Martin Cooney","Lena Klasén","Fernando Alonso-Fernandez"],"pdf_url":"https://arxiv.org/pdf/2404.04123v1.pdf","comment":"10 pages, submitted 2024-4-5 to SCAI"},{"id":"http://arxiv.org/abs/2310.12729v2","updated":"2024-04-05T14:02:12Z","published":"2023-10-19T13:29:05Z","title":"Advancements in Radar Odometry","summary":" Radar odometry estimation has emerged as a critical technique in the field of\nautonomous navigation, providing robust and reliable motion estimation under\nvarious environmental conditions. Despite its potential, the complex nature of\nradar signals and the inherent challenges associated with processing these\nsignals have limited the widespread adoption of this technology. This paper\naims to address these challenges by proposing novel improvements to an existing\nmethod for radar odometry estimation, designed to enhance accuracy and\nreliability in diverse scenarios. Our pipeline consists of filtering, motion\ncompensation, oriented surface points computation, smoothing, one-to-many radar\nscan registration, and pose refinement. The developed method enforces local\nunderstanding of the scene, by adding additional information through smoothing\ntechniques, and alignment of consecutive scans, as a refinement posterior to\nthe one-to-many registration. 
We present an in-depth investigation of the\ncontribution of each improvement to the localization accuracy, and we benchmark\nour system on the sequences of the main datasets for radar understanding, i.e.,\nthe Oxford Radar RobotCar, MulRan, and Boreas datasets. The proposed pipeline\nis able to achieve superior results, on all scenarios considered and under\nharsh environmental constraints.\n","authors":["Matteo Frosi","Mirko Usuelli","Matteo Matteucci"],"pdf_url":"https://arxiv.org/pdf/2310.12729v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04079v1","updated":"2024-04-05T13:12:17Z","published":"2024-04-05T13:12:17Z","title":"Self-Sensing Feedback Control of an Electrohydraulic Robotic Shoulder","summary":" The human shoulder, with its glenohumeral joint, tendons, ligaments, and\nmuscles, allows for the execution of complex tasks with precision and\nefficiency. However, current robotic shoulder designs lack the compliance and\ncompactness inherent in their biological counterparts. A major limitation of\nthese designs is their reliance on external sensors like rotary encoders, which\nrestrict mechanical joint design and introduce bulk to the system. To address\nthis constraint, we present a bio-inspired antagonistic robotic shoulder with\ntwo degrees of freedom powered by self-sensing hydraulically amplified\nself-healing electrostatic actuators. Our artificial muscle design decouples\nthe high-voltage electrostatic actuation from the pair of low-voltage\nself-sensing electrodes. This approach allows for proprioceptive feedback\ncontrol of trajectories in the task space while eliminating the necessity for\nany additional sensors. We assess the platform's efficacy by comparing it to a\nfeedback control based on position data provided by a motion capture system.\nThe study demonstrates closed-loop controllable robotic manipulators based on\nan inherent self-sensing capability of electrohydraulic actuators. The proposed\narchitecture can serve as a basis for complex musculoskeletal joint\narrangements.\n","authors":["Clemens C. Christoph","Amirhossein Kazemipour","Michel R. Vogt","Yu Zhang","Robert K. Katzschmann"],"pdf_url":"https://arxiv.org/pdf/2404.04079v1.pdf","comment":"7 pages, 8 figures"},{"id":"http://arxiv.org/abs/2404.04071v1","updated":"2024-04-05T12:55:04Z","published":"2024-04-05T12:55:04Z","title":"High-Frequency Capacitive Sensing for Electrohydraulic Soft Actuators","summary":" The need for compliant and proprioceptive actuators has grown more evident in\npursuing more adaptable and versatile robotic systems. Hydraulically Amplified\nSelf-Healing Electrostatic (HASEL) actuators offer distinctive advantages with\ntheir inherent softness and flexibility, making them promising candidates for\nvarious robotic tasks, including delicate interactions with humans and animals,\nbiomimetic locomotion, prosthetics, and exoskeletons. This has resulted in a\ngrowing interest in the capacitive self-sensing capabilities of HASEL actuators\nto create miniature displacement estimation circuitry that does not require\nexternal sensors. However, achieving HASEL self-sensing for actuation\nfrequencies above 1 Hz and with miniature high-voltage power supplies has\nremained limited. In this paper, we introduce the F-HASEL actuator, which adds\nan additional electrode pair used exclusively for capacitive sensing to a\nPeano-HASEL actuator. 
We demonstrate displacement estimation of the F-HASEL\nduring high-frequency actuation up to 20 Hz and during external loading using\nminiaturized circuitry comprised of low-cost off-the-shelf components and a\nminiature high-voltage power supply. Finally, we propose a circuitry to\nestimate the displacement of multiple F-HASELs and demonstrate it in a wearable\napplication to track joint rotations of a virtual reality user in real-time.\n","authors":["Michel R. Vogt","Maximilian Eberlein","Clemens C. Christoph","Felix Baumann","Fabrice Bourquin","Wim Wende","Fabio Schaub","Amirhossein Kazemipour","Robert K. Katzschmann"],"pdf_url":"https://arxiv.org/pdf/2404.04071v1.pdf","comment":"8 pages, 11 figures"},{"id":"http://arxiv.org/abs/2404.04069v1","updated":"2024-04-05T12:52:17Z","published":"2024-04-05T12:52:17Z","title":"Bidirectional Human Interactive AI Framework for Social Robot Navigation","summary":" Trustworthiness is a crucial concept in the context of human-robot\ninteraction. Cooperative robots must be transparent regarding their\ndecision-making process, especially when operating in a human-oriented\nenvironment. This paper presents a comprehensive end-to-end framework aimed at\nfostering trustworthy bidirectional human-robot interaction in collaborative\nenvironments for the social navigation of mobile robots. Our method enables a\nmobile robot to predict the trajectory of people and adjust its route in a\nsocially-aware manner. In case of conflict between human and robot decisions,\ndetected through visual examination, the route is dynamically modified based on\nhuman preference while verbal communication is maintained. We present our\npipeline, framework design, and preliminary experiments that form the\nfoundation of our proposition.\n","authors":["Tuba Girgin","Emre Girgin","Yigit Yildirim","Emre Ugur","Mehmet Haklidir"],"pdf_url":"https://arxiv.org/pdf/2404.04069v1.pdf","comment":"Accepted by Robot Trust for Symbiotic Societies (RTSS) Workshop at\n ICRA 2024"},{"id":"http://arxiv.org/abs/2404.04066v1","updated":"2024-04-05T12:45:10Z","published":"2024-04-05T12:45:10Z","title":"VoicePilot: Harnessing LLMs as Speech Interfaces for Physically\n Assistive Robots","summary":" Physically assistive robots present an opportunity to significantly increase\nthe well-being and independence of individuals with motor impairments or other\nforms of disability who are unable to complete activities of daily living.\nSpeech interfaces, especially ones that utilize Large Language Models (LLMs),\ncan enable individuals to effectively and naturally communicate high-level\ncommands and nuanced preferences to robots. Frameworks for integrating LLMs as\ninterfaces to robots for high level task planning and code generation have been\nproposed, but fail to incorporate human-centric considerations which are\nessential while developing assistive interfaces. In this work, we present a\nframework for incorporating LLMs as speech interfaces for physically assistive\nrobots, constructed iteratively with 3 stages of testing involving a feeding\nrobot, culminating in an evaluation with 11 older adults at an independent\nliving facility. We use both quantitative and qualitative data from the final\nstudy to validate our framework and additionally provide design guidelines for\nusing LLMs as speech interfaces for assistive robots. 
Videos and supporting\nfiles are located on our project website:\nhttps://sites.google.com/andrew.cmu.edu/voicepilot/\n","authors":["Akhil Padmanabha","Jessie Yuan","Janavi Gupta","Zulekha Karachiwalla","Carmel Majidi","Henny Admoni","Zackory Erickson"],"pdf_url":"https://arxiv.org/pdf/2404.04066v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.01414v2","updated":"2024-04-05T12:03:26Z","published":"2022-07-04T13:49:29Z","title":"Controlling the Cascade: Kinematic Planning for N-ball Toss Juggling","summary":" Dynamic movements are ubiquitous in human motor behavior as they tend to be\nmore efficient and can solve a broader range of skill domains than their\nquasi-static counterparts. For decades, robotic juggling tasks have been among\nthe most frequently studied dynamic manipulation problems since the required\ndynamic dexterity can be scaled to arbitrarily high difficulty. However,\nsuccessful approaches have been limited to basic juggling skills, indicating a\nlack of understanding of the required constraints for dexterous toss juggling.\nWe present a detailed analysis of the toss juggling task, identifying the key\nchallenges and formalizing it as a trajectory optimization problem. Building on\nour state-of-the-art, real-world toss juggling platform, we reach the\ntheoretical limits of toss juggling in simulation, evaluate a resulting\nreal-time controller in environments of varying difficulty and achieve robust\ntoss juggling of up to 17 balls on two anthropomorphic manipulators.\n","authors":["Kai Ploeger","Jan Peters"],"pdf_url":"https://arxiv.org/pdf/2207.01414v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03493v2","updated":"2024-04-05T11:42:57Z","published":"2024-04-04T14:48:26Z","title":"A Methodology to Study the Impact of Spiking Neural Network Parameters\n considering Event-Based Automotive Data","summary":" Autonomous Driving (AD) systems are considered as the future of human\nmobility and transportation. Solving computer vision tasks such as image\nclassification and object detection/segmentation, with high accuracy and low\npower/energy consumption, is highly needed to realize AD systems in real life.\nThese requirements can potentially be satisfied by Spiking Neural Networks\n(SNNs). However, the state-of-the-art works in SNN-based AD systems still focus\non proposing network models that can achieve high accuracy, and they have not\nsystematically studied the roles of SNN parameters when used for learning\nevent-based automotive data. Therefore, we still lack understanding of how to\neffectively develop SNN models for AD systems. Toward this, we propose a novel\nmethodology to systematically study and analyze the impact of SNN parameters\nconsidering event-based automotive data, then leverage this analysis for\nenhancing SNN developments. To do this, we first explore different settings of\nSNN parameters that directly affect the learning mechanism (i.e., batch size,\nlearning rate, neuron threshold potential, and weight decay), then analyze the\naccuracy results. Afterward, we propose techniques that jointly improve SNN\naccuracy and reduce training time. Experimental results show that our\nmethodology can improve the SNN models for AD systems than the\nstate-of-the-art, as it achieves higher accuracy (i.e., 86%) for the NCARS\ndataset, and it can also achieve iso-accuracy (i.e., ~85% with standard\ndeviation less than 0.5%) while speeding up the training time by 1.9x. 
In this\nmanner, our research work provides a set of guidelines for SNN parameter\nenhancements, thereby enabling the practical developments of SNN-based AD\nsystems.\n","authors":["Iqra Bano","Rachmad Vidya Wicaksana Putra","Alberto Marchisio","Muhammad Shafique"],"pdf_url":"https://arxiv.org/pdf/2404.03493v2.pdf","comment":"7 pages, 13 figures, 1 table"},{"id":"http://arxiv.org/abs/2308.05194v3","updated":"2024-04-05T11:14:39Z","published":"2023-08-09T19:21:50Z","title":"Evaluating Pedestrian Trajectory Prediction Methods with Respect to\n Autonomous Driving","summary":" In this paper, we assess the state of the art in pedestrian trajectory\nprediction within the context of generating single trajectories, a critical\naspect aligning with the requirements in autonomous systems. The evaluation is\nconducted on the widely-used ETH/UCY dataset where the Average Displacement\nError (ADE) and the Final Displacement Error (FDE) are reported. Alongside\nthis, we perform an ablation study to investigate the impact of the observed\nmotion history on prediction performance. To evaluate the scalability of each\napproach when confronted with varying amounts of agents, the inference time of\neach model is measured. Following a quantitative analysis, the resulting\npredictions are compared in a qualitative manner, giving insight into the\nstrengths and weaknesses of current approaches. The results demonstrate that\nalthough a constant velocity model (CVM) provides a good approximation of the\noverall dynamics in the majority of cases, additional features need to be\nincorporated to reflect common pedestrian behavior observed. Therefore, this\nstudy presents a data-driven analysis with the intent to guide the future\ndevelopment of pedestrian trajectory prediction algorithms.\n","authors":["Nico Uhlemann","Felix Fent","Markus Lienkamp"],"pdf_url":"https://arxiv.org/pdf/2308.05194v3.pdf","comment":"Accepted in IEEE Transactions on Intelligent Transportation Systems\n (T-ITS); 11 pages, 6 figures, 4 tables"},{"id":"http://arxiv.org/abs/2404.04026v1","updated":"2024-04-05T11:14:19Z","published":"2024-04-05T11:14:19Z","title":"MM-Gaussian: 3D Gaussian-based Multi-modal Fusion for Localization and\n Reconstruction in Unbounded Scenes","summary":" Localization and mapping are critical tasks for various applications such as\nautonomous vehicles and robotics. The challenges posed by outdoor environments\npresent particular complexities due to their unbounded characteristics. In this\nwork, we present MM-Gaussian, a LiDAR-camera multi-modal fusion system for\nlocalization and mapping in unbounded scenes. Our approach is inspired by the\nrecently developed 3D Gaussians, which demonstrate remarkable capabilities in\nachieving high rendering quality and fast rendering speed. Specifically, our\nsystem fully utilizes the geometric structure information provided by\nsolid-state LiDAR to address the problem of inaccurate depth encountered when\nrelying solely on visual solutions in unbounded, outdoor scenarios.\nAdditionally, we utilize 3D Gaussian point clouds, with the assistance of\npixel-level gradient descent, to fully exploit the color information in photos,\nthereby achieving realistic rendering effects. 
To further bolster the\nrobustness of our system, we designed a relocalization module, which assists in\nreturning to the correct trajectory in the event of a localization failure.\nExperiments conducted in multiple scenarios demonstrate the effectiveness of\nour method.\n","authors":["Chenyang Wu","Yifan Duan","Xinran Zhang","Yu Sheng","Jianmin Ji","Yanyong Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.04026v1.pdf","comment":"7 pages, 5 figures"},{"id":"http://arxiv.org/abs/2403.17448v2","updated":"2024-04-05T11:10:27Z","published":"2024-03-26T07:26:27Z","title":"Adaptive Line-Of-Sight guidance law based on vector fields path\n following for underactuated unmanned surface vehicle","summary":" The focus of this paper is to develop a methodology that enables an unmanned\nsurface vehicle (USV) to efficiently track a planned path. The introduction of\na vector field-based adaptive line of-sight guidance law (VFALOS) for accurate\ntrajectory tracking and minimizing the overshoot response time during USV\ntracking of curved paths improves the overall line-of-sight (LOS) guidance\nmethod. These improvements contribute to faster convergence to the desired\npath, reduce oscillations, and can mitigate the effects of persistent external\ndisturbances. It is shown that the proposed guidance law exhibits k-exponential\nstability when converging to the desired path consisting of straight and curved\nlines. The results in the paper show that the proposed method effectively\nimproves the accuracy of the USV tracking the desired path while ensuring the\nsafety of the USV work.\n","authors":["Jie Qi","Ronghua Wanga","Nailong Wu"],"pdf_url":"https://arxiv.org/pdf/2403.17448v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04011v1","updated":"2024-04-05T10:38:33Z","published":"2024-04-05T10:38:33Z","title":"Validation of critical maneuvers based on shared control","summary":" This paper presents the validation of shared control strategies for critical\nmaneuvers in automated driving systems. Shared control involves collaboration\nbetween the driver and automation, allowing both parties to actively engage and\ncooperate at different levels of the driving task. The involvement of the\ndriver adds complexity to the control loop, necessitating comprehensive\nvalidation methodologies. The proposed approach focuses on two critical\nmaneuvers: overtaking in low visibility scenarios and lateral evasive actions.\nA modular architecture with an arbitration module and shared control algorithms\nis implemented, primarily focusing on the lateral control of the vehicle. The\nvalidation is conducted using a dynamic simulator, involving 8 real drivers\ninteracting with a virtual environment. The results demonstrate improved safety\nand user acceptance, indicating the effectiveness of the shared control\nstrategies in comparison with no shared-control support. Future work involves\nimplementing shared control in drive-by-wire systems to enhance safety and\ndriver comfort during critical maneuvers. Overall, this research contributes to\nthe development and validation of shared control approaches in automated\ndriving systems.\n","authors":["Mauricio Marcano","Joseba Sarabia","Asier Zubizarreta","Sergio Díaz"],"pdf_url":"https://arxiv.org/pdf/2404.04011v1.pdf","comment":"8 pages, 19 figures. 
Published in IEEE 26th International Conference\n on Intelligent Transportation Systems (ITSC)"},{"id":"http://arxiv.org/abs/2404.04004v1","updated":"2024-04-05T10:27:48Z","published":"2024-04-05T10:27:48Z","title":"Towards Safe Robot Use with Edged or Pointed Objects: A Surrogate Study\n Assembling a Human Hand Injury Protection Database","summary":" The use of pointed or edged tools or objects is one of the most challenging\naspects of today's application of physical human-robot interaction (pHRI). One\nreason for this is that the severity of harm caused by such edged or pointed\nimpactors is less well studied than for blunt impactors. Consequently, the\nstandards specify well-reasoned force and pressure thresholds for blunt\nimpactors and advise avoiding any edges and corners in contacts. Nevertheless,\npointed or edged impactor geometries cannot be completely ruled out in real\npHRI applications. For example, to allow edged or pointed tools such as\nscrewdrivers near human operators, the knowledge of injury severity needs to be\nextended so that robot integrators can perform well-reasoned, time-efficient\nrisk assessments. In this paper, we provide the initial datasets on injury\nprevention for the human hand based on drop tests with surrogates for the human\nhand, namely pig claws and chicken drumsticks. We then demonstrate the ease and\nefficiency of robot use using the dataset for contact on two examples. Finally,\nour experiments provide a set of injuries that may also be expected for human\nsubjects under certain robot mass-velocity constellations in collisions. To\nextend this work, testing on human samples and a collaborative effort from\nresearch institutes worldwide is needed to create a comprehensive human injury\navoidance database for any pHRI scenario and thus for safe pHRI applications\nincluding edged and pointed geometries.\n","authors":["Robin Jeanne Kirschner","Carina M. Micheler","Yangcan Zhou","Sebastian Siegner","Mazin Hamad","Claudio Glowalla","Jan Neumann","Nader Rajaei","Rainer Burgkart","Sami Haddadin"],"pdf_url":"https://arxiv.org/pdf/2404.04004v1.pdf","comment":"accepted fo presentation at IEEE ICRA 2024"},{"id":"http://arxiv.org/abs/2404.02817v2","updated":"2024-04-05T09:06:00Z","published":"2024-04-03T15:38:36Z","title":"A Survey of Optimization-based Task and Motion Planning: From Classical\n To Learning Approaches","summary":" Task and Motion Planning (TAMP) integrates high-level task planning and\nlow-level motion planning to equip robots with the autonomy to effectively\nreason over long-horizon, dynamic tasks. Optimization-based TAMP focuses on\nhybrid optimization approaches that define goal conditions via objective\nfunctions and are capable of handling open-ended goals, robotic dynamics, and\nphysical interaction between the robot and the environment. Therefore,\noptimization-based TAMP is particularly suited to solve highly complex,\ncontact-rich locomotion and manipulation problems. This survey provides a\ncomprehensive review on optimization-based TAMP, covering (i) planning domain\nrepresentations, including action description languages and temporal logic,\n(ii) individual solution strategies for components of TAMP, including AI\nplanning and trajectory optimization (TO), and (iii) the dynamic interplay\nbetween logic-based task planning and model-based TO. A particular focus of\nthis survey is to highlight the algorithm structures to efficiently solve TAMP,\nespecially hierarchical and distributed approaches. 
Additionally, the survey\nemphasizes the synergy between the classical methods and contemporary\nlearning-based innovations such as large language models. Furthermore, the\nfuture research directions for TAMP is discussed in this survey, highlighting\nboth algorithmic and application-specific challenges.\n","authors":["Zhigen Zhao","Shuo Cheng","Yan Ding","Ziyi Zhou","Shiqi Zhang","Danfei Xu","Ye Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.02817v2.pdf","comment":"24 pages, 12 figures, submitted for review"},{"id":"http://arxiv.org/abs/2401.03629v2","updated":"2024-04-05T08:56:09Z","published":"2024-01-08T02:17:09Z","title":"DDM-Lag : A Diffusion-based Decision-making Model for Autonomous\n Vehicles with Lagrangian Safety Enhancement","summary":" Decision-making stands as a pivotal component in the realm of autonomous\nvehicles (AVs), playing a crucial role in navigating the intricacies of\nautonomous driving. Amidst the evolving landscape of data-driven methodologies,\nenhancing decision-making performance in complex scenarios has emerged as a\nprominent research focus. Despite considerable advancements, current\nlearning-based decision-making approaches exhibit potential for refinement,\nparticularly in aspects of policy articulation and safety assurance. To address\nthese challenges, we introduce DDM-Lag, a Diffusion Decision Model, augmented\nwith Lagrangian-based safety enhancements. This work conceptualizes the\nsequential decision-making challenge inherent in autonomous driving as a\nproblem of generative modeling, adopting diffusion models as the medium for\nassimilating patterns of decision-making. We introduce a hybrid policy update\nstrategy for diffusion models, amalgamating the principles of behavior cloning\nand Q-learning, alongside the formulation of an Actor-Critic architecture for\nthe facilitation of updates. To augment the model's exploration process with a\nlayer of safety, we incorporate additional safety constraints, employing a\nsophisticated policy optimization technique predicated on Lagrangian relaxation\nto refine the policy learning endeavor comprehensively. Empirical evaluation of\nour proposed decision-making methodology was conducted across a spectrum of\ndriving tasks, distinguished by their varying degrees of complexity and\nenvironmental contexts. The comparative analysis with established baseline\nmethodologies elucidates our model's superior performance, particularly in\ndimensions of safety and holistic efficacy.\n","authors":["Jiaqi Liu","Peng Hang","Xiaocong Zhao","Jianqiang Wang","Jian Sun"],"pdf_url":"https://arxiv.org/pdf/2401.03629v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03943v1","updated":"2024-04-05T08:17:03Z","published":"2024-04-05T08:17:03Z","title":"POMDP-Guided Active Force-Based Search for Robotic Insertion","summary":" In robotic insertion tasks where the uncertainty exceeds the allowable\ntolerance, a good search strategy is essential for successful insertion and\nsignificantly influences efficiency. The commonly used blind search method is\ntime-consuming and does not exploit the rich contact information. In this\npaper, we propose a novel search strategy that actively utilizes the\ninformation contained in the contact configuration and shows high efficiency.\nIn particular, we formulate this problem as a Partially Observable Markov\nDecision Process (POMDP) with carefully designed primitives based on an\nin-depth analysis of the contact configuration's static stability. 
From the\nformulated POMDP, we can derive a novel search strategy. Thanks to its\nsimplicity, this search strategy can be incorporated into a\nFinite-State-Machine (FSM) controller. The behaviors of the FSM controller are\nrealized through a low-level Cartesian Impedance Controller. Our method is\nbased purely on the robot's proprioceptive sensing and does not need visual or\ntactile sensors. To evaluate the effectiveness of our proposed strategy and\ncontrol framework, we conduct extensive comparison experiments in simulation,\nwhere we compare our method with the baseline approach. The results demonstrate\nthat our proposed method achieves a higher success rate with a shorter search\ntime and search trajectory length compared to the baseline method.\nAdditionally, we show that our method is robust to various initial displacement\nerrors.\n","authors":["Chen Wang","Haoxiang Luo","Kun Zhang","Hua Chen","Jia Pan","Wei Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.03943v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03940v1","updated":"2024-04-05T08:12:16Z","published":"2024-04-05T08:12:16Z","title":"Towards introspective loop closure in 4D radar SLAM","summary":" Imaging radar is an emerging sensor modality in the context of Localization\nand Mapping (SLAM), especially suitable for vision-obstructed environments.\nThis article investigates the use of 4D imaging radars for SLAM and analyzes\nthe challenges in robust loop closure. Previous work indicates that 4D radars,\ntogether with inertial measurements, offer ample information for accurate\nodometry estimation. However, the low field of view, limited resolution, and\nsparse and noisy measurements render loop closure a significantly more\nchallenging problem. Our work builds on the previous work - TBV SLAM - which\nwas proposed for robust loop closure with 360$^\\circ$ spinning radars. This\narticle highlights and addresses challenges inherited from a directional 4D\nradar, such as sparsity, noise, and reduced field of view, and discusses why\nthe common definition of a loop closure is unsuitable. By combining multiple\nquality measures for accurate loop closure detection adapted to 4D radar data,\nsignificant results in trajectory estimation are achieved; the absolute\ntrajectory error is as low as 0.46 m over a distance of 1.8 km, with consistent\noperation over multiple environments.\n","authors":["Maximilian Hilger","Vladimír Kubelka","Daniel Adolfsson","Henrik Andreasson","Achim J. Lilienthal"],"pdf_url":"https://arxiv.org/pdf/2404.03940v1.pdf","comment":"Submitted to the workshop \"Radar in Robotics: Resilience from Signal\n to Navigation\" at ICRA 2024"},{"id":"http://arxiv.org/abs/2309.15685v2","updated":"2024-04-05T08:00:39Z","published":"2023-09-27T14:31:47Z","title":"Improving Autonomous Driving Safety with POP: A Framework for Accurate\n Partially Observed Trajectory Predictions","summary":" Accurate trajectory prediction is crucial for safe and efficient autonomous\ndriving, but handling partial observations presents significant challenges. To\naddress this, we propose a novel trajectory prediction framework called Partial\nObservations Prediction (POP) for congested urban road scenarios. The framework\nconsists of two key stages: self-supervised learning (SSL) and feature\ndistillation. 
POP first employs SLL to help the model learn to reconstruct\nhistory representations, and then utilizes feature distillation as the\nfine-tuning task to transfer knowledge from the teacher model, which has been\npre-trained with complete observations, to the student model, which has only\nfew observations. POP achieves comparable results to top-performing methods in\nopen-loop experiments and outperforms the baseline method in closed-loop\nsimulations, including safety metrics. Qualitative results illustrate the\nsuperiority of POP in providing reasonable and safe trajectory predictions.\n","authors":["Sheng Wang","Yingbing Chen","Jie Cheng","Xiaodong Mei","Ren Xin","Yongkang Song","Ming Liu"],"pdf_url":"https://arxiv.org/pdf/2309.15685v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07759v4","updated":"2024-04-05T07:53:45Z","published":"2023-09-14T14:45:47Z","title":"PROGrasp: Pragmatic Human-Robot Communication for Object Grasping","summary":" Interactive Object Grasping (IOG) is the task of identifying and grasping the\ndesired object via human-robot natural language interaction. Current IOG\nsystems assume that a human user initially specifies the target object's\ncategory (e.g., bottle). Inspired by pragmatics, where humans often convey\ntheir intentions by relying on context to achieve goals, we introduce a new IOG\ntask, Pragmatic-IOG, and the corresponding dataset, Intention-oriented\nMulti-modal Dialogue (IM-Dial). In our proposed task scenario, an\nintention-oriented utterance (e.g., \"I am thirsty\") is initially given to the\nrobot. The robot should then identify the target object by interacting with a\nhuman user. Based on the task setup, we propose a new robotic system that can\ninterpret the user's intention and pick up the target object, Pragmatic Object\nGrasping (PROGrasp). PROGrasp performs Pragmatic-IOG by incorporating modules\nfor visual grounding, question asking, object grasping, and most importantly,\nanswer interpretation for pragmatic inference. Experimental results show that\nPROGrasp is effective in offline (i.e., target object discovery) and online\n(i.e., IOG with a physical robot arm) settings. Code and data are available at\nhttps://github.com/gicheonkang/prograsp.\n","authors":["Gi-Cheon Kang","Junghyun Kim","Jaein Kim","Byoung-Tak Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.07759v4.pdf","comment":"ICRA 2024"},{"id":"http://arxiv.org/abs/2404.03911v1","updated":"2024-04-05T06:29:16Z","published":"2024-04-05T06:29:16Z","title":"Under-Canopy Navigation using Aerial Lidar Maps","summary":" Autonomous navigation in unstructured natural environments poses a\nsignificant challenge. In goal navigation tasks without prior information, the\nlimited look-ahead of onboard sensors utilised by robots compromises path\nefficiency. We propose a novel approach that leverages an above-the-canopy\naerial map for improved ground robot navigation. Our system utilises aerial\nlidar scans to create a 3D probabilistic occupancy map, uniquely incorporating\nthe uncertainty in the aerial vehicle's trajectory for improved accuracy. Novel\npath planning cost functions are introduced, combining path length with\nobstruction risk estimated from the probabilistic map. The D-Star Lite\nalgorithm then calculates an optimal (minimum-cost) path to the goal. This\nsystem also allows for dynamic replanning upon encountering unforeseen\nobstacles on the ground. 
Extensive experiments and ablation studies in\nsimulated and real forests demonstrate the effectiveness of our system.\n","authors":["Lucas Carvalho de Lima","Nicholas Lawrance","Kasra Khosoussi","Paulo Borges","Michael Bruenig"],"pdf_url":"https://arxiv.org/pdf/2404.03911v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03891v1","updated":"2024-04-05T04:58:34Z","published":"2024-04-05T04:58:34Z","title":"Can only LLMs do Reasoning?: Potential of Small Language Models in Task\n Planning","summary":" In robotics, the use of Large Language Models (LLMs) is becoming prevalent,\nespecially for understanding human commands. In particular, LLMs are utilized\nas domain-agnostic task planners for high-level human commands. LLMs are\ncapable of Chain-of-Thought (CoT) reasoning, and this allows LLMs to be task\nplanners. However, we need to consider that modern robots still struggle to\nperform complex actions, and the domains where robots can be deployed are\nlimited in practice. This leads us to pose a question: If small LMs can be\ntrained to reason in chains within a single domain, would even small LMs be\ngood task planners for the robots? To train smaller LMs to reason in chains, we\nbuild `COmmand-STeps datasets' (COST) consisting of high-level commands along\nwith corresponding actionable low-level steps, via LLMs. We release not only\nour datasets but also the prompt templates used to generate them, to allow\nanyone to build datasets for their domain. We compare GPT3.5 and GPT4 with the\nfinetuned GPT2 for task domains, in tabletop and kitchen environments, and the\nresult shows that GPT2-medium is comparable to GPT3.5 for task planning in a\nspecific domain. Our dataset, code, and more output samples can be found in\nhttps://github.com/Gawon-Choi/small-LMs-Task-Planning\n","authors":["Gawon Choi","Hyemin Ahn"],"pdf_url":"https://arxiv.org/pdf/2404.03891v1.pdf","comment":"8 pages, 11 figures"},{"id":"http://arxiv.org/abs/2404.03869v1","updated":"2024-04-05T03:02:57Z","published":"2024-04-05T03:02:57Z","title":"Heterogeneous Multi-Agent Reinforcement Learning for Zero-Shot Scalable\n Collaboration","summary":" The rise of multi-agent systems, especially the success of multi-agent\nreinforcement learning (MARL), is reshaping our future across diverse domains\nlike autonomous vehicle networks. However, MARL still faces significant\nchallenges, particularly in achieving zero-shot scalability, which allows\ntrained MARL models to be directly applied to unseen tasks with varying numbers\nof agents. In addition, real-world multi-agent systems usually contain agents\nwith different functions and strategies, while the existing scalable MARL\nmethods only have limited heterogeneity. To address this, we propose a novel\nMARL framework named Scalable and Heterogeneous Proximal Policy Optimization\n(SHPPO), integrating heterogeneity into parameter-shared PPO-based MARL\nnetworks. we first leverage a latent network to adaptively learn strategy\npatterns for each agent. Second, we introduce a heterogeneous layer for\ndecision-making, whose parameters are specifically generated by the learned\nlatent variables. Our approach is scalable as all the parameters are shared\nexcept for the heterogeneous layer, and gains both inter-individual and\ntemporal heterogeneity at the same time. We implement our approach based on the\nstate-of-the-art backbone PPO-based algorithm as SHPPO, while our approach is\nagnostic to the backbone and can be seamlessly plugged into any\nparameter-shared MARL method. 
SHPPO exhibits superior performance over the\nbaselines such as MAPPO and HAPPO in classic MARL environments like Starcraft\nMulti-Agent Challenge (SMAC) and Google Research Football (GRF), showcasing\nenhanced zero-shot scalability and offering insights into the learned latent\nrepresentation's impact on team performance by visualization.\n","authors":["Xudong Guo","Daming Shi","Junjie Yu","Wenhui Fan"],"pdf_url":"https://arxiv.org/pdf/2404.03869v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12686v3","updated":"2024-04-05T02:34:01Z","published":"2024-03-19T12:45:18Z","title":"WaterVG: Waterway Visual Grounding based on Text-Guided Vision and\n mmWave Radar","summary":" The perception of waterways based on human intent is significant for\nautonomous navigation and operations of Unmanned Surface Vehicles (USVs) in\nwater environments. Inspired by visual grounding, we introduce WaterVG, the\nfirst visual grounding dataset designed for USV-based waterway perception based\non human prompts. WaterVG encompasses prompts describing multiple targets, with\nannotations at the instance level including bounding boxes and masks. Notably,\nWaterVG includes 11,568 samples with 34,987 referred targets, whose prompts\nintegrates both visual and radar characteristics. The pattern of text-guided\ntwo sensors equips a finer granularity of text prompts with visual and radar\nfeatures of referred targets. Moreover, we propose a low-power visual grounding\nmodel, Potamoi, which is a multi-task model with a well-designed Phased\nHeterogeneous Modality Fusion (PHMF) mode, including Adaptive Radar Weighting\n(ARW) and Multi-Head Slim Cross Attention (MHSCA). Exactly, ARW extracts\nrequired radar features to fuse with vision for prompt alignment. MHSCA is an\nefficient fusion module with a remarkably small parameter count and FLOPs,\nelegantly fusing scenario context captured by two sensors with linguistic\nfeatures, which performs expressively on visual grounding tasks. Comprehensive\nexperiments and evaluations have been conducted on WaterVG, where our Potamoi\narchives state-of-the-art performances compared with counterparts.\n","authors":["Runwei Guan","Liye Jia","Fengyufan Yang","Shanliang Yao","Erick Purwanto","Xiaohui Zhu","Eng Gee Lim","Jeremy Smith","Ka Lok Man","Xuming Hu","Yutao Yue"],"pdf_url":"https://arxiv.org/pdf/2403.12686v3.pdf","comment":"10 pages, 10 figures"},{"id":"http://arxiv.org/abs/2404.03843v1","updated":"2024-04-05T00:25:37Z","published":"2024-04-05T00:25:37Z","title":"Scaling Motion Forecasting Models with Ensemble Distillation","summary":" Motion forecasting has become an increasingly critical component of\nautonomous robotic systems. Onboard compute budgets typically limit the\naccuracy of real-time systems. In this work we propose methods of improving\nmotion forecasting systems subject to limited compute budgets by combining\nmodel ensemble and distillation techniques. The use of ensembles of deep neural\nnetworks has been shown to improve generalization accuracy in many application\ndomains. We first demonstrate significant performance gains by creating a large\nensemble of optimized single models. We then develop a generalized framework to\ndistill motion forecasting model ensembles into small student models which\nretain high performance with a fraction of the computing cost. For this study\nwe focus on the task of motion forecasting using real world data from\nautonomous driving systems. 
We develop ensemble models that are very\ncompetitive on the Waymo Open Motion Dataset (WOMD) and Argoverse leaderboards.\nFrom these ensembles, we train distilled student models which have high\nperformance at a fraction of the compute costs. These experiments demonstrate\ndistillation from ensembles as an effective method for improving accuracy of\npredictive models for robotic systems with limited compute budgets.\n","authors":["Scott Ettinger","Kratarth Goel","Avikalp Srivastava","Rami Al-Rfou"],"pdf_url":"https://arxiv.org/pdf/2404.03843v1.pdf","comment":"11 pages, 14 figures"},{"id":"http://arxiv.org/abs/2404.00938v2","updated":"2024-04-05T21:55:47Z","published":"2024-04-01T05:50:56Z","title":"How Can Large Language Models Enable Better Socially Assistive\n Human-Robot Interaction: A Brief Survey","summary":" Socially assistive robots (SARs) have shown great success in providing\npersonalized cognitive-affective support for user populations with special\nneeds such as older adults, children with autism spectrum disorder (ASD), and\nindividuals with mental health challenges. The large body of work on SAR\ndemonstrates its potential to provide at-home support that complements\nclinic-based interventions delivered by mental health professionals, making\nthese interventions more effective and accessible. However, there are still\nseveral major technical challenges that hinder SAR-mediated interactions and\ninterventions from reaching human-level social intelligence and efficacy. With\nthe recent advances in large language models (LLMs), there is an increased\npotential for novel applications within the field of SAR that can significantly\nexpand the current capabilities of SARs. However, incorporating LLMs introduces\nnew risks and ethical concerns that have not yet been encountered, and must be\ncarefully be addressed to safely deploy these more advanced systems. In this\nwork, we aim to conduct a brief survey on the use of LLMs in SAR technologies,\nand discuss the potentials and risks of applying LLMs to the following three\nmajor technical challenges of SAR: 1) natural language dialog; 2) multimodal\nunderstanding; 3) LLMs as robot policies.\n","authors":["Zhonghao Shi","Ellen Landrum","Amy O' Connell","Mina Kian","Leticia Pinto-Alva","Kaleen Shrestha","Xiaoyuan Zhu","Maja J Matarić"],"pdf_url":"https://arxiv.org/pdf/2404.00938v2.pdf","comment":"2 pages, accepted to the Proceedings of the AAAI Symposium Series,\n 2024"},{"id":"http://arxiv.org/abs/2404.04419v1","updated":"2024-04-05T21:38:38Z","published":"2024-04-05T21:38:38Z","title":"Hybrid Force Motion Control with Estimated Surface Normal for\n Manufacturing Applications","summary":" This paper proposes a hybrid force-motion framework that utilizes real-time\nsurface normal updates. The surface normal is estimated via a novel method that\nleverages force sensing measurements and velocity commands to compensate the\nfriction bias. This approach is critical for robust execution of precision\nforce-controlled tasks in manufacturing, such as thermoplastic tape replacement\nthat traces surfaces or paths on a workpiece subject to uncertainties deviated\nfrom the model. We formulated the proposed method and implemented the framework\nin ROS2 environment. The approach was validated using kinematic simulations and\na hardware platform. 
Specifically, we demonstrated the approach on a 7-DoF\nmanipulator equipped with a force/torque sensor at the end-effector.\n","authors":["Ehsan Nasiri","Long Wang"],"pdf_url":"https://arxiv.org/pdf/2404.04419v1.pdf","comment":"8 pages, 21st International Conference on Ubiquitous Robots (UR\n 2024), accepted"},{"id":"http://arxiv.org/abs/2404.04416v1","updated":"2024-04-05T21:31:07Z","published":"2024-04-05T21:31:07Z","title":"Admittance Control for Adaptive Remote Center of Motion in Robotic\n Laparoscopic Surgery","summary":" In laparoscopic robot-assisted minimally invasive surgery, the kinematic\ncontrol of the robot is subject to the remote center of motion (RCM) constraint\nat the port of entry (e.g., trocar) into the patient's body. During surgery,\nafter the instrument is inserted through the trocar, intrinsic physiological\nmovements such as the patient's heartbeat, breathing process, and/or other\npurposeful body repositioning may deviate the position of the port of entry.\nThis can cause a conflict between the registered RCM and the moved port of\nentry.\n To mitigate this conflict, we seek to utilize the interaction forces at the\nRCM. We develop a novel framework that integrates admittance control into a\nredundancy resolution method for the RCM kinematic constraint. Using the\nforce/torque sensory feedback at the base of the instrument driving mechanism\n(IDM), the proposed framework estimates the forces at RCM, rejects forces\napplied on other locations along the instrument, and uses them in the\nadmittance controller. In this paper, we report analysis from kinematic\nsimulations to validate the proposed framework. In addition, a hardware\nplatform has been completed, and future work is planned for experimental\nvalidation.\n","authors":["Ehsan Nasiri","Long Wang"],"pdf_url":"https://arxiv.org/pdf/2404.04416v1.pdf","comment":"7 pages, 21st International Conference on Ubiquitous Robots (UR\n 2024), accepted"},{"id":"http://arxiv.org/abs/2404.04404v1","updated":"2024-04-05T21:01:10Z","published":"2024-04-05T21:01:10Z","title":"A Ground Mobile Robot for Autonomous Terrestrial Laser Scanning-Based\n Field Phenotyping","summary":" Traditional field phenotyping methods are often manual, time-consuming, and\ndestructive, posing a challenge for breeding progress. To address this\nbottleneck, robotics and automation technologies offer efficient sensing tools\nto monitor field evolution and crop development throughout the season. This\nstudy aimed to develop an autonomous ground robotic system for LiDAR-based\nfield phenotyping in plant breeding trials. A Husky platform was equipped with\na high-resolution three-dimensional (3D) laser scanner to collect in-field\nterrestrial laser scanning (TLS) data without human intervention. To automate\nthe TLS process, a 3D ray casting analysis was implemented for optimal TLS site\nplanning, and a route optimization algorithm was utilized to minimize travel\ndistance during data collection. The platform was deployed in two cotton\nbreeding fields for evaluation, where it autonomously collected TLS data. The\nsystem provided accurate pose information through RTK-GNSS positioning and\nsensor fusion techniques, with average errors of less than 0.6 cm for location\nand 0.38$^{\\circ}$ for heading. The achieved localization accuracy allowed\npoint cloud registration with mean point errors of approximately 2 cm,\ncomparable to traditional TLS methods that rely on artificial targets and\nmanual sensor deployment. 
This work presents an autonomous phenotyping platform\nthat facilitates the quantitative assessment of plant traits under field\nconditions of both large agricultural fields and small breeding trials to\ncontribute to the advancement of plant phenomics and breeding programs.\n","authors":["Javier Rodriguez-Sanchez","Kyle Johnsen","Changying Li"],"pdf_url":"https://arxiv.org/pdf/2404.04404v1.pdf","comment":"Submitted to Journal of Field Robotics"},{"id":"http://arxiv.org/abs/2404.04377v1","updated":"2024-04-05T19:42:55Z","published":"2024-04-05T19:42:55Z","title":"LOSS-SLAM: Lightweight Open-Set Semantic Simultaneous Localization and\n Mapping","summary":" Enabling robots to understand the world in terms of objects is a critical\nbuilding block towards higher level autonomy. The success of foundation models\nin vision has created the ability to segment and identify nearly all objects in\nthe world. However, utilizing such objects to localize the robot and build an\nopen-set semantic map of the world remains an open research question. In this\nwork, a system of identifying, localizing, and encoding objects is tightly\ncoupled with probabilistic graphical models for performing open-set semantic\nsimultaneous localization and mapping (SLAM). Results are presented\ndemonstrating that the proposed lightweight object encoding can be used to\nperform more accurate object-based SLAM than existing open-set methods,\nclosed-set methods, and geometric methods while incurring a lower computational\noverhead than existing open-set mapping methods.\n","authors":["Kurran Singh","Tim Magoun","John J. Leonard"],"pdf_url":"https://arxiv.org/pdf/2404.04377v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13421v3","updated":"2024-04-05T18:23:38Z","published":"2024-03-20T09:07:23Z","title":"Caching-Augmented Lifelong Multi-Agent Path Finding","summary":" Multi-Agent Path Finding (MAPF), which involves finding collision-free paths\nfor multiple robots, is crucial in various applications. Lifelong MAPF, where\ntargets are reassigned to agents as soon as they complete their initial\ntargets, offers a more accurate approximation of real-world warehouse planning.\nIn this paper, we present a novel mechanism named Caching-Augmented Lifelong\nMAPF (CAL-MAPF), designed to improve the performance of Lifelong MAPF. We have\ndeveloped a new type of map grid called cache for temporary item storage and\nreplacement, and created a locking mechanism to improve the planning solution's\nstability. A task assigner (TA) is designed for CAL-MAPF to allocate target\nlocations to agents and control agent status in different situations. CAL-MAPF\nhas been evaluated using various cache replacement policies and input task\ndistributions. We have identified three main factors significantly impacting\nCAL-MAPF performance through experimentation: suitable input task distribution,\nhigh cache hit rate, and smooth traffic. In general, CAL-MAPF has demonstrated\npotential for performance improvements in certain task distributions, map and\nagent configurations.\n","authors":["Yimin Tang","Zhenghong Yu","Yi Zheng","T. K. 
Satish Kumar","Jiaoyang Li","Sven Koenig"],"pdf_url":"https://arxiv.org/pdf/2403.13421v3.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.00086v2","updated":"2024-04-05T17:59:50Z","published":"2024-03-29T17:58:50Z","title":"DVIS-DAQ: Improving Video Segmentation via Dynamic Anchor Queries","summary":" Modern video segmentation methods adopt object queries to perform inter-frame\nassociation and demonstrate satisfactory performance in tracking continuously\nappearing objects despite large-scale motion and transient occlusion. However,\nthey all underperform on newly emerging and disappearing objects that are\ncommon in the real world because they attempt to model object emergence and\ndisappearance through feature transitions between background and foreground\nqueries that have significant feature gaps. We introduce Dynamic Anchor Queries\n(DAQ) to shorten the transition gap between the anchor and target queries by\ndynamically generating anchor queries based on the features of potential\ncandidates. Furthermore, we introduce a query-level object Emergence and\nDisappearance Simulation (EDS) strategy, which unleashes DAQ's potential\nwithout any additional cost. Finally, we combine our proposed DAQ and EDS with\nDVIS to obtain DVIS-DAQ. Extensive experiments demonstrate that DVIS-DAQ\nachieves a new state-of-the-art (SOTA) performance on five mainstream video\nsegmentation benchmarks. Code and models are available at\n\\url{https://github.com/SkyworkAI/DAQ-VS}.\n","authors":["Yikang Zhou","Tao Zhang","Shunping Ji","Shuicheng Yan","Xiangtai Li"],"pdf_url":"https://arxiv.org/pdf/2404.00086v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04256v1","updated":"2024-04-05T17:59:44Z","published":"2024-04-05T17:59:44Z","title":"Sigma: Siamese Mamba Network for Multi-Modal Semantic Segmentation","summary":" Multi-modal semantic segmentation significantly enhances AI agents'\nperception and scene understanding, especially under adverse conditions like\nlow-light or overexposed environments. Leveraging additional modalities\n(X-modality) like thermal and depth alongside traditional RGB provides\ncomplementary information, enabling more robust and reliable segmentation. In\nthis work, we introduce Sigma, a Siamese Mamba network for multi-modal semantic\nsegmentation, utilizing the Selective Structured State Space Model, Mamba.\nUnlike conventional methods that rely on CNNs, with their limited local\nreceptive fields, or Vision Transformers (ViTs), which offer global receptive\nfields at the cost of quadratic complexity, our model achieves global receptive\nfields coverage with linear complexity. By employing a Siamese encoder and\ninnovating a Mamba fusion mechanism, we effectively select essential\ninformation from different modalities. A decoder is then developed to enhance\nthe channel-wise modeling ability of the model. Our method, Sigma, is\nrigorously evaluated on both RGB-Thermal and RGB-Depth segmentation tasks,\ndemonstrating its superiority and marking the first successful application of\nState Space Models (SSMs) in multi-modal perception tasks. 
Code is available at\nhttps://github.com/zifuwan/Sigma.\n","authors":["Zifu Wan","Yuhao Wang","Silong Yong","Pingping Zhang","Simon Stepputtis","Katia Sycara","Yaqi Xie"],"pdf_url":"https://arxiv.org/pdf/2404.04256v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04254v1","updated":"2024-04-05T17:58:52Z","published":"2024-04-05T17:58:52Z","title":"Watermark-based Detection and Attribution of AI-Generated Content","summary":" Several companies--such as Google, Microsoft, and OpenAI--have deployed\ntechniques to watermark AI-generated content to enable proactive detection.\nHowever, existing literature mainly focuses on user-agnostic detection.\nAttribution aims to further trace back the user of a generative-AI service who\ngenerated a given content detected as AI-generated. Despite its growing\nimportance, attribution is largely unexplored. In this work, we aim to bridge\nthis gap by providing the first systematic study on watermark-based, user-aware\ndetection and attribution of AI-generated content. Specifically, we\ntheoretically study the detection and attribution performance via rigorous\nprobabilistic analysis. Moreover, we develop an efficient algorithm to select\nwatermarks for the users to enhance attribution performance. Both our\ntheoretical and empirical results show that watermark-based detection and\nattribution inherit the accuracy and (non-)robustness properties of the\nwatermarking method.\n","authors":["Zhengyuan Jiang","Moyang Guo","Yuepeng Hu","Neil Zhenqiang Gong"],"pdf_url":"https://arxiv.org/pdf/2404.04254v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04251v1","updated":"2024-04-05T17:57:16Z","published":"2024-04-05T17:57:16Z","title":"Who Evaluates the Evaluations? Objectively Scoring Text-to-Image Prompt\n Coherence Metrics with T2IScoreScore (TS2)","summary":" With advances in the quality of text-to-image (T2I) models has come interest\nin benchmarking their prompt faithfulness-the semantic coherence of generated\nimages to the prompts they were conditioned on. A variety of T2I faithfulness\nmetrics have been proposed, leveraging advances in cross-modal embeddings and\nvision-language models (VLMs). However, these metrics are not rigorously\ncompared and benchmarked, instead presented against few weak baselines by\ncorrelation to human Likert scores over a set of easy-to-discriminate images.\n We introduce T2IScoreScore (TS2), a curated set of semantic error graphs\ncontaining a prompt and a set increasingly erroneous images. These allow us to\nrigorously judge whether a given prompt faithfulness metric can correctly order\nimages with respect to their objective error count and significantly\ndiscriminate between different error nodes, using meta-metric scores derived\nfrom established statistical tests. Surprisingly, we find that the\nstate-of-the-art VLM-based metrics (e.g., TIFA, DSG, LLMScore, VIEScore) we\ntested fail to significantly outperform simple feature-based metrics like\nCLIPScore, particularly on a hard subset of naturally-occurring T2I model\nerrors. 
TS2 will enable the development of better T2I prompt faithfulness\nmetrics through more rigorous comparison of their conformity to expected\norderings and separations under objective criteria.\n","authors":["Michael Saxon","Fatima Jahara","Mahsa Khoshnoodi","Yujie Lu","Aditya Sharma","William Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2404.04251v1.pdf","comment":"15 pages main, 9 pages appendices, 16 figures, 3 tables"},{"id":"http://arxiv.org/abs/2312.08240v2","updated":"2024-04-05T17:56:12Z","published":"2023-12-13T16:01:50Z","title":"CenterGrasp: Object-Aware Implicit Representation Learning for\n Simultaneous Shape Reconstruction and 6-DoF Grasp Estimation","summary":" Reliable object grasping is a crucial capability for autonomous robots.\nHowever, many existing grasping approaches focus on general clutter removal\nwithout explicitly modeling objects and thus only relying on the visible local\ngeometry. We introduce CenterGrasp, a novel framework that combines object\nawareness and holistic grasping. CenterGrasp learns a general object prior by\nencoding shapes and valid grasps in a continuous latent space. It consists of\nan RGB-D image encoder that leverages recent advances to detect objects and\ninfer their pose and latent code, and a decoder to predict shape and grasps for\neach object in the scene. We perform extensive experiments on simulated as well\nas real-world cluttered scenes and demonstrate strong scene reconstruction and\n6-DoF grasp-pose estimation performance. Compared to the state of the art,\nCenterGrasp achieves an improvement of 38.5 mm in shape reconstruction and 33\npercentage points on average in grasp success. We make the code and trained\nmodels publicly available at http://centergrasp.cs.uni-freiburg.de.\n","authors":["Eugenio Chisari","Nick Heppert","Tim Welschehold","Wolfram Burgard","Abhinav Valada"],"pdf_url":"https://arxiv.org/pdf/2312.08240v2.pdf","comment":"Accepted at RA-L. Video, code and models available at\n http://centergrasp.cs.uni-freiburg.de"},{"id":"http://arxiv.org/abs/2102.05984v2","updated":"2024-04-05T17:55:28Z","published":"2021-02-11T13:04:49Z","title":"Modeling 3D Surface Manifolds with a Locally Conditioned Atlas","summary":" Recently proposed 3D object reconstruction methods represent a mesh with an\natlas - a set of planar patches approximating the surface. However, their\napplication in a real-world scenario is limited since the surfaces of\nreconstructed objects contain discontinuities, which degrades the quality of\nthe final mesh. This is mainly caused by independent processing of individual\npatches, and in this work, we postulate to mitigate this limitation by\npreserving local consistency around patch vertices. To that end, we introduce a\nLocally Conditioned Atlas (LoCondA), a framework for representing a 3D object\nhierarchically in a generative model. Firstly, the model maps a point cloud of\nan object into a sphere. Secondly, by leveraging a spherical prior, we enforce\nthe mapping to be locally consistent on the sphere and on the target object.\nThis way, we can sample a mesh quad on that sphere and project it back onto the\nobject's manifold. With LoCondA, we can produce topologically diverse objects\nwhile maintaining quads to be stitched together. 
We show that the proposed\napproach provides structurally coherent reconstructions while producing meshes\nof quality comparable to the competitors.\n","authors":["Przemysław Spurek","Sebastian Winczowski","Maciej Zięba","Tomasz Trzciński","Kacper Kania","Marcin Mazur"],"pdf_url":"https://arxiv.org/pdf/2102.05984v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04245v1","updated":"2024-04-05T17:51:58Z","published":"2024-04-05T17:51:58Z","title":"Evaluating Adversarial Robustness: A Comparison Of FGSM, Carlini-Wagner\n Attacks, And The Role of Distillation as Defense Mechanism","summary":" This technical report delves into an in-depth exploration of adversarial\nattacks specifically targeted at Deep Neural Networks (DNNs) utilized for image\nclassification. The study also investigates defense mechanisms aimed at\nbolstering the robustness of machine learning models. The research focuses on\ncomprehending the ramifications of two prominent attack methodologies: the Fast\nGradient Sign Method (FGSM) and the Carlini-Wagner (CW) approach. These attacks\nare examined concerning three pre-trained image classifiers: Resnext50_32x4d,\nDenseNet-201, and VGG-19, utilizing the Tiny-ImageNet dataset. Furthermore, the\nstudy proposes the robustness of defensive distillation as a defense mechanism\nto counter FGSM and CW attacks. This defense mechanism is evaluated using the\nCIFAR-10 dataset, where CNN models, specifically resnet101 and Resnext50_32x4d,\nserve as the teacher and student models, respectively. The proposed defensive\ndistillation model exhibits effectiveness in thwarting attacks such as FGSM.\nHowever, it is noted to remain susceptible to more sophisticated techniques\nlike the CW attack. The document presents a meticulous validation of the\nproposed scheme. It provides detailed and comprehensive results, elucidating\nthe efficacy and limitations of the defense mechanisms employed. Through\nrigorous experimentation and analysis, the study offers insights into the\ndynamics of adversarial attacks on DNNs, as well as the effectiveness of\ndefensive strategies in mitigating their impact.\n","authors":["Trilokesh Ranjan Sarkar","Nilanjan Das","Pralay Sankar Maitra","Bijoy Some","Ritwik Saha","Orijita Adhikary","Bishal Bose","Jaydip Sen"],"pdf_url":"https://arxiv.org/pdf/2404.04245v1.pdf","comment":"This report pertains to the Capstone Project done by Group 1 of the\n Fall batch of 2023 students at Praxis Tech School, Kolkata, India. The\n reports consists of 35 pages and it includes 15 figures and 10 tables. This\n is the preprint which will be submitted to to an IEEE international\n conference for review"},{"id":"http://arxiv.org/abs/2404.04244v1","updated":"2024-04-05T17:46:38Z","published":"2024-04-05T17:46:38Z","title":"DiffOp-net: A Differential Operator-based Fully Convolutional Network\n for Unsupervised Deformable Image Registration","summary":" Existing unsupervised deformable image registration methods usually rely on\nmetrics applied to the gradients of predicted displacement or velocity fields\nas a regularization term to ensure transformation smoothness, which potentially\nlimits registration accuracy. In this study, we propose a novel approach to\nenhance unsupervised deformable image registration by introducing a new\ndifferential operator into the registration framework. This operator, acting on\nthe velocity field and mapping it to a dual space, ensures the smoothness of\nthe velocity field during optimization, facilitating accurate deformable\nregistration. 
In addition, to tackle the challenge of capturing large\ndeformations inside image pairs, we introduce a Cross-Coordinate Attention\nmodule (CCA) and embed it into a proposed Fully Convolutional Networks\n(FCNs)-based multi-resolution registration architecture. Evaluation experiments\nare conducted on two magnetic resonance imaging (MRI) datasets. Compared to\nvarious state-of-the-art registration approaches, including a traditional\nalgorithm and three representative unsupervised learning-based methods, our\nmethod achieves superior accuracies, maintaining desirable diffeomorphic\nproperties, and exhibiting promising registration speed.\n","authors":["Jiong Wu"],"pdf_url":"https://arxiv.org/pdf/2404.04244v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04243v1","updated":"2024-04-05T17:45:22Z","published":"2024-04-05T17:45:22Z","title":"Identity Decoupling for Multi-Subject Personalization of Text-to-Image\n Models","summary":" Text-to-image diffusion models have shown remarkable success in generating a\npersonalized subject based on a few reference images. However, current methods\nstruggle with handling multiple subjects simultaneously, often resulting in\nmixed identities with combined attributes from different subjects. In this\nwork, we present MuDI, a novel framework that enables multi-subject\npersonalization by effectively decoupling identities from multiple subjects.\nOur main idea is to utilize segmented subjects generated by the Segment\nAnything Model for both training and inference, as a form of data augmentation\nfor training and initialization for the generation process. Our experiments\ndemonstrate that MuDI can produce high-quality personalized images without\nidentity mixing, even for highly similar subjects as shown in Figure 1. In\nhuman evaluation, MuDI shows twice as many successes for personalizing multiple\nsubjects without identity mixing over existing baselines and is preferred over\n70% compared to the strongest baseline. More results are available at\nhttps://mudi-t2i.github.io/.\n","authors":["Sangwon Jang","Jaehyeong Jo","Kimin Lee","Sung Ju Hwang"],"pdf_url":"https://arxiv.org/pdf/2404.04243v1.pdf","comment":"Preprint. Project page: https://mudi-t2i.github.io/"},{"id":"http://arxiv.org/abs/2404.04242v1","updated":"2024-04-05T17:45:07Z","published":"2024-04-05T17:45:07Z","title":"Physical Property Understanding from Language-Embedded Feature Fields","summary":" Can computers perceive the physical properties of objects solely through\nvision? Research in cognitive science and vision science has shown that humans\nexcel at identifying materials and estimating their physical properties based\npurely on visual appearance. In this paper, we present a novel approach for\ndense prediction of the physical properties of objects using a collection of\nimages. Inspired by how humans reason about physics through vision, we leverage\nlarge language models to propose candidate materials for each object. We then\nconstruct a language-embedded point cloud and estimate the physical properties\nof each 3D point using a zero-shot kernel regression approach. Our method is\naccurate, annotation-free, and applicable to any object in the open world.\nExperiments demonstrate the effectiveness of the proposed approach in various\nphysical property reasoning tasks, such as estimating the mass of common\nobjects, as well as other properties like friction and hardness.\n","authors":["Albert J. Zhai","Yuan Shen","Emily Y. Chen","Gloria X. 
Wang","Xinlei Wang","Sheng Wang","Kaiyu Guan","Shenlong Wang"],"pdf_url":"https://arxiv.org/pdf/2404.04242v1.pdf","comment":"CVPR 2024. Project page (with code):\n https://ajzhai.github.io/NeRF2Physics/"},{"id":"http://arxiv.org/abs/2311.08577v3","updated":"2024-04-05T17:37:36Z","published":"2023-11-14T22:46:01Z","title":"Finding AI-Generated Faces in the Wild","summary":" AI-based image generation has continued to rapidly improve, producing\nincreasingly more realistic images with fewer obvious visual flaws.\nAI-generated images are being used to create fake online profiles which in turn\nare being used for spam, fraud, and disinformation campaigns. As the general\nproblem of detecting any type of manipulated or synthesized content is\nreceiving increasing attention, here we focus on a more narrow task of\ndistinguishing a real face from an AI-generated face. This is particularly\napplicable when tackling inauthentic online accounts with a fake user profile\nphoto. We show that by focusing on only faces, a more resilient and\ngeneral-purpose artifact can be detected that allows for the detection of\nAI-generated faces from a variety of GAN- and diffusion-based synthesis\nengines, and across image resolutions (as low as 128 x 128 pixels) and\nqualities.\n","authors":["Gonzalo J. Aniano Porcile","Jack Gindi","Shivansh Mundra","James R. Verbus","Hany Farid"],"pdf_url":"https://arxiv.org/pdf/2311.08577v3.pdf","comment":"to be published as: G.J.A. Porcile, J. Gindi, S. Mundra, J.R. Verbus,\n and H. Farid, Finding AI-Generated Faces in the Wild, Workshop on Media\n Forensics at CVPR, 2024"},{"id":"http://arxiv.org/abs/2404.03635v2","updated":"2024-04-05T17:27:34Z","published":"2024-04-04T17:54:33Z","title":"WorDepth: Variational Language Prior for Monocular Depth Estimation","summary":" Three-dimensional (3D) reconstruction from a single image is an ill-posed\nproblem with inherent ambiguities, i.e. scale. Predicting a 3D scene from text\ndescription(s) is similarly ill-posed, i.e. spatial arrangements of objects\ndescribed. We investigate the question of whether two inherently ambiguous\nmodalities can be used in conjunction to produce metric-scaled reconstructions.\nTo test this, we focus on monocular depth estimation, the problem of predicting\na dense depth map from a single image, but with an additional text caption\ndescribing the scene. To this end, we begin by encoding the text caption as a\nmean and standard deviation; using a variational framework, we learn the\ndistribution of the plausible metric reconstructions of 3D scenes corresponding\nto the text captions as a prior. To \"select\" a specific reconstruction or depth\nmap, we encode the given image through a conditional sampler that samples from\nthe latent space of the variational text encoder, which is then decoded to the\noutput depth map. Our approach is trained alternatingly between the text and\nimage branches: in one optimization step, we predict the mean and standard\ndeviation from the text description and sample from a standard Gaussian, and in\nthe other, we sample using a (image) conditional sampler. Once trained, we\ndirectly predict depth from the encoded text using the conditional sampler. 
We\ndemonstrate our approach on indoor (NYUv2) and outdoor (KITTI) scenarios, where\nwe show that language can consistently improve performance in both.\n","authors":["Ziyao Zeng","Daniel Wang","Fengyu Yang","Hyoungseob Park","Yangchao Wu","Stefano Soatto","Byung-Woo Hong","Dong Lao","Alex Wong"],"pdf_url":"https://arxiv.org/pdf/2404.03635v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04231v1","updated":"2024-04-05T17:25:17Z","published":"2024-04-05T17:25:17Z","title":"Image-Text Co-Decomposition for Text-Supervised Semantic Segmentation","summary":" This paper addresses text-supervised semantic segmentation, aiming to learn a\nmodel capable of segmenting arbitrary visual concepts within images by using\nonly image-text pairs without dense annotations. Existing methods have\ndemonstrated that contrastive learning on image-text pairs effectively aligns\nvisual segments with the meanings of texts. We notice that there is a\ndiscrepancy between text alignment and semantic segmentation: A text often\nconsists of multiple semantic concepts, whereas semantic segmentation strives\nto create semantically homogeneous segments. To address this issue, we propose\na novel framework, Image-Text Co-Decomposition (CoDe), where the paired image\nand text are jointly decomposed into a set of image regions and a set of word\nsegments, respectively, and contrastive learning is developed to enforce\nregion-word alignment. To work with a vision-language model, we present a\nprompt learning mechanism that derives an extra representation to highlight an\nimage segment or a word segment of interest, with which more effective features\ncan be extracted from that segment. Comprehensive experimental results\ndemonstrate that our method performs favorably against existing text-supervised\nsemantic segmentation methods on six benchmark datasets.\n","authors":["Ji-Jia Wu","Andy Chia-Hao Chang","Chieh-Yu Chuang","Chun-Pei Chen","Yu-Lun Liu","Min-Hung Chen","Hou-Ning Hu","Yung-Yu Chuang","Yen-Yu Lin"],"pdf_url":"https://arxiv.org/pdf/2404.04231v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.02257v2","updated":"2024-04-05T17:02:31Z","published":"2024-04-02T19:25:04Z","title":"SnAG: Scalable and Accurate Video Grounding","summary":" Temporal grounding of text descriptions in videos is a central problem in\nvision-language learning and video understanding. Existing methods often\nprioritize accuracy over scalability -- they have been optimized for grounding\nonly a few text queries within short videos, and fail to scale up to long\nvideos with hundreds of queries. In this paper, we study the effect of\ncross-modal fusion on the scalability of video grounding models. Our analysis\nestablishes late fusion as a more cost-effective fusion scheme for long-form\nvideos with many text queries. Moreover, it leads us to a novel, video-centric\nsampling scheme for efficient training. Based on these findings, we present\nSnAG, a simple baseline for scalable and accurate video grounding. Without\nbells and whistles, SnAG is 43% more accurate and 1.5x faster than CONE, a\nstate of the art for long-form video grounding on the challenging MAD dataset,\nwhile achieving highly competitive results on short videos.\n","authors":["Fangzhou Mu","Sicheng Mo","Yin Li"],"pdf_url":"https://arxiv.org/pdf/2404.02257v2.pdf","comment":"Accepted to CVPR 2024. 
Code available at\n https://github.com/fmu2/snag_release"},{"id":"http://arxiv.org/abs/2402.15584v2","updated":"2024-04-05T17:01:34Z","published":"2024-02-23T19:51:55Z","title":"State Space Models for Event Cameras","summary":" Today, state-of-the-art deep neural networks that process event-camera data\nfirst convert a temporal window of events into dense, grid-like input\nrepresentations. As such, they exhibit poor generalizability when deployed at\nhigher inference frequencies (i.e., smaller temporal windows) than the ones\nthey were trained on. We address this challenge by introducing state-space\nmodels (SSMs) with learnable timescale parameters to event-based vision. This\ndesign adapts to varying frequencies without the need to retrain the network at\ndifferent frequencies. Additionally, we investigate two strategies to\ncounteract aliasing effects when deploying the model at higher frequencies. We\ncomprehensively evaluate our approach against existing methods based on RNN and\nTransformer architectures across various benchmarks, including Gen1 and 1 Mpx\nevent camera datasets. Our results demonstrate that SSM-based models train 33%\nfaster and also exhibit minimal performance degradation when tested at higher\nfrequencies than the training input. Traditional RNN and Transformer models\nexhibit performance drops of more than 20 mAP, with SSMs having a drop of 3.31\nmAP, highlighting the effectiveness of SSMs in event-based vision tasks.\n","authors":["Nikola Zubić","Mathias Gehrig","Davide Scaramuzza"],"pdf_url":"https://arxiv.org/pdf/2402.15584v2.pdf","comment":"18 pages, 5 figures, 6 tables, CVPR 2024 Camera Ready paper"},{"id":"http://arxiv.org/abs/2301.07002v3","updated":"2024-04-05T16:50:13Z","published":"2023-01-17T16:44:48Z","title":"Opti-CAM: Optimizing saliency maps for interpretability","summary":" Methods based on class activation maps (CAM) provide a simple mechanism to\ninterpret predictions of convolutional neural networks by using linear\ncombinations of feature maps as saliency maps. By contrast, masking-based\nmethods optimize a saliency map directly in the image space or learn it by\ntraining another network on additional data.\n In this work we introduce Opti-CAM, combining ideas from CAM-based and\nmasking-based approaches. Our saliency map is a linear combination of feature\nmaps, where weights are optimized per image such that the logit of the masked\nimage for a given class is maximized. We also fix a fundamental flaw in two of\nthe most common evaluation metrics of attribution methods. On several datasets,\nOpti-CAM largely outperforms other CAM-based approaches according to the most\nrelevant classification metrics. We provide empirical evidence supporting that\nlocalization and classifier interpretability are not necessarily aligned.\n","authors":["Hanwei Zhang","Felipe Torres","Ronan Sicre","Yannis Avrithis","Stephane Ayache"],"pdf_url":"https://arxiv.org/pdf/2301.07002v3.pdf","comment":"This work is under consideration at \"Computer Vision and Image\n Understanding\""},{"id":"http://arxiv.org/abs/2404.04211v1","updated":"2024-04-05T16:42:16Z","published":"2024-04-05T16:42:16Z","title":"Robust Gaussian Splatting","summary":" In this paper, we address common error sources for 3D Gaussian Splatting\n(3DGS) including blur, imperfect camera poses, and color inconsistencies, with\nthe goal of improving its robustness for practical applications like\nreconstructions from handheld phone captures. 
Our main contribution involves\nmodeling motion blur as a Gaussian distribution over camera poses, allowing us\nto address both camera pose refinement and motion blur correction in a unified\nway. Additionally, we propose mechanisms for defocus blur compensation and for\naddressing color in-consistencies caused by ambient light, shadows, or due to\ncamera-related factors like varying white balancing settings. Our proposed\nsolutions integrate in a seamless way with the 3DGS formulation while\nmaintaining its benefits in terms of training efficiency and rendering speed.\nWe experimentally validate our contributions on relevant benchmark datasets\nincluding Scannet++ and Deblur-NeRF, obtaining state-of-the-art results and\nthus consistent improvements over relevant baselines.\n","authors":["François Darmon","Lorenzo Porzi","Samuel Rota-Bulò","Peter Kontschieder"],"pdf_url":"https://arxiv.org/pdf/2404.04211v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04202v1","updated":"2024-04-05T16:25:39Z","published":"2024-04-05T16:25:39Z","title":"Deep-learning Segmentation of Small Volumes in CT images for\n Radiotherapy Treatment Planning","summary":" Our understanding of organs at risk is progressing to include physical small\ntissues such as coronary arteries and the radiosensitivities of many small\norgans and tissues are high. Therefore, the accurate segmentation of small\nvolumes in external radiotherapy is crucial to protect them from\nover-irradiation. Moreover, with the development of the particle therapy and\non-board imaging, the treatment becomes more accurate and precise. The purpose\nof this work is to optimize organ segmentation algorithms for small organs. We\nused 50 three-dimensional (3-D) computed tomography (CT) head and neck images\nfrom StructSeg2019 challenge to develop a general-purpose V-Net model to\nsegment 20 organs in the head and neck region. We applied specific strategies\nto improve the segmentation accuracy of the small volumes in this anatomical\nregion, i.e., the lens of the eye. Then, we used 17 additional head images from\nOSF healthcare to validate the robustness of the V Net model optimized for\nsmall-volume segmentation. With the study of the StructSeg2019 images, we found\nthat the optimization of the image normalization range and classification\nthreshold yielded a segmentation improvement of the lens of the eye of\napproximately 50%, compared to the use of the V-Net not optimized for small\nvolumes. We used the optimized model to segment 17 images acquired using\nheterogeneous protocols. We obtained comparable Dice coefficient values for the\nclinical and StructSeg2019 images (0.61 plus/minus 0.07 and 0.58 plus/minus\n0.10 for the left and right lens of the eye, respectively)\n","authors":["Jianxin Zhou","Kadishe Fejza","Massimiliano Salvatori","Daniele Della Latta","Gregory M. Hermann","Angela Di Fulvio"],"pdf_url":"https://arxiv.org/pdf/2404.04202v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.01482v4","updated":"2024-04-05T16:11:19Z","published":"2024-03-03T11:24:16Z","title":"EAGLE: Eigen Aggregation Learning for Object-Centric Unsupervised\n Semantic Segmentation","summary":" Semantic segmentation has innately relied on extensive pixel-level annotated\ndata, leading to the emergence of unsupervised methodologies. Among them,\nleveraging self-supervised Vision Transformers for unsupervised semantic\nsegmentation (USS) has been making steady progress with expressive deep\nfeatures. 
Yet, for semantically segmenting images with complex objects, a\npredominant challenge remains: the lack of explicit object-level semantic\nencoding in patch-level features. This technical limitation often leads to\ninadequate segmentation of complex objects with diverse structures. To address\nthis gap, we present a novel approach, EAGLE, which emphasizes object-centric\nrepresentation learning for unsupervised semantic segmentation. Specifically,\nwe introduce EiCue, a spectral technique providing semantic and structural cues\nthrough an eigenbasis derived from the semantic similarity matrix of deep image\nfeatures and color affinity from an image. Further, by incorporating our\nobject-centric contrastive loss with EiCue, we guide our model to learn\nobject-level representations with intra- and inter-image object-feature\nconsistency, thereby enhancing semantic accuracy. Extensive experiments on\nCOCO-Stuff, Cityscapes, and Potsdam-3 datasets demonstrate the state-of-the-art\nUSS results of EAGLE with accurate and consistent semantic segmentation across\ncomplex scenes.\n","authors":["Chanyoung Kim","Woojung Han","Dayun Ju","Seong Jae Hwang"],"pdf_url":"https://arxiv.org/pdf/2403.01482v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00185v2","updated":"2024-04-05T16:10:44Z","published":"2024-03-29T22:51:45Z","title":"On Inherent Adversarial Robustness of Active Vision Systems","summary":" Current Deep Neural Networks are vulnerable to adversarial examples, which\nalter their predictions by adding carefully crafted noise. Since human eyes are\nrobust to such inputs, it is possible that the vulnerability stems from the\nstandard way of processing inputs in one shot by processing every pixel with\nthe same importance. In contrast, neuroscience suggests that the human vision\nsystem can differentiate salient features by (1) switching between multiple\nfixation points (saccades) and (2) processing the surrounding with a\nnon-uniform external resolution (foveation). In this work, we advocate that the\nintegration of such active vision mechanisms into current deep learning systems\ncan offer robustness benefits. Specifically, we empirically demonstrate the\ninherent robustness of two active vision methods - GFNet and FALcon - under a\nblack box threat model. By learning and inferencing based on downsampled\nglimpses obtained from multiple distinct fixation points within an input, we\nshow that these active methods achieve (2-3) times greater robustness compared\nto a standard passive convolutional network under state-of-the-art adversarial\nattacks. More importantly, we provide illustrative and interpretable\nvisualization analysis that demonstrates how performing inference from distinct\nfixation points makes active vision methods less vulnerable to malicious\ninputs.\n","authors":["Amitangshu Mukherjee","Timur Ibrayev","Kaushik Roy"],"pdf_url":"https://arxiv.org/pdf/2404.00185v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10427v2","updated":"2024-04-05T16:04:40Z","published":"2024-03-15T16:00:04Z","title":"SWAG: Splatting in the Wild images with Appearance-conditioned Gaussians","summary":" Implicit neural representation methods have shown impressive advancements in\nlearning 3D scenes from unstructured in-the-wild photo collections but are\nstill limited by the large computational cost of volumetric rendering. 
More\nrecently, 3D Gaussian Splatting emerged as a much faster alternative with\nsuperior rendering quality and training efficiency, especially for small-scale\nand object-centric scenarios. Nevertheless, this technique suffers from poor\nperformance on unstructured in-the-wild data. To tackle this, we extend over 3D\nGaussian Splatting to handle unstructured image collections. We achieve this by\nmodeling appearance to seize photometric variations in the rendered images.\nAdditionally, we introduce a new mechanism to train transient Gaussians to\nhandle the presence of scene occluders in an unsupervised manner. Experiments\non diverse photo collection scenes and multi-pass acquisition of outdoor\nlandmarks show the effectiveness of our method over prior works achieving\nstate-of-the-art results with improved efficiency.\n","authors":["Hiba Dahmani","Moussab Bennehar","Nathan Piasco","Luis Roldao","Dzmitry Tsishkou"],"pdf_url":"https://arxiv.org/pdf/2403.10427v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04179v1","updated":"2024-04-05T15:48:36Z","published":"2024-04-05T15:48:36Z","title":"SCAResNet: A ResNet Variant Optimized for Tiny Object Detection in\n Transmission and Distribution Towers","summary":" Traditional deep learning-based object detection networks often resize images\nduring the data preprocessing stage to achieve a uniform size and scale in the\nfeature map. Resizing is done to facilitate model propagation and fully\nconnected classification. However, resizing inevitably leads to object\ndeformation and loss of valuable information in the images. This drawback\nbecomes particularly pronounced for tiny objects like distribution towers with\nlinear shapes and few pixels. To address this issue, we propose abandoning the\nresizing operation. Instead, we introduce Positional-Encoding Multi-head\nCriss-Cross Attention. This allows the model to capture contextual information\nand learn from multiple representation subspaces, effectively enriching the\nsemantics of distribution towers. Additionally, we enhance Spatial Pyramid\nPooling by reshaping three pooled feature maps into a new unified one while\nalso reducing the computational burden. This approach allows images of\ndifferent sizes and scales to generate feature maps with uniform dimensions and\ncan be employed in feature map propagation. Our SCAResNet incorporates these\naforementioned improvements into the backbone network ResNet. We evaluated our\nSCAResNet using the Electric Transmission and Distribution Infrastructure\nImagery dataset from Duke University. Without any additional tricks, we\nemployed various object detection models with Gaussian Receptive Field based\nLabel Assignment as the baseline. When incorporating the SCAResNet into the\nbaseline model, we achieved a 2.1% improvement in mAPs. This demonstrates the\nadvantages of our SCAResNet in detecting transmission and distribution towers\nand its value in tiny object detection. The source code is available at\nhttps://github.com/LisavilaLee/SCAResNet_mmdet.\n","authors":["Weile Li","Muqing Shi","Zhonghua Hong"],"pdf_url":"https://arxiv.org/pdf/2404.04179v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09915v2","updated":"2024-04-05T15:45:48Z","published":"2023-07-19T11:35:21Z","title":"Embedded Heterogeneous Attention Transformer for Cross-lingual Image\n Captioning","summary":" Cross-lingual image captioning is a challenging task that requires addressing\nboth cross-lingual and cross-modal obstacles in multimedia analysis. 
The\ncrucial issue in this task is to model the global and the local matching\nbetween the image and different languages. Existing cross-modal embedding\nmethods based on the transformer architecture oversee the local matching\nbetween the image region and monolingual words, especially when dealing with\ndiverse languages. To overcome these limitations, we propose an Embedded\nHeterogeneous Attention Transformer (EHAT) to establish cross-domain\nrelationships and local correspondences between images and different languages\nby using a heterogeneous network. EHAT comprises Masked Heterogeneous\nCross-attention (MHCA), Heterogeneous Attention Reasoning Network (HARN), and\nHeterogeneous Co-attention (HCA). The HARN serves as the core network and it\ncaptures cross-domain relationships by leveraging visual bounding box\nrepresentation features to connect word features from two languages and to\nlearn heterogeneous maps. MHCA and HCA facilitate cross-domain integration in\nthe encoder through specialized heterogeneous attention mechanisms, enabling a\nsingle model to generate captions in two languages. We evaluate our approach on\nthe MSCOCO dataset to generate captions in English and Chinese, two languages\nthat exhibit significant differences in their language families. The\nexperimental results demonstrate the superior performance of our method\ncompared to existing advanced monolingual methods. Our proposed EHAT framework\neffectively addresses the challenges of cross-lingual image captioning, paving\nthe way for improved multilingual image analysis and understanding.\n","authors":["Zijie Song","Zhenzhen Hu","Yuanen Zhou","Ye Zhao","Richang Hong","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2307.09915v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07087v2","updated":"2024-04-05T15:42:13Z","published":"2024-02-11T02:34:42Z","title":"Self-Correcting Self-Consuming Loops for Generative Model Training","summary":" As synthetic data becomes higher quality and proliferates on the internet,\nmachine learning models are increasingly trained on a mix of human- and\nmachine-generated data. Despite the successful stories of using synthetic data\nfor representation learning, using synthetic data for generative model training\ncreates \"self-consuming loops\" which may lead to training instability or even\ncollapse, unless certain conditions are met. Our paper aims to stabilize\nself-consuming generative model training. Our theoretical results demonstrate\nthat by introducing an idealized correction function, which maps a data point\nto be more likely under the true data distribution, self-consuming loops can be\nmade exponentially more stable. We then propose self-correction functions,\nwhich rely on expert knowledge (e.g. the laws of physics programmed in a\nsimulator), and aim to approximate the idealized corrector automatically and at\nscale. We empirically validate the effectiveness of self-correcting\nself-consuming loops on the challenging human motion synthesis task, and\nobserve that it successfully avoids model collapse, even when the ratio of\nsynthetic data to real data is as high as 100%.\n","authors":["Nate Gillman","Michael Freeman","Daksh Aggarwal","Chia-Hong Hsu","Calvin Luo","Yonglong Tian","Chen Sun"],"pdf_url":"https://arxiv.org/pdf/2402.07087v2.pdf","comment":"This new version contains updated mathematical results (c.f. 
Remark\n 4.4), as well as experiments for an additional generative modeling task.\n Paper under submission; code is available at\n https://nategillman.com/sc-sc.html"},{"id":"http://arxiv.org/abs/2311.08046v3","updated":"2024-04-05T15:21:09Z","published":"2023-11-14T10:11:36Z","title":"Chat-UniVi: Unified Visual Representation Empowers Large Language Models\n with Image and Video Understanding","summary":" Large language models have demonstrated impressive universal capabilities\nacross a wide range of open-ended tasks and have extended their utility to\nencompass multimodal conversations. However, existing methods encounter\nchallenges in effectively handling both image and video understanding,\nparticularly with limited visual tokens. In this work, we introduce Chat-UniVi,\na Unified Vision-language model capable of comprehending and engaging in\nconversations involving images and videos through a unified visual\nrepresentation. Specifically, we employ a set of dynamic visual tokens to\nuniformly represent images and videos. This representation framework empowers\nthe model to efficiently utilize a limited number of visual tokens to\nsimultaneously capture the spatial details necessary for images and the\ncomprehensive temporal relationship required for videos. Moreover, we leverage\na multi-scale representation, enabling the model to perceive both high-level\nsemantic concepts and low-level visual details. Notably, Chat-UniVi is trained\non a mixed dataset containing both images and videos, allowing direct\napplication to tasks involving both mediums without requiring any\nmodifications. Extensive experimental results demonstrate that Chat-UniVi\nconsistently outperforms even existing methods exclusively designed for either\nimages or videos. Code is available at\nhttps://github.com/PKU-YuanGroup/Chat-UniVi.\n","authors":["Peng Jin","Ryuichi Takanobu","Wancai Zhang","Xiaochun Cao","Li Yuan"],"pdf_url":"https://arxiv.org/pdf/2311.08046v3.pdf","comment":"Accepted by CVPR 2024 (Highlight)"},{"id":"http://arxiv.org/abs/2404.04159v1","updated":"2024-04-05T15:11:09Z","published":"2024-04-05T15:11:09Z","title":"Noisy Label Processing for Classification: A Survey","summary":" In recent years, deep neural networks (DNNs) have gained remarkable\nachievement in computer vision tasks, and the success of DNNs often depends\ngreatly on the richness of data. However, the acquisition process of data and\nhigh-quality ground truth requires a lot of manpower and money. In the long,\ntedious process of data annotation, annotators are prone to make mistakes,\nresulting in incorrect labels of images, i.e., noisy labels. The emergence of\nnoisy labels is inevitable. Moreover, since research shows that DNNs can easily\nfit noisy labels, the existence of noisy labels will cause significant damage\nto the model training process. Therefore, it is crucial to combat noisy labels\nfor computer vision tasks, especially for classification tasks. In this survey,\nwe first comprehensively review the evolution of different deep learning\napproaches for noisy label combating in the image classification task. In\naddition, we also review different noise patterns that have been proposed to\ndesign robust algorithms. Furthermore, we explore the inner pattern of\nreal-world label noise and propose an algorithm to generate a synthetic label\nnoise pattern guided by real-world data. 
We test the algorithm on the\nwell-known real-world dataset CIFAR-10N to form a new real-world data-guided\nsynthetic benchmark and evaluate some typical noise-robust methods on the\nbenchmark.\n","authors":["Mengting Li","Chuang Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.04159v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04155v1","updated":"2024-04-05T15:04:57Z","published":"2024-04-05T15:04:57Z","title":"MarsSeg: Mars Surface Semantic Segmentation with Multi-level Extractor\n and Connector","summary":" The segmentation and interpretation of the Martian surface play a pivotal\nrole in Mars exploration, providing essential data for the trajectory planning\nand obstacle avoidance of rovers. However, the complex topography, similar\nsurface features, and the lack of extensive annotated data pose significant\nchallenges to the high-precision semantic segmentation of the Martian surface.\nTo address these challenges, we propose a novel encoder-decoder based Mars\nsegmentation network, termed MarsSeg. Specifically, we employ an\nencoder-decoder structure with a minimized number of down-sampling layers to\npreserve local details. To facilitate a high-level semantic understanding\nacross the shadow multi-level feature maps, we introduce a feature enhancement\nconnection layer situated between the encoder and decoder. This layer\nincorporates Mini Atrous Spatial Pyramid Pooling (Mini-ASPP), Polarized\nSelf-Attention (PSA), and Strip Pyramid Pooling Module (SPPM). The Mini-ASPP\nand PSA are specifically designed for shadow feature enhancement, thereby\nenabling the expression of local details and small objects. Conversely, the\nSPPM is employed for deep feature enhancement, facilitating the extraction of\nhigh-level semantic category-related information. Experimental results derived\nfrom the Mars-Seg and AI4Mars datasets substantiate that the proposed MarsSeg\noutperforms other state-of-the-art methods in segmentation performance,\nvalidating the efficacy of each proposed component.\n","authors":["Junbo Li","Keyan Chen","Gengju Tian","Lu Li","Zhenwei Shi"],"pdf_url":"https://arxiv.org/pdf/2404.04155v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14239v2","updated":"2024-04-05T15:00:58Z","published":"2023-12-21T18:59:53Z","title":"PlatoNeRF: 3D Reconstruction in Plato's Cave via Single-View Two-Bounce\n Lidar","summary":" 3D reconstruction from a single-view is challenging because of the ambiguity\nfrom monocular cues and lack of information about occluded regions. Neural\nradiance fields (NeRF), while popular for view synthesis and 3D reconstruction,\nare typically reliant on multi-view images. Existing methods for single-view 3D\nreconstruction with NeRF rely on either data priors to hallucinate views of\noccluded regions, which may not be physically accurate, or shadows observed by\nRGB cameras, which are difficult to detect in ambient light and low albedo\nbackgrounds. We propose using time-of-flight data captured by a single-photon\navalanche diode to overcome these limitations. Our method models two-bounce\noptical paths with NeRF, using lidar transient data for supervision. By\nleveraging the advantages of both NeRF and two-bounce light measured by lidar,\nwe demonstrate that we can reconstruct visible and occluded geometry without\ndata priors or reliance on controlled ambient lighting or scene albedo. In\naddition, we demonstrate improved generalization under practical constraints on\nsensor spatial- and temporal-resolution. 
We believe our method is a promising\ndirection as single-photon lidars become ubiquitous on consumer devices, such\nas phones, tablets, and headsets.\n","authors":["Tzofi Klinghoffer","Xiaoyu Xiang","Siddharth Somasundaram","Yuchen Fan","Christian Richardt","Ramesh Raskar","Rakesh Ranjan"],"pdf_url":"https://arxiv.org/pdf/2312.14239v2.pdf","comment":"CVPR 2024. Project Page: https://platonerf.github.io/"},{"id":"http://arxiv.org/abs/2402.01779v2","updated":"2024-04-05T14:57:56Z","published":"2024-02-01T18:05:47Z","title":"Plug-and-Play image restoration with Stochastic deNOising REgularization","summary":" Plug-and-Play (PnP) algorithms are a class of iterative algorithms that\naddress image inverse problems by combining a physical model and a deep neural\nnetwork for regularization. Even if they produce impressive image restoration\nresults, these algorithms rely on a non-standard use of a denoiser on images\nthat are less and less noisy along the iterations, which contrasts with recent\nalgorithms based on Diffusion Models (DM), where the denoiser is applied only\non re-noised images. We propose a new PnP framework, called Stochastic\ndeNOising REgularization (SNORE), which applies the denoiser only on images\nwith noise of the adequate level. It is based on an explicit stochastic\nregularization, which leads to a stochastic gradient descent algorithm to solve\nill-posed inverse problems. A convergence analysis of this algorithm and its\nannealing extension is provided. Experimentally, we prove that SNORE is\ncompetitive with respect to state-of-the-art methods on deblurring and\ninpainting tasks, both quantitatively and qualitatively.\n","authors":["Marien Renaud","Jean Prost","Arthur Leclaire","Nicolas Papadakis"],"pdf_url":"https://arxiv.org/pdf/2402.01779v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02072v3","updated":"2024-04-05T14:48:43Z","published":"2024-04-02T16:20:02Z","title":"EGTR: Extracting Graph from Transformer for Scene Graph Generation","summary":" Scene Graph Generation (SGG) is a challenging task of detecting objects and\npredicting relationships between objects. After DETR was developed, one-stage\nSGG models based on a one-stage object detector have been actively studied.\nHowever, complex modeling is used to predict the relationship between objects,\nand the inherent relationship between object queries learned in the multi-head\nself-attention of the object detector has been neglected. We propose a\nlightweight one-stage SGG model that extracts the relation graph from the\nvarious relationships learned in the multi-head self-attention layers of the\nDETR decoder. By fully utilizing the self-attention by-products, the relation\ngraph can be extracted effectively with a shallow relation extraction head.\nConsidering the dependency of the relation extraction task on the object\ndetection task, we propose a novel relation smoothing technique that adjusts\nthe relation label adaptively according to the quality of the detected objects.\nBy the relation smoothing, the model is trained according to the continuous\ncurriculum that focuses on object detection task at the beginning of training\nand performs multi-task learning as the object detection performance gradually\nimproves. Furthermore, we propose a connectivity prediction task that predicts\nwhether a relation exists between object pairs as an auxiliary task of the\nrelation extraction. We demonstrate the effectiveness and efficiency of our\nmethod for the Visual Genome and Open Image V6 datasets. 
Our code is publicly\navailable at https://github.com/naver-ai/egtr.\n","authors":["Jinbae Im","JeongYeon Nam","Nokyung Park","Hyungmin Lee","Seunghyun Park"],"pdf_url":"https://arxiv.org/pdf/2404.02072v3.pdf","comment":"CVPR 2024 (Oral)"},{"id":"http://arxiv.org/abs/2312.00690v3","updated":"2024-04-05T14:44:27Z","published":"2023-12-01T16:17:16Z","title":"Open-vocabulary object 6D pose estimation","summary":" We introduce the new setting of open-vocabulary object 6D pose estimation, in\nwhich a textual prompt is used to specify the object of interest. In contrast\nto existing approaches, in our setting (i) the object of interest is specified\nsolely through the textual prompt, (ii) no object model (e.g., CAD or video\nsequence) is required at inference, and (iii) the object is imaged from two\nRGBD viewpoints of different scenes. To operate in this setting, we introduce a\nnovel approach that leverages a Vision-Language Model to segment the object of\ninterest from the scenes and to estimate its relative 6D pose. The key of our\napproach is a carefully devised strategy to fuse object-level information\nprovided by the prompt with local image features, resulting in a feature space\nthat can generalize to novel concepts. We validate our approach on a new\nbenchmark based on two popular datasets, REAL275 and Toyota-Light, which\ncollectively encompass 34 object instances appearing in four thousand image\npairs. The results demonstrate that our approach outperforms both a\nwell-established hand-crafted method and a recent deep learning-based baseline\nin estimating the relative 6D pose of objects in different scenes. Code and\ndataset are available at https://jcorsetti.github.io/oryon.\n","authors":["Jaime Corsetti","Davide Boscaini","Changjae Oh","Andrea Cavallaro","Fabio Poiesi"],"pdf_url":"https://arxiv.org/pdf/2312.00690v3.pdf","comment":"Camera ready version (CVPR 2024, poster highlight). 21 pages, 15\n figures, 6 tables"},{"id":"http://arxiv.org/abs/2404.04140v1","updated":"2024-04-05T14:39:13Z","published":"2024-04-05T14:39:13Z","title":"Improving Detection in Aerial Images by Capturing Inter-Object\n Relationships","summary":" In many image domains, the spatial distribution of objects in a scene\nexhibits meaningful patterns governed by their semantic relationships. In most\nmodern detection pipelines, however, the detection proposals are processed\nindependently, overlooking the underlying relationships between objects. In\nthis work, we introduce a transformer-based approach to capture these\ninter-object relationships to refine classification and regression outcomes for\ndetected objects. Building on two-stage detectors, we tokenize the region of\ninterest (RoI) proposals to be processed by a transformer encoder. Specific\nspatial and geometric relations are incorporated into the attention weights and\nadaptively modulated and regularized. Experimental results demonstrate that the\nproposed method achieves consistent performance improvement on three benchmarks\nincluding DOTA-v1.0, DOTA-v1.5, and HRSC 2016, especially ranking first on both\nDOTA-v1.5 and HRSC 2016. 
Specifically, our new method has an increase of 1.59\nmAP on DOTA-v1.0, 4.88 mAP on DOTA-v1.5, and 2.1 mAP on HRSC 2016,\nrespectively, compared to the baselines.\n","authors":["Botao Ren","Botian Xu","Yifan Pu","Jingyi Wang","Zhidong Deng"],"pdf_url":"https://arxiv.org/pdf/2404.04140v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.20092v4","updated":"2024-04-05T14:35:26Z","published":"2023-10-31T00:12:14Z","title":"The Missing U for Efficient Diffusion Models","summary":" Diffusion Probabilistic Models stand as a critical tool in generative\nmodelling, enabling the generation of complex data distributions. This family\nof generative models yields record-breaking performance in tasks such as image\nsynthesis, video generation, and molecule design. Despite their capabilities,\ntheir efficiency, especially in the reverse process, remains a challenge due to\nslow convergence rates and high computational costs. In this paper, we\nintroduce an approach that leverages continuous dynamical systems to design a\nnovel denoising network for diffusion models that is more parameter-efficient,\nexhibits faster convergence, and demonstrates increased noise robustness.\nExperimenting with Denoising Diffusion Probabilistic Models (DDPMs), our\nframework operates with approximately a quarter of the parameters, and $\\sim$\n30\\% of the Floating Point Operations (FLOPs) compared to standard U-Nets in\nDDPMs. Furthermore, our model is notably faster in inference than the baseline\nwhen measured in fair and equal conditions. We also provide a mathematical\nintuition as to why our proposed reverse process is faster as well as a\nmathematical discussion of the empirical tradeoffs in the denoising downstream\ntask. Finally, we argue that our method is compatible with existing performance\nenhancement techniques, enabling further improvements in efficiency, quality,\nand speed.\n","authors":["Sergio Calvo-Ordonez","Chun-Wun Cheng","Jiahao Huang","Lipei Zhang","Guang Yang","Carola-Bibiane Schonlieb","Angelica I Aviles-Rivero"],"pdf_url":"https://arxiv.org/pdf/2310.20092v4.pdf","comment":"23 pages, 14 figures, Accepted at Transactions of Machine Learning\n Research (04/2024)"},{"id":"http://arxiv.org/abs/2304.03560v2","updated":"2024-04-05T14:07:25Z","published":"2023-04-07T09:46:29Z","title":"DualRefine: Self-Supervised Depth and Pose Estimation Through Iterative\n Epipolar Sampling and Refinement Toward Equilibrium","summary":" Self-supervised multi-frame depth estimation achieves high accuracy by\ncomputing matching costs of pixel correspondences between adjacent frames,\ninjecting geometric information into the network. These pixel-correspondence\ncandidates are computed based on the relative pose estimates between the\nframes. Accurate pose predictions are essential for precise matching cost\ncomputation as they influence the epipolar geometry. Furthermore, improved\ndepth estimates can, in turn, be used to align pose estimates.\n Inspired by traditional structure-from-motion (SfM) principles, we propose\nthe DualRefine model, which tightly couples depth and pose estimation through a\nfeedback loop. Our novel update pipeline uses a deep equilibrium model\nframework to iteratively refine depth estimates and a hidden state of feature\nmaps by computing local matching costs based on epipolar geometry. Importantly,\nwe used the refined depth estimates and feature maps to compute pose updates at\neach step. This update in the pose estimates slowly alters the epipolar\ngeometry during the refinement process. 
Experimental results on the KITTI\ndataset demonstrate competitive depth prediction and odometry prediction\nperformance surpassing published self-supervised baselines.\n","authors":["Antyanta Bangunharcana","Ahmed Magd","Kyung-Soo Kim"],"pdf_url":"https://arxiv.org/pdf/2304.03560v2.pdf","comment":"CVPR 2023. Project page:\n https://antabangun.github.io/projects/DualRefine/ Code:\n https://github.com/antabangun/DualRefine"},{"id":"http://arxiv.org/abs/2404.04104v1","updated":"2024-04-05T14:00:07Z","published":"2024-04-05T14:00:07Z","title":"3D Facial Expressions through Analysis-by-Neural-Synthesis","summary":" While existing methods for 3D face reconstruction from in-the-wild images\nexcel at recovering the overall face shape, they commonly miss subtle, extreme,\nasymmetric, or rarely observed expressions. We improve upon these methods with\nSMIRK (Spatial Modeling for Image-based Reconstruction of Kinesics), which\nfaithfully reconstructs expressive 3D faces from images. We identify two key\nlimitations in existing methods: shortcomings in their self-supervised training\nformulation, and a lack of expression diversity in the training images. For\ntraining, most methods employ differentiable rendering to compare a predicted\nface mesh with the input image, along with a plethora of additional loss\nfunctions. This differentiable rendering loss not only has to provide\nsupervision to optimize for 3D face geometry, camera, albedo, and lighting,\nwhich is an ill-posed optimization problem, but the domain gap between\nrendering and input image further hinders the learning process. Instead, SMIRK\nreplaces the differentiable rendering with a neural rendering module that,\ngiven the rendered predicted mesh geometry, and sparsely sampled pixels of the\ninput image, generates a face image. As the neural rendering gets color\ninformation from sampled image pixels, supervising with neural rendering-based\nreconstruction loss can focus solely on the geometry. Further, it enables us to\ngenerate images of the input identity with varying expressions while training.\nThese are then utilized as input to the reconstruction model and used as\nsupervision with ground truth geometry. This effectively augments the training\ndata and enhances the generalization for diverse expressions. Our qualitative,\nquantitative and particularly our perceptual evaluations demonstrate that SMIRK\nachieves the new state-of-the art performance on accurate expression\nreconstruction. Project webpage: https://georgeretsi.github.io/smirk/.\n","authors":["George Retsinas","Panagiotis P. Filntisis","Radek Danecek","Victoria F. Abrevaya","Anastasios Roussos","Timo Bolkart","Petros Maragos"],"pdf_url":"https://arxiv.org/pdf/2404.04104v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02702v2","updated":"2024-04-05T13:49:17Z","published":"2023-12-05T12:04:34Z","title":"Neural Sign Actors: A diffusion model for 3D sign language production\n from text","summary":" Sign Languages (SL) serve as the primary mode of communication for the Deaf\nand Hard of Hearing communities. Deep learning methods for SL recognition and\ntranslation have achieved promising results. However, Sign Language Production\n(SLP) poses a challenge as the generated motions must be realistic and have\nprecise semantic meaning. Most SLP methods rely on 2D data, which hinders their\nrealism. In this work, a diffusion-based SLP model is trained on a curated\nlarge-scale dataset of 4D signing avatars and their corresponding text\ntranscripts. 
The proposed method can generate dynamic sequences of 3D avatars\nfrom an unconstrained domain of discourse using a diffusion process formed on a\nnovel and anatomically informed graph neural network defined on the SMPL-X body\nskeleton. Through quantitative and qualitative experiments, we show that the\nproposed method considerably outperforms previous methods of SLP. This work\nmakes an important step towards realistic neural sign avatars, bridging the\ncommunication gap between Deaf and hearing communities.\n","authors":["Vasileios Baltatzis","Rolandos Alexandros Potamias","Evangelos Ververas","Guanxiong Sun","Jiankang Deng","Stefanos Zafeiriou"],"pdf_url":"https://arxiv.org/pdf/2312.02702v2.pdf","comment":"Accepted at CVPR 2024, Project page:\n https://baltatzisv.github.io/neural-sign-actors/"},{"id":"http://arxiv.org/abs/2312.06420v2","updated":"2024-04-05T13:45:11Z","published":"2023-12-11T14:43:23Z","title":"Localization Is All You Evaluate: Data Leakage in Online Mapping\n Datasets and How to Fix It","summary":" The task of online mapping is to predict a local map using current sensor\nobservations, e.g. from lidar and camera, without relying on a pre-built map.\nState-of-the-art methods are based on supervised learning and are trained\npredominantly using two datasets: nuScenes and Argoverse 2. However, these\ndatasets revisit the same geographic locations across training, validation, and\ntest sets. Specifically, over $80$% of nuScenes and $40$% of Argoverse 2\nvalidation and test samples are less than $5$ m from a training sample. At test\ntime, the methods are thus evaluated more on how well they localize within a\nmemorized implicit map built from the training data than on extrapolating to\nunseen locations. Naturally, this data leakage causes inflated performance\nnumbers and we propose geographically disjoint data splits to reveal the true\nperformance in unseen environments. Experimental results show that methods\nperform considerably worse, some dropping more than $45$ mAP, when trained and\nevaluated on proper data splits. Additionally, a reassessment of prior design\nchoices reveals diverging conclusions from those based on the original split.\nNotably, the impact of lifting methods and the support from auxiliary tasks\n(e.g., depth supervision) on performance appears less substantial or follows a\ndifferent trajectory than previously perceived. Splits can be found at\nhttps://github.com/LiljaAdam/geographical-splits\n","authors":["Adam Lilja","Junsheng Fu","Erik Stenborg","Lars Hammarstrand"],"pdf_url":"https://arxiv.org/pdf/2312.06420v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04095v1","updated":"2024-04-05T13:44:39Z","published":"2024-04-05T13:44:39Z","title":"Dynamic Prompt Optimizing for Text-to-Image Generation","summary":" Text-to-image generative models, specifically those based on diffusion models\nlike Imagen and Stable Diffusion, have made substantial advancements. Recently,\nthere has been a surge of interest in the delicate refinement of text prompts.\nUsers assign weights or alter the injection time steps of certain words in the\ntext prompts to improve the quality of generated images. However, the success\nof fine-control prompts depends on the accuracy of the text prompts and the\ncareful selection of weights and time steps, which requires significant manual\nintervention. To address this, we introduce the \\textbf{P}rompt\n\\textbf{A}uto-\\textbf{E}diting (PAE) method. 
Besides refining the original\nprompts for image generation, we further employ an online reinforcement\nlearning strategy to explore the weights and injection time steps of each word,\nleading to the dynamic fine-control prompts. The reward function during\ntraining encourages the model to consider aesthetic score, semantic\nconsistency, and user preferences. Experimental results demonstrate that our\nproposed method effectively improves the original prompts, generating visually\nmore appealing images while maintaining semantic alignment. Code is available\nat https://github.com/Mowenyii/PAE.\n","authors":["Wenyi Mo","Tianyu Zhang","Yalong Bai","Bing Su","Ji-Rong Wen","Qing Yang"],"pdf_url":"https://arxiv.org/pdf/2404.04095v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/1902.06634v4","updated":"2024-04-05T13:03:08Z","published":"2019-02-18T16:15:25Z","title":"Contextual Encoder-Decoder Network for Visual Saliency Prediction","summary":" Predicting salient regions in natural images requires the detection of\nobjects that are present in a scene. To develop robust representations for this\nchallenging task, high-level visual features at multiple spatial scales must be\nextracted and augmented with contextual information. However, existing models\naimed at explaining human fixation maps do not incorporate such a mechanism\nexplicitly. Here we propose an approach based on a convolutional neural network\npre-trained on a large-scale image classification task. The architecture forms\nan encoder-decoder structure and includes a module with multiple convolutional\nlayers at different dilation rates to capture multi-scale features in parallel.\nMoreover, we combine the resulting representations with global scene\ninformation for accurately predicting visual saliency. Our model achieves\ncompetitive and consistent results across multiple evaluation metrics on two\npublic saliency benchmarks and we demonstrate the effectiveness of the\nsuggested approach on five datasets and selected examples. Compared to state of\nthe art approaches, the network is based on a lightweight image classification\nbackbone and hence presents a suitable choice for applications with limited\ncomputational resources, such as (virtual) robotic systems, to estimate human\nfixations across complex natural scenes.\n","authors":["Alexander Kroner","Mario Senden","Kurt Driessens","Rainer Goebel"],"pdf_url":"https://arxiv.org/pdf/1902.06634v4.pdf","comment":"Updated contact information"},{"id":"http://arxiv.org/abs/2404.04072v1","updated":"2024-04-05T12:58:07Z","published":"2024-04-05T12:58:07Z","title":"Label Propagation for Zero-shot Classification with Vision-Language\n Models","summary":" Vision-Language Models (VLMs) have demonstrated impressive performance on\nzero-shot classification, i.e. classification when provided merely with a list\nof class names. In this paper, we tackle the case of zero-shot classification\nin the presence of unlabeled data. We leverage the graph structure of the\nunlabeled data and introduce ZLaP, a method based on label propagation (LP)\nthat utilizes geodesic distances for classification. We tailor LP to graphs\ncontaining both text and image features and further propose an efficient method\nfor performing inductive inference based on a dual solution and a\nsparsification step. We perform extensive experiments to evaluate the\neffectiveness of our method on 14 common datasets and show that ZLaP\noutperforms the latest related works. 
Code:\nhttps://github.com/vladan-stojnic/ZLaP\n","authors":["Vladan Stojnić","Yannis Kalantidis","Giorgos Tolias"],"pdf_url":"https://arxiv.org/pdf/2404.04072v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.09124v2","updated":"2024-04-05T12:56:31Z","published":"2024-03-14T06:16:21Z","title":"Single Domain Generalization for Crowd Counting","summary":" Due to its promising results, density map regression has been widely employed\nfor image-based crowd counting. The approach, however, often suffers from\nsevere performance degradation when tested on data from unseen scenarios, the\nso-called \"domain shift\" problem. To address the problem, we investigate in\nthis work single domain generalization (SDG) for crowd counting. The existing\nSDG approaches are mainly for image classification and segmentation, and can\nhardly be extended to our case due to its regression nature and label ambiguity\n(i.e., ambiguous pixel-level ground truths). We propose MPCount, a novel\neffective SDG approach even for narrow source distribution. MPCount stores\ndiverse density values for density map regression and reconstructs\ndomain-invariant features by means of only one memory bank, a content error\nmask and attention consistency loss. By partitioning the image into grids, it\nemploys patch-wise classification as an auxiliary task to mitigate label\nambiguity. Through extensive experiments on different datasets, MPCount is\nshown to significantly improve counting accuracy compared to the state of the\nart under diverse scenarios unobserved in the training data characterized by\nnarrow source distribution. Code is available at\nhttps://github.com/Shimmer93/MPCount.\n","authors":["Zhuoxuan Peng","S. -H. Gary Chan"],"pdf_url":"https://arxiv.org/pdf/2403.09124v2.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2402.19340v2","updated":"2024-04-05T12:49:38Z","published":"2024-02-29T16:46:49Z","title":"One model to use them all: Training a segmentation model with\n complementary datasets","summary":" Understanding a surgical scene is crucial for computer-assisted surgery\nsystems to provide any intelligent assistance functionality. One way of\nachieving this scene understanding is via scene segmentation, where every pixel\nof a frame is classified and therefore identifies the visible structures and\ntissues. Progress on fully segmenting surgical scenes has been made using\nmachine learning. However, such models require large amounts of annotated\ntraining data, containing examples of all relevant object classes. Such fully\nannotated datasets are hard to create, as every pixel in a frame needs to be\nannotated by medical experts and, therefore, are rarely available. In this\nwork, we propose a method to combine multiple partially annotated datasets,\nwhich provide complementary annotations, into one model, enabling better scene\nsegmentation and the use of multiple readily available datasets. Our method\naims to combine available data with complementary labels by leveraging mutual\nexclusive properties to maximize information. Specifically, we propose to use\npositive annotations of other classes as negative samples and to exclude\nbackground pixels of binary annotations, as we cannot tell if they contain a\nclass not annotated but predicted by the model. We evaluate our method by\ntraining a DeepLabV3 on the publicly available Dresden Surgical Anatomy\nDataset, which provides multiple subsets of binary segmented anatomical\nstructures. 
Our approach successfully combines 6 classes into one model,\nincreasing the overall Dice Score by 4.4% compared to an ensemble of models\ntrained on the classes individually. By including information on multiple\nclasses, we were able to reduce confusion between stomach and colon by 24%. Our\nresults demonstrate the feasibility of training a model on multiple datasets.\nThis paves the way for future work further alleviating the need for one large,\nfully segmented datasets.\n","authors":["Alexander C. Jenke","Sebastian Bodenstedt","Fiona R. Kolbinger","Marius Distler","Jürgen Weitz","Stefanie Speidel"],"pdf_url":"https://arxiv.org/pdf/2402.19340v2.pdf","comment":"Accepted at IPCAI 2024; submitted to IJCARS (under revision)"},{"id":"http://arxiv.org/abs/2404.03443v2","updated":"2024-04-05T12:44:39Z","published":"2024-04-04T13:43:11Z","title":"Part-Attention Based Model Make Occluded Person Re-Identification\n Stronger","summary":" The goal of occluded person re-identification (ReID) is to retrieve specific\npedestrians in occluded situations. However, occluded person ReID still suffers\nfrom background clutter and low-quality local feature representations, which\nlimits model performance. In our research, we introduce a new framework called\nPAB-ReID, which is a novel ReID model incorporating part-attention mechanisms\nto tackle the aforementioned issues effectively. Firstly, we introduce the\nhuman parsing label to guide the generation of more accurate human part\nattention maps. In addition, we propose a fine-grained feature focuser for\ngenerating fine-grained human local feature representations while suppressing\nbackground interference. Moreover, We also design a part triplet loss to\nsupervise the learning of human local features, which optimizes\nintra/inter-class distance. We conducted extensive experiments on specialized\nocclusion and regular ReID datasets, showcasing that our approach outperforms\nthe existing state-of-the-art methods.\n","authors":["Zhihao Chen","Yiyuan Ge"],"pdf_url":"https://arxiv.org/pdf/2404.03443v2.pdf","comment":"Accepted By International Joint Conference on Neural Networks 2024"},{"id":"http://arxiv.org/abs/2403.06546v2","updated":"2024-04-05T12:35:06Z","published":"2024-03-11T09:46:41Z","title":"OMH: Structured Sparsity via Optimally Matched Hierarchy for\n Unsupervised Semantic Segmentation","summary":" Unsupervised Semantic Segmentation (USS) involves segmenting images without\nrelying on predefined labels, aiming to alleviate the burden of extensive human\nlabeling. Existing methods utilize features generated by self-supervised models\nand specific priors for clustering. However, their clustering objectives are\nnot involved in the optimization of the features during training. Additionally,\ndue to the lack of clear class definitions in USS, the resulting segments may\nnot align well with the clustering objective. In this paper, we introduce a\nnovel approach called Optimally Matched Hierarchy (OMH) to simultaneously\naddress the above issues. The core of our method lies in imposing structured\nsparsity on the feature space, which allows the features to encode information\nwith different levels of granularity. The structure of this sparsity stems from\nour hierarchy (OMH). To achieve this, we learn a soft but sparse hierarchy\namong parallel clusters through Optimal Transport. Our OMH yields better\nunsupervised segmentation performance compared to existing USS methods. 
Our\nextensive experiments demonstrate the benefits of OMH when utilizing our\ndifferentiable paradigm. We will make our code publicly available.\n","authors":["Baran Ozaydin","Tong Zhang","Deblina Bhattacharjee","Sabine Süsstrunk","Mathieu Salzmann"],"pdf_url":"https://arxiv.org/pdf/2403.06546v2.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2404.04057v1","updated":"2024-04-05T12:30:19Z","published":"2024-04-05T12:30:19Z","title":"Score identity Distillation: Exponentially Fast Distillation of\n Pretrained Diffusion Models for One-Step Generation","summary":" We introduce Score identity Distillation (SiD), an innovative data-free\nmethod that distills the generative capabilities of pretrained diffusion models\ninto a single-step generator. SiD not only facilitates an exponentially fast\nreduction in Fr\\'echet inception distance (FID) during distillation but also\napproaches or even exceeds the FID performance of the original teacher\ndiffusion models. By reformulating forward diffusion processes as semi-implicit\ndistributions, we leverage three score-related identities to create an\ninnovative loss mechanism. This mechanism achieves rapid FID reduction by\ntraining the generator using its own synthesized images, eliminating the need\nfor real data or reverse-diffusion-based generation, all accomplished within\nsignificantly shortened generation time. Upon evaluation across four benchmark\ndatasets, the SiD algorithm demonstrates high iteration efficiency during\ndistillation and surpasses competing distillation approaches, whether they are\none-step or few-step, data-free, or dependent on training data, in terms of\ngeneration quality. This achievement not only redefines the benchmarks for\nefficiency and effectiveness in diffusion distillation but also in the broader\nfield of diffusion-based generation. Our PyTorch implementation will be\npublicly accessible on GitHub.\n","authors":["Mingyuan Zhou","Huangjie Zheng","Zhendong Wang","Mingzhang Yin","Hai Huang"],"pdf_url":"https://arxiv.org/pdf/2404.04057v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13377v2","updated":"2024-04-05T12:14:50Z","published":"2023-12-20T19:08:49Z","title":"SADA: Semantic adversarial unsupervised domain adaptation for Temporal\n Action Localization","summary":" Temporal Action Localization (TAL) is a complex task that poses relevant\nchallenges, particularly when attempting to generalize on new -- unseen --\ndomains in real-world applications. These scenarios, despite realistic, are\noften neglected in the literature, exposing these solutions to important\nperformance degradation. In this work, we tackle this issue by introducing, for\nthe first time, an approach for Unsupervised Domain Adaptation (UDA) in sparse\nTAL, which we refer to as Semantic Adversarial unsupervised Domain Adaptation\n(SADA). Our contributions are threefold: (1) we pioneer the development of a\ndomain adaptation model that operates on realistic sparse action detection\nbenchmarks; (2) we tackle the limitations of global-distribution alignment\ntechniques by introducing a novel adversarial loss that is sensitive to local\nclass distributions, ensuring finer-grained adaptation; and (3) we present a\nnovel set of benchmarks based on EpicKitchens100 and CharadesEgo, that evaluate\nmultiple domain shifts in a comprehensive manner. 
Our experiments indicate that\nSADA improves the adaptation across domains when compared to fully supervised\nstate-of-the-art and alternative UDA methods, attaining a performance boost of\nup to 6.14% mAP.\n","authors":["David Pujol-Perich","Albert Clapés","Sergio Escalera"],"pdf_url":"https://arxiv.org/pdf/2312.13377v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10344v2","updated":"2024-04-05T12:14:15Z","published":"2024-03-15T14:31:17Z","title":"SCILLA: SurfaCe Implicit Learning for Large Urban Area, a volumetric\n hybrid solution","summary":" Neural implicit surface representation methods have recently shown impressive\n3D reconstruction results. However, existing solutions struggle to reconstruct\nurban outdoor scenes due to their large, unbounded, and highly detailed nature.\nHence, to achieve accurate reconstructions, additional supervision data such as\nLiDAR, strong geometric priors, and long training times are required. To tackle\nsuch issues, we present SCILLA, a new hybrid implicit surface learning method\nto reconstruct large driving scenes from 2D images. SCILLA's hybrid\narchitecture models two separate implicit fields: one for the volumetric\ndensity and another for the signed distance to the surface. To accurately\nrepresent urban outdoor scenarios, we introduce a novel volume-rendering\nstrategy that relies on self-supervised probabilistic density estimation to\nsample points near the surface and transition progressively from volumetric to\nsurface representation. Our solution permits a proper and fast initialization\nof the signed distance field without relying on any geometric prior on the\nscene, compared to concurrent methods. By conducting extensive experiments on\nfour outdoor driving datasets, we show that SCILLA can learn an accurate and\ndetailed 3D surface scene representation in various urban scenarios while being\ntwo times faster to train compared to previous state-of-the-art solutions.\n","authors":["Hala Djeghim","Nathan Piasco","Moussab Bennehar","Luis Roldão","Dzmitry Tsishkou","Désiré Sidibé"],"pdf_url":"https://arxiv.org/pdf/2403.10344v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06587v2","updated":"2024-04-05T12:10:30Z","published":"2023-12-11T18:19:36Z","title":"QuickQuakeBuildings: Post-earthquake SAR-Optical Dataset for Quick\n Damaged-building Detection","summary":" Quick and automated earthquake-damaged building detection from post-event\nsatellite imagery is crucial, yet it is challenging due to the scarcity of\ntraining data required to develop robust algorithms. This letter presents the\nfirst dataset dedicated to detecting earthquake-damaged buildings from\npost-event very high resolution (VHR) Synthetic Aperture Radar (SAR) and\noptical imagery. Utilizing open satellite imagery and annotations acquired\nafter the 2023 Turkey-Syria earthquakes, we deliver a dataset of coregistered\nbuilding footprints and satellite image patches of both SAR and optical data,\nencompassing more than four thousand buildings. The task of damaged building\ndetection is formulated as a binary image classification problem, that can also\nbe treated as an anomaly detection problem due to extreme class imbalance. We\nprovide baseline methods and results to serve as references for comparison.\nResearchers can utilize this dataset to expedite algorithm development,\nfacilitating the rapid detection of damaged buildings in response to future\nevents. 
The dataset and codes together with detailed explanations and\nvisualization are made publicly available at\n\\url{https://github.com/ya0-sun/PostEQ-SARopt-BuildingDamage}.\n","authors":["Yao Sun","Yi Wang","Michael Eineder"],"pdf_url":"https://arxiv.org/pdf/2312.06587v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04050v1","updated":"2024-04-05T12:09:36Z","published":"2024-04-05T12:09:36Z","title":"No Time to Train: Empowering Non-Parametric Networks for Few-shot 3D\n Scene Segmentation","summary":" To reduce the reliance on large-scale datasets, recent works in 3D\nsegmentation resort to few-shot learning. Current 3D few-shot segmentation\nmethods first pre-train models on 'seen' classes, and then evaluate their\ngeneralization performance on 'unseen' classes. However, the prior pre-training\nstage not only introduces excessive time overhead but also incurs a significant\ndomain gap on 'unseen' classes. To tackle these issues, we propose a\nNon-parametric Network for few-shot 3D Segmentation, Seg-NN, and its Parametric\nvariant, Seg-PN. Without training, Seg-NN extracts dense representations by\nhand-crafted filters and achieves comparable performance to existing parametric\nmodels. Due to the elimination of pre-training, Seg-NN can alleviate the domain\ngap issue and save a substantial amount of time. Based on Seg-NN, Seg-PN only\nrequires training a lightweight QUEry-Support Transferring (QUEST) module,\nwhich enhances the interaction between the support set and query set.\nExperiments suggest that Seg-PN outperforms previous state-of-the-art method by\n+4.19% and +7.71% mIoU on S3DIS and ScanNet datasets respectively, while\nreducing training time by -90%, indicating its effectiveness and efficiency.\n","authors":["Xiangyang Zhu","Renrui Zhang","Bowei He","Ziyu Guo","Jiaming Liu","Han Xiao","Chaoyou Fu","Hao Dong","Peng Gao"],"pdf_url":"https://arxiv.org/pdf/2404.04050v1.pdf","comment":"CVPR Highlight. Code is available at\n https://github.com/yangyangyang127/Seg-NN. arXiv admin note: text overlap\n with arXiv:2308.12961"},{"id":"http://arxiv.org/abs/2404.04040v1","updated":"2024-04-05T11:49:29Z","published":"2024-04-05T11:49:29Z","title":"Dynamic Risk Assessment Methodology with an LDM-based System for Parking\n Scenarios","summary":" This paper describes the methodology for building a dynamic risk assessment\nfor ADAS (Advanced Driving Assistance Systems) algorithms in parking scenarios,\nfusing exterior and interior perception for a better understanding of the scene\nand a more comprehensive risk estimation. This includes the definition of a\ndynamic risk methodology that depends on the situation from inside and outside\nthe vehicle, the creation of a multi-sensor dataset of risk assessment for ADAS\nbenchmarking purposes, and a Local Dynamic Map (LDM) that fuses data from the\nexterior and interior of the car to build an LDM-based Dynamic Risk Assessment\nSystem (DRAS).\n","authors":["Paola Natalia Cañas","Mikel García","Nerea Aranjuelo","Marcos Nieto","Aitor Iglesias","Igor Rodríguez"],"pdf_url":"https://arxiv.org/pdf/2404.04040v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04037v1","updated":"2024-04-05T11:45:03Z","published":"2024-04-05T11:45:03Z","title":"InstructHumans: Editing Animated 3D Human Textures with Instructions","summary":" We present InstructHumans, a novel framework for instruction-driven 3D human\ntexture editing. Existing text-based editing methods use Score Distillation\nSampling (SDS) to distill guidance from generative models. 
This work shows that\nnaively using such scores is harmful to editing as they destroy consistency\nwith the source avatar. Instead, we propose an alternate SDS for Editing\n(SDS-E) that selectively incorporates subterms of SDS across diffusion\ntimesteps. We further enhance SDS-E with spatial smoothness regularization and\ngradient-based viewpoint sampling to achieve high-quality edits with sharp and\nhigh-fidelity detailing. InstructHumans significantly outperforms existing 3D\nediting methods, consistent with the initial avatar while faithful to the\ntextual instructions. Project page: https://jyzhu.top/instruct-humans .\n","authors":["Jiayin Zhu","Linlin Yang","Angela Yao"],"pdf_url":"https://arxiv.org/pdf/2404.04037v1.pdf","comment":"Project Page: https://jyzhu.top/instruct-humans"},{"id":"http://arxiv.org/abs/2312.00648v3","updated":"2024-04-05T11:31:12Z","published":"2023-12-01T15:20:58Z","title":"SPOT: Self-Training with Patch-Order Permutation for Object-Centric\n Learning with Autoregressive Transformers","summary":" Unsupervised object-centric learning aims to decompose scenes into\ninterpretable object entities, termed slots. Slot-based auto-encoders stand out\nas a prominent method for this task. Within them, crucial aspects include\nguiding the encoder to generate object-specific slots and ensuring the decoder\nutilizes them during reconstruction. This work introduces two novel techniques,\n(i) an attention-based self-training approach, which distills superior\nslot-based attention masks from the decoder to the encoder, enhancing object\nsegmentation, and (ii) an innovative patch-order permutation strategy for\nautoregressive transformers that strengthens the role of slot vectors in\nreconstruction. The effectiveness of these strategies is showcased\nexperimentally. The combined approach significantly surpasses prior slot-based\nautoencoder methods in unsupervised object segmentation, especially with\ncomplex real-world images. We provide the implementation code at\nhttps://github.com/gkakogeorgiou/spot .\n","authors":["Ioannis Kakogeorgiou","Spyros Gidaris","Konstantinos Karantzalos","Nikos Komodakis"],"pdf_url":"https://arxiv.org/pdf/2312.00648v3.pdf","comment":"CVPR 2024 (Highlight). Code: https://github.com/gkakogeorgiou/spot"},{"id":"http://arxiv.org/abs/2404.04026v1","updated":"2024-04-05T11:14:19Z","published":"2024-04-05T11:14:19Z","title":"MM-Gaussian: 3D Gaussian-based Multi-modal Fusion for Localization and\n Reconstruction in Unbounded Scenes","summary":" Localization and mapping are critical tasks for various applications such as\nautonomous vehicles and robotics. The challenges posed by outdoor environments\npresent particular complexities due to their unbounded characteristics. In this\nwork, we present MM-Gaussian, a LiDAR-camera multi-modal fusion system for\nlocalization and mapping in unbounded scenes. Our approach is inspired by the\nrecently developed 3D Gaussians, which demonstrate remarkable capabilities in\nachieving high rendering quality and fast rendering speed. Specifically, our\nsystem fully utilizes the geometric structure information provided by\nsolid-state LiDAR to address the problem of inaccurate depth encountered when\nrelying solely on visual solutions in unbounded, outdoor scenarios.\nAdditionally, we utilize 3D Gaussian point clouds, with the assistance of\npixel-level gradient descent, to fully exploit the color information in photos,\nthereby achieving realistic rendering effects. 
To further bolster the\nrobustness of our system, we designed a relocalization module, which assists in\nreturning to the correct trajectory in the event of a localization failure.\nExperiments conducted in multiple scenarios demonstrate the effectiveness of\nour method.\n","authors":["Chenyang Wu","Yifan Duan","Xinran Zhang","Yu Sheng","Jianmin Ji","Yanyong Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.04026v1.pdf","comment":"7 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.04025v1","updated":"2024-04-05T11:13:59Z","published":"2024-04-05T11:13:59Z","title":"Framework to generate perfusion map from CT and CTA images in patients\n with acute ischemic stroke: A longitudinal and cross-sectional study","summary":" Stroke is a leading cause of disability and death. Effective treatment\ndecisions require early and informative vascular imaging. 4D perfusion imaging\nis ideal but rarely available within the first hour after stroke, whereas plain\nCT and CTA usually are. Hence, we propose a framework to extract a predicted\nperfusion map (PPM) derived from CT and CTA images. In all eighteen patients,\nwe found significantly high spatial similarity (with average Spearman's\ncorrelation = 0.7893) between our predicted perfusion map (PPM) and the T-max\nmap derived from 4D-CTP. Voxelwise correlations between the PPM and National\nInstitutes of Health Stroke Scale (NIHSS) subscores for L/R hand motor, gaze,\nand language on a large cohort of 2,110 subjects reliably mapped symptoms to\nexpected infarct locations. Therefore our PPM could serve as an alternative for\n4D perfusion imaging, if the latter is unavailable, to investigate blood\nperfusion in the first hours after hospital admission.\n","authors":["Chayanin Tangwiriyasakul","Pedro Borges","Stefano Moriconi","Paul Wright","Yee-Haur Mah","James Teo","Parashkev Nachev","Sebastien Ourselin","M. Jorge Cardoso"],"pdf_url":"https://arxiv.org/pdf/2404.04025v1.pdf","comment":"Accepted and presented in SWITCH2023: Stroke Workshop on Imaging and\n Treatment CHallenges (MICCAI 2023, Vancouver Canada)"},{"id":"http://arxiv.org/abs/2404.04007v1","updated":"2024-04-05T10:30:38Z","published":"2024-04-05T10:30:38Z","title":"Neural-Symbolic VideoQA: Learning Compositional Spatio-Temporal\n Reasoning for Real-world Video Question Answering","summary":" Compositional spatio-temporal reasoning poses a significant challenge in the\nfield of video question answering (VideoQA). Existing approaches struggle to\nestablish effective symbolic reasoning structures, which are crucial for\nanswering compositional spatio-temporal questions. To address this challenge,\nwe propose a neural-symbolic framework called Neural-Symbolic VideoQA\n(NS-VideoQA), specifically designed for real-world VideoQA tasks. The\nuniqueness and superiority of NS-VideoQA are two-fold: 1) It proposes a Scene\nParser Network (SPN) to transform static-dynamic video scenes into Symbolic\nRepresentation (SR), structuralizing persons, objects, relations, and action\nchronologies. 2) A Symbolic Reasoning Machine (SRM) is designed for top-down\nquestion decompositions and bottom-up compositional reasonings. Specifically, a\npolymorphic program executor is constructed for internally consistent reasoning\nfrom SR to the final answer. 
As a result, Our NS-VideoQA not only improves the\ncompositional spatio-temporal reasoning in real-world VideoQA task, but also\nenables step-by-step error analysis by tracing the intermediate results.\nExperimental evaluations on the AGQA Decomp benchmark demonstrate the\neffectiveness of the proposed NS-VideoQA framework. Empirical studies further\nconfirm that NS-VideoQA exhibits internal consistency in answering\ncompositional questions and significantly improves the capability of\nspatio-temporal and logical inference for VideoQA tasks.\n","authors":["Lili Liang","Guanglu Sun","Jin Qiu","Lizhong Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.04007v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20106v2","updated":"2024-04-05T10:29:00Z","published":"2024-03-29T10:40:41Z","title":"Learning Enriched Features via Selective State Spaces Model for\n Efficient Image Deblurring","summary":" Image deblurring aims to restore a high-quality image from its corresponding\nblurred. The emergence of CNNs and Transformers has enabled significant\nprogress. However, these methods often face the dilemma between eliminating\nlong-range degradation perturbations and maintaining computational efficiency.\nWhile the selective state space model (SSM) shows promise in modeling\nlong-range dependencies with linear complexity, it also encounters challenges\nsuch as local pixel forgetting and channel redundancy. To address this issue,\nwe propose an efficient image deblurring network that leverages selective state\nspaces model to aggregate enriched and accurate features. Specifically, we\nintroduce an aggregate local and global information block (ALGBlock) designed\nto effectively capture and integrate both local invariant properties and\nnon-local information. The ALGBlock comprises two primary modules: a module for\ncapturing local and global features (CLGF), and a feature aggregation module\n(FA). The CLGF module is composed of two branches: the global branch captures\nlong-range dependency features via a selective state spaces model, while the\nlocal branch employs simplified channel attention to model local connectivity,\nthereby reducing local pixel forgetting and channel redundancy. In addition, we\ndesign a FA module to accentuate the local part by recalibrating the weight\nduring the aggregation of the two branches for restoration. Experimental\nresults demonstrate that the proposed method outperforms state-of-the-art\napproaches on widely used benchmarks.\n","authors":["Hu Gao","Depeng Dang"],"pdf_url":"https://arxiv.org/pdf/2403.20106v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03999v1","updated":"2024-04-05T10:23:20Z","published":"2024-04-05T10:23:20Z","title":"Finsler-Laplace-Beltrami Operators with Application to Shape Analysis","summary":" The Laplace-Beltrami operator (LBO) emerges from studying manifolds equipped\nwith a Riemannian metric. It is often called the Swiss army knife of geometry\nprocessing as it allows to capture intrinsic shape information and gives rise\nto heat diffusion, geodesic distances, and a multitude of shape descriptors. It\nalso plays a central role in geometric deep learning. In this work, we explore\nFinsler manifolds as a generalization of Riemannian manifolds. We revisit the\nFinsler heat equation and derive a Finsler heat kernel and a\nFinsler-Laplace-Beltrami Operator (FLBO): a novel theoretically justified\nanisotropic Laplace-Beltrami operator (ALBO). 
In experimental evaluations we\ndemonstrate that the proposed FLBO is a valuable alternative to the traditional\nRiemannian-based LBO and ALBOs for spatial filtering and shape correspondence\nestimation. We hope that the proposed Finsler heat kernel and the FLBO will\ninspire further exploration of Finsler geometry in the computer vision\ncommunity.\n","authors":["Simon Weber","Thomas Dagès","Maolin Gao","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2404.03999v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03998v1","updated":"2024-04-05T10:23:10Z","published":"2024-04-05T10:23:10Z","title":"Physics-Inspired Synthesized Underwater Image Dataset","summary":" This paper introduces the physics-inspired synthesized underwater image\ndataset (PHISWID), a dataset tailored for enhancing underwater image processing\nthrough physics-inspired image synthesis. Deep learning approaches to\nunderwater image enhancement typically demand extensive datasets, yet acquiring\npaired clean and degraded underwater ones poses significant challenges. While\nseveral underwater image datasets have been proposed using physics-based\nsynthesis, a publicly accessible collection has been lacking. Additionally,\nmost underwater image synthesis approaches do not intend to reproduce\natmospheric scenes, resulting in incomplete enhancement. PHISWID addresses this\ngap by offering a set of paired ground-truth (atmospheric) and synthetically\ndegraded underwater images, showcasing not only color degradation but also the\noften-neglected effects of marine snow, a composite of organic matter and sand\nparticles that considerably impairs underwater image clarity. The dataset\napplies these degradations to atmospheric RGB-D images, enhancing the dataset's\nrealism and applicability. PHISWID is particularly valuable for training deep\nneural networks in a supervised learning setting and for objectively assessing\nimage quality in benchmark analyses. Our results reveal that even a basic U-Net\narchitecture, when trained with PHISWID, substantially outperforms existing\nmethods in underwater image enhancement. We intend to release PHISWID publicly,\ncontributing a significant resource to the advancement of underwater imaging\ntechnology.\n","authors":["Reina Kaneko","Hiroshi Higashi","Yuichi Tanaka"],"pdf_url":"https://arxiv.org/pdf/2404.03998v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.01734v2","updated":"2024-04-05T10:11:27Z","published":"2023-11-03T06:05:36Z","title":"Sculpting Holistic 3D Representation in Contrastive Language-Image-3D\n Pre-training","summary":" Contrastive learning has emerged as a promising paradigm for 3D open-world\nunderstanding, i.e., aligning point cloud representation to image and text\nembedding space individually. In this paper, we introduce MixCon3D, a simple\nyet effective method aiming to sculpt holistic 3D representation in contrastive\nlanguage-image-3D pre-training. In contrast to point cloud only, we develop the\n3D object-level representation from complementary perspectives, e.g.,\nmulti-view rendered images with the point cloud. Then, MixCon3D performs\nlanguage-3D contrastive learning, comprehensively depicting real-world 3D\nobjects and bolstering text alignment. 
Additionally, we pioneer the first\nthorough investigation of various training recipes for the 3D contrastive\nlearning paradigm, building a solid baseline with improved performance.\nExtensive experiments conducted on three representative benchmarks reveal that\nour method significantly improves over the baseline, surpassing the previous\nstate-of-the-art performance on the challenging 1,156-category Objaverse-LVIS\ndataset by 5.7%. The versatility of MixCon3D is showcased in applications such\nas text-to-3D retrieval and point cloud captioning, further evidencing its\nefficacy in diverse scenarios. The code is available at\nhttps://github.com/UCSC-VLAA/MixCon3D.\n","authors":["Yipeng Gao","Zeyu Wang","Wei-Shi Zheng","Cihang Xie","Yuyin Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.01734v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2310.17170v3","updated":"2024-04-05T10:07:24Z","published":"2023-10-26T05:49:44Z","title":"MO-YOLO: End-to-End Multiple-Object Tracking Method with YOLO and\n Decoder","summary":" In the field of multi-object tracking (MOT), recent Transformer based\nend-to-end models like MOTR have demonstrated exceptional performance on\ndatasets such as DanceTracker. However, the computational demands of these\nmodels present challenges in training and deployment. Drawing inspiration from\nsuccessful models like GPT, we present MO-YOLO, an efficient and\ncomputationally frugal end-to-end MOT model. MO-YOLO integrates principles from\nYou Only Look Once (YOLO) and RT-DETR, adopting a decoder-only approach. By\nleveraging the decoder from RT-DETR and architectural components from YOLOv8,\nMO-YOLO achieves high speed, shorter training times, and proficient MOT\nperformance. On the Dancetrack, MO-YOLO not only matches MOTR's performance but\nalso surpasses it, achieving over twice the frames per second (MOTR 9.5 FPS,\nMO-YOLO 19.6 FPS). Furthermore, MO-YOLO demonstrates significantly reduced\ntraining times and lower hardware requirements compared to MOTR. This research\nintroduces a promising paradigm for efficient end-to-end MOT, emphasizing\nenhanced performance and resource efficiency.\n","authors":["Liao Pan","Yang Feng","Wu Di","Liu Bo","Zhang Xingle"],"pdf_url":"https://arxiv.org/pdf/2310.17170v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03992v1","updated":"2024-04-05T10:02:32Z","published":"2024-04-05T10:02:32Z","title":"Rolling the dice for better deep learning performance: A study of\n randomness techniques in deep neural networks","summary":" This paper investigates how various randomization techniques impact Deep\nNeural Networks (DNNs). Randomization, like weight noise and dropout, aids in\nreducing overfitting and enhancing generalization, but their interactions are\npoorly understood. The study categorizes randomness techniques into four types\nand proposes new methods: adding noise to the loss function and random masking\nof gradient updates. Using Particle Swarm Optimizer (PSO) for hyperparameter\noptimization, it explores optimal configurations across MNIST, FASHION-MNIST,\nCIFAR10, and CIFAR100 datasets. Over 30,000 configurations are evaluated,\nrevealing data augmentation and weight initialization randomness as main\nperformance contributors. Correlation analysis shows different optimizers\nprefer distinct randomization types. 
The complete implementation and dataset\nare available on GitHub.\n","authors":["Mohammed Ghaith Altarabichi","Sławomir Nowaczyk","Sepideh Pashami","Peyman Sheikholharam Mashhadi","Julia Handl"],"pdf_url":"https://arxiv.org/pdf/2404.03992v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03991v1","updated":"2024-04-05T10:01:31Z","published":"2024-04-05T10:01:31Z","title":"Towards Efficient and Accurate CT Segmentation via Edge-Preserving\n Probabilistic Downsampling","summary":" Downsampling images and labels, often necessitated by limited resources or to\nexpedite network training, leads to the loss of small objects and thin\nboundaries. This undermines the segmentation network's capacity to interpret\nimages accurately and predict detailed labels, resulting in diminished\nperformance compared to processing at original resolutions. This situation\nexemplifies the trade-off between efficiency and accuracy, with higher\ndownsampling factors further impairing segmentation outcomes. Preserving\ninformation during downsampling is especially critical for medical image\nsegmentation tasks. To tackle this challenge, we introduce a novel method named\nEdge-preserving Probabilistic Downsampling (EPD). It utilizes class uncertainty\nwithin a local window to produce soft labels, with the window size dictating\nthe downsampling factor. This enables a network to produce quality predictions\nat low resolutions. Beyond preserving edge details more effectively than\nconventional nearest-neighbor downsampling, employing a similar algorithm for\nimages, it surpasses bilinear interpolation in image downsampling, enhancing\noverall performance. Our method significantly improved Intersection over Union\n(IoU) to 2.85%, 8.65%, and 11.89% when downsampling data to 1/2, 1/4, and 1/8,\nrespectively, compared to conventional interpolation methods.\n","authors":["Shahzad Ali","Yu Rim Lee","Soo Young Park","Won Young Tak","Soon Ki Jung"],"pdf_url":"https://arxiv.org/pdf/2404.03991v1.pdf","comment":"5 pages (4 figures, 1 table); This work has been submitted to the\n IEEE Signal Processing Letters. Copyright may be transferred without notice,\n after which this version may no longer be accessible"},{"id":"http://arxiv.org/abs/2403.19655v2","updated":"2024-04-05T09:35:37Z","published":"2024-03-28T17:59:50Z","title":"GaussianCube: Structuring Gaussian Splatting using Optimal Transport for\n 3D Generative Modeling","summary":" 3D Gaussian Splatting (GS) have achieved considerable improvement over Neural\nRadiance Fields in terms of 3D fitting fidelity and rendering speed. However,\nthis unstructured representation with scattered Gaussians poses a significant\nchallenge for generative modeling. To address the problem, we introduce\nGaussianCube, a structured GS representation that is both powerful and\nefficient for generative modeling. We achieve this by first proposing a\nmodified densification-constrained GS fitting algorithm which can yield\nhigh-quality fitting results using a fixed number of free Gaussians, and then\nre-arranging the Gaussians into a predefined voxel grid via Optimal Transport.\nThe structured grid representation allows us to use standard 3D U-Net as our\nbackbone in diffusion generative modeling without elaborate designs. 
Extensive\nexperiments conducted on ShapeNet and OmniObject3D show that our model achieves\nstate-of-the-art generation results both qualitatively and quantitatively,\nunderscoring the potential of GaussianCube as a powerful and versatile 3D\nrepresentation.\n","authors":["Bowen Zhang","Yiji Cheng","Jiaolong Yang","Chunyu Wang","Feng Zhao","Yansong Tang","Dong Chen","Baining Guo"],"pdf_url":"https://arxiv.org/pdf/2403.19655v2.pdf","comment":"Fix typo in Eq.2; Project Page: https://gaussiancube.github.io/"},{"id":"http://arxiv.org/abs/2402.12891v2","updated":"2024-04-05T09:26:07Z","published":"2024-02-20T10:35:51Z","title":"Mind the Exit Pupil Gap: Revisiting the Intrinsics of a Standard\n Plenoptic Camera","summary":" Among the common applications of plenoptic cameras are depth reconstruction\nand post-shot refocusing. These require a calibration relating the camera-side\nlight field to that of the scene. Numerous methods with this goal have been\ndeveloped based on thin lens models for the plenoptic camera's main lens and\nmicrolenses. Our work addresses the often-overlooked role of the main lens exit\npupil in these models and specifically in the decoding process of standard\nplenoptic camera (SPC) images. We formally deduce the connection between the\nrefocusing distance and the resampling parameter for the decoded light field\nand provide an analysis of the errors that arise when the exit pupil is not\nconsidered. In addition, previous work is revisited with respect to the exit\npupil's role and all theoretical results are validated through a\nray-tracing-based simulation. With the public release of the evaluated SPC\ndesigns alongside our simulation and experimental data we aim to contribute to\na more accurate and nuanced understanding of plenoptic camera optics.\n","authors":["Tim Michels","Daniel Mäckelmann","Reinhard Koch"],"pdf_url":"https://arxiv.org/pdf/2402.12891v2.pdf","comment":"29 pages, 16 figures, Accepted for publication in MDPI Sensors,\n Special Issue 'Short-Range Optical 3D Scanning and 3D Data Processing '"},{"id":"http://arxiv.org/abs/2111.05778v2","updated":"2024-04-05T09:19:41Z","published":"2021-11-10T16:31:27Z","title":"Theoretical and Empirical Analysis of a Fast Algorithm for Extracting\n Polygons from Signed Distance Bounds","summary":" Recently there has been renewed interest in signed distance bound\nrepresentations due to their unique properties for 3D shape modelling. This is\nespecially the case for deep learning-based bounds. However, it is beneficial\nto work with polygons in most computer-graphics applications. Thus, in this\npaper we introduce and investigate an asymptotically fast method for\ntransforming signed distance bounds into polygon meshes. This is achieved by\ncombining the principles of sphere tracing (or ray marching) with traditional\npolygonization techniques, such as Marching Cubes. We provide theoretical and\nexperimental evidence that this approach is of the $O(N^2\\log N)$ computational\ncomplexity for a polygonization grid with $N^3$ cells. 
The algorithm is tested\non both a set of primitive shapes as well as signed distance bounds generated\nfrom point clouds by machine learning (and represented as neural networks).\nGiven its speed, implementation simplicity and portability, we argue that it\ncould prove useful during the modelling stage as well as in shape compression\nfor storage.\n The code is available here: https://github.com/nenadmarkus/gridhopping\n","authors":["Nenad Markuš","Mirko Sužnjević"],"pdf_url":"https://arxiv.org/pdf/2111.05778v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04868v3","updated":"2024-04-05T09:03:48Z","published":"2023-08-09T11:02:00Z","title":"InstantAvatar: Efficient 3D Head Reconstruction via Surface Rendering","summary":" Recent advances in full-head reconstruction have been obtained by optimizing\na neural field through differentiable surface or volume rendering to represent\na single scene. While these techniques achieve an unprecedented accuracy, they\ntake several minutes, or even hours, due to the expensive optimization process\nrequired. In this work, we introduce InstantAvatar, a method that recovers\nfull-head avatars from few images (down to just one) in a few seconds on\ncommodity hardware. In order to speed up the reconstruction process, we propose\na system that combines, for the first time, a voxel-grid neural field\nrepresentation with a surface renderer. Notably, a naive combination of these\ntwo techniques leads to unstable optimizations that do not converge to valid\nsolutions. In order to overcome this limitation, we present a novel statistical\nmodel that learns a prior distribution over 3D head signed distance functions\nusing a voxel-grid based architecture. The use of this prior model, in\ncombination with other design choices, results into a system that achieves 3D\nhead reconstructions with comparable accuracy as the state-of-the-art with a\n100x speed-up.\n","authors":["Antonio Canela","Pol Caselles","Ibrar Malik","Eduard Ramon","Jaime García","Jordi Sánchez-Riera","Gil Triginer","Francesc Moreno-Noguer"],"pdf_url":"https://arxiv.org/pdf/2308.04868v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.00434v2","updated":"2024-04-05T08:53:28Z","published":"2023-04-30T09:28:38Z","title":"EVREAL: Towards a Comprehensive Benchmark and Analysis Suite for\n Event-based Video Reconstruction","summary":" Event cameras are a new type of vision sensor that incorporates asynchronous\nand independent pixels, offering advantages over traditional frame-based\ncameras such as high dynamic range and minimal motion blur. However, their\noutput is not easily understandable by humans, making the reconstruction of\nintensity images from event streams a fundamental task in event-based vision.\nWhile recent deep learning-based methods have shown promise in video\nreconstruction from events, this problem is not completely solved yet. To\nfacilitate comparison between different approaches, standardized evaluation\nprotocols and diverse test datasets are essential. This paper proposes a\nunified evaluation methodology and introduces an open-source framework called\nEVREAL to comprehensively benchmark and analyze various event-based video\nreconstruction methods from the literature. 
Using EVREAL, we give a detailed\nanalysis of the state-of-the-art methods for event-based video reconstruction,\nand provide valuable insights into the performance of these methods under\nvarying settings, challenging scenarios, and downstream tasks.\n","authors":["Burak Ercan","Onur Eker","Aykut Erdem","Erkut Erdem"],"pdf_url":"https://arxiv.org/pdf/2305.00434v2.pdf","comment":"19 pages, 9 figures. Has been accepted for publication at the IEEE\n Conference on Computer Vision and Pattern Recognition Workshops (CVPRW),\n Vancouver, 2023. The project page can be found at\n https://ercanburak.github.io/evreal.html"},{"id":"http://arxiv.org/abs/2404.03962v1","updated":"2024-04-05T08:52:32Z","published":"2024-04-05T08:52:32Z","title":"RaSim: A Range-aware High-fidelity RGB-D Data Simulation Pipeline for\n Real-world Applications","summary":" In robotic vision, a de-facto paradigm is to learn in simulated environments\nand then transfer to real-world applications, which poses an essential\nchallenge in bridging the sim-to-real domain gap. While mainstream works tackle\nthis problem in the RGB domain, we focus on depth data synthesis and develop a\nrange-aware RGB-D data simulation pipeline (RaSim). In particular,\nhigh-fidelity depth data is generated by imitating the imaging principle of\nreal-world sensors. A range-aware rendering strategy is further introduced to\nenrich data diversity. Extensive experiments show that models trained with\nRaSim can be directly applied to real-world scenarios without any finetuning\nand excel at downstream RGB-D perception tasks.\n","authors":["Xingyu Liu","Chenyangguang Zhang","Gu Wang","Ruida Zhang","Xiangyang Ji"],"pdf_url":"https://arxiv.org/pdf/2404.03962v1.pdf","comment":"accepted by ICRA'24"},{"id":"http://arxiv.org/abs/2403.01300v2","updated":"2024-04-05T08:42:02Z","published":"2024-03-02T19:54:53Z","title":"Causal Mode Multiplexer: A Novel Framework for Unbiased Multispectral\n Pedestrian Detection","summary":" RGBT multispectral pedestrian detection has emerged as a promising solution\nfor safety-critical applications that require day/night operations. However,\nthe modality bias problem remains unsolved as multispectral pedestrian\ndetectors learn the statistical bias in datasets. Specifically, datasets in\nmultispectral pedestrian detection mainly distribute between ROTO (day) and\nRXTO (night) data; the majority of the pedestrian labels statistically co-occur\nwith their thermal features. As a result, multispectral pedestrian detectors\nshow poor generalization ability on examples beyond this statistical\ncorrelation, such as ROTX data. To address this problem, we propose a novel\nCausal Mode Multiplexer (CMM) framework that effectively learns the causalities\nbetween multispectral inputs and predictions. Moreover, we construct a new\ndataset (ROTX-MP) to evaluate modality bias in multispectral pedestrian\ndetection. ROTX-MP mainly includes ROTX examples not presented in previous\ndatasets. Extensive experiments demonstrate that our proposed CMM framework\ngeneralizes well on existing datasets (KAIST, CVC-14, FLIR) and the new\nROTX-MP. 
We will release our new dataset to the public for future research.\n","authors":["Taeheon Kim","Sebin Shin","Youngjoon Yu","Hak Gu Kim","Yong Man Ro"],"pdf_url":"https://arxiv.org/pdf/2403.01300v2.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2401.01598v2","updated":"2024-04-05T08:23:29Z","published":"2024-01-03T07:59:17Z","title":"Learning Prompt with Distribution-Based Feature Replay for Few-Shot\n Class-Incremental Learning","summary":" Few-shot Class-Incremental Learning (FSCIL) aims to continuously learn new\nclasses based on very limited training data without forgetting the old ones\nencountered. Existing studies solely relied on pure visual networks, while in\nthis paper we solved FSCIL by leveraging the Vision-Language model (e.g., CLIP)\nand propose a simple yet effective framework, named Learning Prompt with\nDistribution-based Feature Replay (LP-DiF). We observe that simply using CLIP\nfor zero-shot evaluation can substantially outperform the most influential\nmethods. Then, prompt tuning technique is involved to further improve its\nadaptation ability, allowing the model to continually capture specific\nknowledge from each session. To prevent the learnable prompt from forgetting\nold knowledge in the new session, we propose a pseudo-feature replay approach.\nSpecifically, we preserve the old knowledge of each class by maintaining a\nfeature-level Gaussian distribution with a diagonal covariance matrix, which is\nestimated by the image features of training images and synthesized features\ngenerated from a VAE. When progressing to a new session, pseudo-features are\nsampled from old-class distributions combined with training images of the\ncurrent session to optimize the prompt, thus enabling the model to learn new\nknowledge while retaining old knowledge. Experiments on three prevalent\nbenchmarks, i.e., CIFAR100, mini-ImageNet, CUB-200, and two more challenging\nbenchmarks, i.e., SUN-397 and CUB-200$^*$ proposed in this paper showcase the\nsuperiority of LP-DiF, achieving new state-of-the-art (SOTA) in FSCIL. Code is\npublicly available at https://github.com/1170300714/LP-DiF.\n","authors":["Zitong Huang","Ze Chen","Zhixing Chen","Erjin Zhou","Xinxing Xu","Rick Siow Mong Goh","Yong Liu","Wangmeng Zuo","Chunmei Feng"],"pdf_url":"https://arxiv.org/pdf/2401.01598v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10835v4","updated":"2024-04-05T08:20:32Z","published":"2023-12-17T22:40:38Z","title":"Your Student is Better Than Expected: Adaptive Teacher-Student\n Collaboration for Text-Conditional Diffusion Models","summary":" Knowledge distillation methods have recently shown to be a promising\ndirection to speedup the synthesis of large-scale diffusion models by requiring\nonly a few inference steps. While several powerful distillation methods were\nrecently proposed, the overall quality of student samples is typically lower\ncompared to the teacher ones, which hinders their practical usage. In this\nwork, we investigate the relative quality of samples produced by the teacher\ntext-to-image diffusion model and its distilled student version. As our main\nempirical finding, we discover that a noticeable portion of student samples\nexhibit superior fidelity compared to the teacher ones, despite the\n\"approximate\" nature of the student. Based on this finding, we propose an\nadaptive collaboration between student and teacher diffusion models for\neffective text-to-image synthesis. 
Specifically, the distilled model produces\nthe initial sample, and then an oracle decides whether it needs further\nimprovements with a slow teacher model. Extensive experiments demonstrate that\nthe designed pipeline surpasses state-of-the-art text-to-image alternatives for\nvarious inference budgets in terms of human preference. Furthermore, the\nproposed approach can be naturally used in popular applications such as\ntext-guided image editing and controllable generation.\n","authors":["Nikita Starodubcev","Artem Fedorov","Artem Babenko","Dmitry Baranchuk"],"pdf_url":"https://arxiv.org/pdf/2312.10835v4.pdf","comment":"CVPR2024 camera ready v2"},{"id":"http://arxiv.org/abs/2404.03936v1","updated":"2024-04-05T07:44:17Z","published":"2024-04-05T07:44:17Z","title":"Deep Learning for Satellite Image Time Series Analysis: A Review","summary":" Earth observation (EO) satellite missions have been providing detailed images\nabout the state of the Earth and its land cover for over 50 years. Long term\nmissions, such as NASA's Landsat, Terra, and Aqua satellites, and more\nrecently, the ESA's Sentinel missions, record images of the entire world every\nfew days. Although single images provide point-in-time data, repeated images of\nthe same area, or satellite image time series (SITS) provide information about\nthe changing state of vegetation and land use. These SITS are useful for\nmodeling dynamic processes and seasonal changes such as plant phenology. They\nhave potential benefits for many aspects of land and natural resource\nmanagement, including applications in agricultural, forest, water, and disaster\nmanagement, urban planning, and mining. However, the resulting satellite image\ntime series (SITS) are complex, incorporating information from the temporal,\nspatial, and spectral dimensions. Therefore, deep learning methods are often\ndeployed as they can analyze these complex relationships. This review presents\na summary of the state-of-the-art methods of modelling environmental,\nagricultural, and other Earth observation variables from SITS data using deep\nlearning methods. We aim to provide a resource for remote sensing experts\ninterested in using deep learning techniques to enhance Earth observation\nmodels with temporal information.\n","authors":["Lynn Miller","Charlotte Pelletier","Geoffrey I. Webb"],"pdf_url":"https://arxiv.org/pdf/2404.03936v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2404.03930v1","updated":"2024-04-05T07:24:10Z","published":"2024-04-05T07:24:10Z","title":"Real-GDSR: Real-World Guided DSM Super-Resolution via Edge-Enhancing\n Residual Network","summary":" A low-resolution digital surface model (DSM) features distinctive attributes\nimpacted by noise, sensor limitations and data acquisition conditions, which\nfail to be replicated using simple interpolation methods like bicubic. As a\nresult, super-resolution models trained on synthetic data do not perform\neffectively on real ones. Training a model on real low- and high-resolution DSM\npairs is also a challenge because of the lack of information. On the other\nhand, the existence of other imaging modalities of the same scene can be used\nto enrich the information needed for large-scale super-resolution.
In this\nwork, we introduce a novel methodology to address the intricacies of real-world\nDSM super-resolution, named REAL-GDSR, breaking down this ill-posed problem\ninto two steps. The first step involves the utilization of a residual local\nrefinement network. This strategic approach departs from conventional methods\nthat are trained to directly predict height values instead of the differences\n(residuals) and that utilize large receptive fields in their networks. The second\nstep introduces a diffusion-based technique that enhances the results on a\nglobal scale, with a primary focus on smoothing and edge preservation. Our\nexperiments underscore the effectiveness of the proposed method. We conduct a\ncomprehensive evaluation, comparing it to recent state-of-the-art techniques in\nthe domain of real-world DSM super-resolution (SR). Our approach consistently\noutperforms these existing methods, as evidenced through qualitative and\nquantitative assessments.\n","authors":["Daniel Panangian","Ksenia Bittner"],"pdf_url":"https://arxiv.org/pdf/2404.03930v1.pdf","comment":"Accepted for publication in the ISPRS Annals of Photogrammetry,\n Remote Sensing, and Spatial Information Sciences"},{"id":"http://arxiv.org/abs/2404.03925v1","updated":"2024-04-05T07:15:06Z","published":"2024-04-05T07:15:06Z","title":"LightOctree: Lightweight 3D Spatially-Coherent Indoor Lighting\n Estimation","summary":" We present a lightweight solution for estimating spatially-coherent indoor\nlighting from a single RGB image. Previous methods for estimating illumination\nusing volumetric representations have overlooked the sparse distribution of\nlight sources in space, necessitating substantial memory and computational\nresources for achieving high-quality results. We introduce a unified, voxel\noctree-based illumination estimation framework to produce 3D spatially-coherent\nlighting. Additionally, a differentiable voxel octree cone tracing rendering\nlayer is proposed to eliminate regular volumetric representation throughout the\nentire process and ensure the retention of features across different frequency\ndomains. This reduction significantly decreases spatial usage and required\nfloating-point operations without substantially compromising precision.\nExperimental results demonstrate that our approach achieves high-quality\ncoherent estimation with minimal cost compared to previous methods.\n","authors":["Xuecan Wang","Shibang Xiao","Xiaohui Liang"],"pdf_url":"https://arxiv.org/pdf/2404.03925v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03924v1","updated":"2024-04-05T07:13:28Z","published":"2024-04-05T07:13:28Z","title":"Learning Correlation Structures for Vision Transformers","summary":" We introduce a new attention mechanism, dubbed structural self-attention\n(StructSA), that leverages rich correlation patterns naturally emerging in\nkey-query interactions of attention. StructSA generates attention maps by\nrecognizing space-time structures of key-query correlations via convolution and\nuses them to dynamically aggregate local contexts of value features. This\neffectively leverages rich structural patterns in images and videos such as\nscene layouts, object motion, and inter-object relations.
Using StructSA as a\nmain building block, we develop the structural vision transformer (StructViT)\nand evaluate its effectiveness on both image and video classification tasks,\nachieving state-of-the-art results on ImageNet-1K, Kinetics-400,\nSomething-Something V1 & V2, Diving-48, and FineGym.\n","authors":["Manjin Kim","Paul Hongsuck Seo","Cordelia Schmid","Minsu Cho"],"pdf_url":"https://arxiv.org/pdf/2404.03924v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2401.02723v2","updated":"2024-04-05T07:12:16Z","published":"2024-01-05T09:36:42Z","title":"Predicting Traffic Flow with Federated Learning and Graph Neural with\n Asynchronous Computations Network","summary":" Real-time traffic flow prediction holds significant importance within the\ndomain of Intelligent Transportation Systems (ITS). The task of achieving a\nbalance between prediction precision and computational efficiency presents a\nsignificant challenge. In this article, we present a novel deep-learning method\ncalled Federated Learning and Asynchronous Graph Convolutional Network\n(FLAGCN). Our framework incorporates the principles of asynchronous graph\nconvolutional networks with federated learning to enhance the accuracy and\nefficiency of real-time traffic flow prediction. The FLAGCN model employs a\nspatial-temporal graph convolution technique to asynchronously address\nspatio-temporal dependencies within traffic data effectively. To efficiently\nhandle the computational requirements associated with this deep learning model,\nthis study used a graph federated learning technique known as GraphFL. This\napproach is designed to facilitate the training process. The experimental\nresults obtained from conducting tests on two distinct traffic datasets\ndemonstrate that the utilization of FLAGCN leads to the optimization of both\ntraining and inference durations while maintaining a high level of prediction\naccuracy. FLAGCN outperforms existing models with significant improvements by\nachieving up to approximately 6.85% reduction in RMSE, 20.45% reduction in\nMAPE, compared to the best-performing existing models.\n","authors":["Muhammad Yaqub","Shahzad Ahmad","Malik Abdul Manan","Imran Shabir Chuhan"],"pdf_url":"https://arxiv.org/pdf/2401.02723v2.pdf","comment":"I request to withdraw my paper from arXiv due to significant updates\n and improvements identified post-submission. These enhancements will\n substantially elevate the work's quality and impact. I plan to resubmit the\n revised paper upon completion of these updates. Thank you for accommodating\n this request"},{"id":"http://arxiv.org/abs/2402.19326v2","updated":"2024-04-05T06:56:08Z","published":"2024-02-29T16:29:53Z","title":"Generalizable Whole Slide Image Classification with Fine-Grained\n Visual-Semantic Interaction","summary":" Whole Slide Image (WSI) classification is often formulated as a Multiple\nInstance Learning (MIL) problem. Recently, Vision-Language Models (VLMs) have\ndemonstrated remarkable performance in WSI classification. However, existing\nmethods leverage coarse-grained pathogenetic descriptions for visual\nrepresentation supervision, which are insufficient to capture the complex\nvisual appearance of pathogenetic images, hindering the generalizability of\nmodels on diverse downstream tasks. Additionally, processing high-resolution\nWSIs can be computationally expensive. In this paper, we propose a novel\n\"Fine-grained Visual-Semantic Interaction\" (FiVE) framework for WSI\nclassification. 
It is designed to enhance the model's generalizability by\nleveraging the interaction between localized visual patterns and fine-grained\npathological semantics. Specifically, with meticulously designed queries, we\nstart by utilizing a large language model to extract fine-grained pathological\ndescriptions from various non-standardized raw reports. The output descriptions\nare then reconstructed into fine-grained labels used for training. By\nintroducing a Task-specific Fine-grained Semantics (TFS) module, we enable\nprompts to capture crucial visual information in WSIs, which enhances\nrepresentation learning and augments generalization capabilities significantly.\nFurthermore, given that pathological visual patterns are redundantly\ndistributed across tissue slices, we sample a subset of visual instances during\ntraining. Our method demonstrates robust generalizability and strong\ntransferability, dominantly outperforming the counterparts on the TCGA Lung\nCancer dataset with at least 9.19% higher accuracy in few-shot experiments. The\ncode is available at: https://github.com/ls1rius/WSI_FiVE.\n","authors":["Hao Li","Ying Chen","Yifei Chen","Wenxian Yang","Bowen Ding","Yuchen Han","Liansheng Wang","Rongshan Yu"],"pdf_url":"https://arxiv.org/pdf/2402.19326v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.03913v1","updated":"2024-04-05T06:41:27Z","published":"2024-04-05T06:41:27Z","title":"Concept Weaver: Enabling Multi-Concept Fusion in Text-to-Image Models","summary":" While there has been significant progress in customizing text-to-image\ngeneration models, generating images that combine multiple personalized\nconcepts remains challenging. In this work, we introduce Concept Weaver, a\nmethod for composing customized text-to-image diffusion models at inference\ntime. Specifically, the method breaks the process into two steps: creating a\ntemplate image aligned with the semantics of input prompts, and then\npersonalizing the template using a concept fusion strategy. The fusion strategy\nincorporates the appearance of the target concepts into the template image\nwhile retaining its structural details. The results indicate that our method\ncan generate multiple custom concepts with higher identity fidelity compared to\nalternative approaches. Furthermore, the method is shown to seamlessly handle\nmore than two concepts and closely follow the semantic meaning of the input\nprompt without blending appearances across different subjects.\n","authors":["Gihyun Kwon","Simon Jenni","Dingzeyu Li","Joon-Young Lee","Jong Chul Ye","Fabian Caba Heilbron"],"pdf_url":"https://arxiv.org/pdf/2404.03913v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.03906v1","updated":"2024-04-05T05:58:40Z","published":"2024-04-05T05:58:40Z","title":"Deep Phase Coded Image Prior","summary":" Phase-coded imaging is a computational imaging method designed to tackle\ntasks such as passive depth estimation and extended depth of field (EDOF) using\ndepth cues inserted during image capture. Most of the current deep\nlearning-based methods for depth estimation or all-in-focus imaging require a\ntraining dataset with high-quality depth maps and an optimal focus point at\ninfinity for all-in-focus images. Such datasets are difficult to create,\nusually synthetic, and require external graphic programs. 
We propose a new\nmethod named \"Deep Phase Coded Image Prior\" (DPCIP) for jointly recovering the\ndepth map and all-in-focus image from a coded-phase image using solely the\ncaptured image and the optical information of the imaging system. Our approach\ndoes not depend on any specific dataset and surpasses prior supervised\ntechniques utilizing the same imaging system. This improvement is achieved\nthrough the utilization of a problem formulation based on implicit neural\nrepresentation (INR) and deep image prior (DIP). Due to our zero-shot method,\nwe overcome the barrier of acquiring accurate ground-truth data of depth maps\nand all-in-focus images for each new phase-coded system introduced. This allows\nfocusing mainly on developing the imaging system, and not on ground-truth data\ncollection.\n","authors":["Nimrod Shabtay","Eli Schwartz","Raja Giryes"],"pdf_url":"https://arxiv.org/pdf/2404.03906v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.01439v2","updated":"2024-04-05T05:46:59Z","published":"2024-03-03T08:25:04Z","title":"Dynamic Adapter Meets Prompt Tuning: Parameter-Efficient Transfer\n Learning for Point Cloud Analysis","summary":" Point cloud analysis has achieved outstanding performance by transferring\npoint cloud pre-trained models. However, existing methods for model adaptation\nusually update all model parameters, i.e., full fine-tuning paradigm, which is\ninefficient as it relies on high computational costs (e.g., training GPU\nmemory) and massive storage space. In this paper, we aim to study\nparameter-efficient transfer learning for point cloud analysis with an ideal\ntrade-off between task performance and parameter efficiency. To achieve this\ngoal, we freeze the parameters of the default pre-trained models and then\npropose the Dynamic Adapter, which generates a dynamic scale for each token,\nconsidering the token significance to the downstream task. We further\nseamlessly integrate Dynamic Adapter with Prompt Tuning (DAPT) by constructing\nInternal Prompts, capturing the instance-specific features for interaction.\nExtensive experiments conducted on five challenging datasets demonstrate that\nthe proposed DAPT achieves superior performance compared to the full\nfine-tuning counterparts while significantly reducing the trainable parameters\nand training GPU memory by 95% and 35%, respectively. Code is available at\nhttps://github.com/LMD0311/DAPT.\n","authors":["Xin Zhou","Dingkang Liang","Wei Xu","Xingkui Zhu","Yihan Xu","Zhikang Zou","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2403.01439v2.pdf","comment":"Accepted to CVPR 2024. Code is available at\n https://github.com/LMD0311/DAPT"},{"id":"http://arxiv.org/abs/2404.03898v1","updated":"2024-04-05T05:42:23Z","published":"2024-04-05T05:42:23Z","title":"VoltaVision: A Transfer Learning model for electronic component\n classification","summary":" In this paper, we analyze the effectiveness of transfer learning on\nclassifying electronic components. Transfer learning reuses pre-trained models\nto save time and resources in building a robust classifier rather than learning\nfrom scratch. Our work introduces a lightweight CNN, coined as VoltaVision, and\ncompares its performance against more complex models. We test the hypothesis\nthat transferring knowledge from a similar task to our target domain yields\nbetter results than state-of-the-art models trained on general datasets. 
Our\ndataset and code for this work are available at\nhttps://github.com/AnasIshfaque/VoltaVision.\n","authors":["Anas Mohammad Ishfaqul Muktadir Osmani","Taimur Rahman","Salekul Islam"],"pdf_url":"https://arxiv.org/pdf/2404.03898v1.pdf","comment":"Tiny Paper at ICLR 2024"},{"id":"http://arxiv.org/abs/2310.20550v3","updated":"2024-04-05T05:29:29Z","published":"2023-10-31T15:31:39Z","title":"CapsFusion: Rethinking Image-Text Data at Scale","summary":" Large multimodal models demonstrate remarkable generalist ability to perform\ndiverse multimodal tasks in a zero-shot manner. Large-scale web-based\nimage-text pairs contribute fundamentally to this success, but suffer from\nexcessive noise. Recent studies use alternative captions synthesized by\ncaptioning models and have achieved notable benchmark performance. However, our\nexperiments reveal significant Scalability Deficiency and World Knowledge Loss\nissues in models trained with synthetic captions, which have been largely\nobscured by their initial benchmark success. Upon closer examination, we\nidentify the root cause as the overly-simplified language structure and lack of\nknowledge details in existing synthetic captions. To provide higher-quality and\nmore scalable multimodal pretraining data, we propose CapsFusion, an advanced\nframework that leverages large language models to consolidate and refine\ninformation from both web-based image-text pairs and synthetic captions.\nExtensive experiments show that CapsFusion captions exhibit remarkable\nall-round superiority over existing captions in terms of model performance\n(e.g., 18.8 and 18.3 improvements in CIDEr score on COCO and NoCaps), sample\nefficiency (requiring 11-16 times less computation than baselines), world\nknowledge depth, and scalability. These effectiveness, efficiency and\nscalability advantages position CapsFusion as a promising candidate for future\nscaling of LMM training.\n","authors":["Qiying Yu","Quan Sun","Xiaosong Zhang","Yufeng Cui","Fan Zhang","Yue Cao","Xinlong Wang","Jingjing Liu"],"pdf_url":"https://arxiv.org/pdf/2310.20550v3.pdf","comment":"CVPR 2024. Code & Dataset: https://github.com/baaivision/CapsFusion"},{"id":"http://arxiv.org/abs/2404.03892v1","updated":"2024-04-05T05:00:21Z","published":"2024-04-05T05:00:21Z","title":"Enhancing Breast Cancer Diagnosis in Mammography: Evaluation and\n Integration of Convolutional Neural Networks and Explainable AI","summary":" The study introduces an integrated framework combining Convolutional Neural\nNetworks (CNNs) and Explainable Artificial Intelligence (XAI) for the enhanced\ndiagnosis of breast cancer using the CBIS-DDSM dataset. Utilizing a fine-tuned\nResNet50 architecture, our investigation not only provides effective\ndifferentiation of mammographic images into benign and malignant categories but\nalso addresses the opaque \"black-box\" nature of deep learning models by\nemploying XAI methodologies, namely Grad-CAM, LIME, and SHAP, to interpret CNN\ndecision-making processes for healthcare professionals. Our methodology\nencompasses an elaborate data preprocessing pipeline and advanced data\naugmentation techniques to counteract dataset limitations, and transfer\nlearning using pre-trained networks, such as VGG-16, DenseNet and ResNet was\nemployed. A focal point of our study is the evaluation of XAI's effectiveness\nin interpreting model predictions, highlighted by utilising the Hausdorff\nmeasure to assess the alignment between AI-generated explanations and expert\nannotations quantitatively. 
This approach plays a critical role for XAI in\npromoting trustworthiness and ethical fairness in AI-assisted diagnostics. The\nfindings from our research illustrate the effective collaboration between CNNs\nand XAI in advancing diagnostic methods for breast cancer, thereby facilitating\na more seamless integration of advanced AI technologies within clinical\nsettings. By enhancing the interpretability of AI-driven decisions, this work\nlays the groundwork for improved collaboration between AI systems and medical\npractitioners, ultimately enriching patient care. Furthermore, the implications\nof our research extend well beyond the current methodologies, advocating for\nsubsequent inquiries into the integration of multimodal data and the refinement\nof AI explanations to satisfy the needs of clinical practice.\n","authors":["Maryam Ahmed","Tooba Bibi","Rizwan Ahmed Khan","Sidra Nasir"],"pdf_url":"https://arxiv.org/pdf/2404.03892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03052v2","updated":"2024-04-05T04:33:23Z","published":"2023-12-05T18:58:37Z","title":"Visual Program Distillation: Distilling Tools and Programmatic Reasoning\n into Vision-Language Models","summary":" Solving complex visual tasks such as \"Who invented the musical instrument on\nthe right?\" involves a composition of skills: understanding space, recognizing\ninstruments, and also retrieving prior knowledge. Recent work shows promise by\ndecomposing such tasks using a large language model (LLM) into an executable\nprogram that invokes specialized vision models. However, generated programs are\nerror-prone: they omit necessary steps, include spurious ones, and are unable\nto recover when the specialized models give incorrect outputs. Moreover, they\nrequire loading multiple models, incurring high latency and computation costs.\nWe propose Visual Program Distillation (VPD), an instruction tuning framework\nthat produces a vision-language model (VLM) capable of solving complex visual\ntasks with a single forward pass. VPD distills the reasoning ability of LLMs by\nusing them to sample multiple candidate programs, which are then executed and\nverified to identify a correct one. It translates each correct program into a\nlanguage description of the reasoning steps, which are then distilled into a\nVLM. Extensive experiments show that VPD improves the VLM's ability to count,\nunderstand spatial relations, and reason compositionally. Our VPD-trained\nPaLI-X outperforms all prior VLMs, achieving state-of-the-art performance\nacross complex vision tasks, including MMBench, OK-VQA, A-OKVQA, TallyQA, POPE,\nand Hateful Memes. An evaluation with human annotators also confirms that VPD\nimproves model response factuality and consistency. 
Finally, experiments on\ncontent moderation demonstrate that VPD is also helpful for adaptation to\nreal-world applications with limited data.\n","authors":["Yushi Hu","Otilia Stretcu","Chun-Ta Lu","Krishnamurthy Viswanathan","Kenji Hata","Enming Luo","Ranjay Krishna","Ariel Fuxman"],"pdf_url":"https://arxiv.org/pdf/2312.03052v2.pdf","comment":"CVPR 2024 Oral"},{"id":"http://arxiv.org/abs/2404.03883v1","updated":"2024-04-05T04:11:31Z","published":"2024-04-05T04:11:31Z","title":"LiDAR-Guided Cross-Attention Fusion for Hyperspectral Band Selection and\n Image Classification","summary":" The fusion of hyperspectral and LiDAR data has been an active research topic.\nExisting fusion methods have ignored the high-dimensionality and redundancy\nchallenges in hyperspectral images, despite that band selection methods have\nbeen intensively studied for hyperspectral image (HSI) processing. This paper\naddresses this significant gap by introducing a cross-attention mechanism from\nthe transformer architecture for the selection of HSI bands guided by LiDAR\ndata. LiDAR provides high-resolution vertical structural information, which can\nbe useful in distinguishing different types of land cover that may have similar\nspectral signatures but different structural profiles. In our approach, the\nLiDAR data are used as the \"query\" to search and identify the \"key\" from the\nHSI to choose the most pertinent bands for LiDAR. This method ensures that the\nselected HSI bands drastically reduce redundancy and computational requirements\nwhile working optimally with the LiDAR data. Extensive experiments have been\nundertaken on three paired HSI and LiDAR data sets: Houston 2013, Trento and\nMUUFL. The results highlight the superiority of the cross-attention mechanism,\nunderlining the enhanced classification accuracy of the identified HSI bands\nwhen fused with the LiDAR features. The results also show that the use of fewer\nbands combined with LiDAR surpasses the performance of state-of-the-art fusion\nmodels.\n","authors":["Judy X Yang","Jun Zhou","Jing Wang","Hui Tian","Wee Chung Liew"],"pdf_url":"https://arxiv.org/pdf/2404.03883v1.pdf","comment":"15 pages, 13 figures"},{"id":"http://arxiv.org/abs/2404.03876v1","updated":"2024-04-05T03:51:19Z","published":"2024-04-05T03:51:19Z","title":"Increasing Fairness in Classification of Out of Distribution Data for\n Facial Recognition","summary":" Standard classification theory assumes that the distribution of images in the\ntest and training sets are identical. Unfortunately, real-life scenarios\ntypically feature unseen data (\"out-of-distribution data\") which is different\nfrom data in the training distribution(\"in-distribution\"). This issue is most\nprevalent in social justice problems where data from under-represented groups\nmay appear in the test data without representing an equal proportion of the\ntraining data. This may result in a model returning confidently wrong decisions\nand predictions. We are interested in the following question: Can the\nperformance of a neural network improve on facial images of out-of-distribution\ndata when it is trained simultaneously on multiple datasets of in-distribution\ndata? We approach this problem by incorporating the Outlier Exposure model and\ninvestigate how the model's performance changes when other datasets of facial\nimages were implemented. 
We observe that the accuracy and other metrics of the\nmodel can be increased by applying Outlier Exposure, incorporating a trainable\nweight parameter to increase the machine's emphasis on outlier images, and by\nre-weighting the importance of different class labels. We also experimented\nwith whether sorting the images and determining outliers via image features\nwould have more of an effect on the metrics than sorting by average pixel\nvalue. Our goal was to make models not only more accurate but also more fair by\nscanning a more expanded range of images. We also tested the datasets in\nreverse order to see whether a more fair dataset with balanced features has an\neffect on the model's accuracy.\n","authors":["Gianluca Barone","Aashrit Cunchala","Rudy Nunez"],"pdf_url":"https://arxiv.org/pdf/2404.03876v1.pdf","comment":"18 pages, 6 tables, 6 figures"},{"id":"http://arxiv.org/abs/2306.00003v3","updated":"2024-04-05T03:25:04Z","published":"2023-05-25T18:22:12Z","title":"Detecting Heart Disease from Multi-View Ultrasound Images via Supervised\n Attention Multiple Instance Learning","summary":" Aortic stenosis (AS) is a degenerative valve condition that causes\nsubstantial morbidity and mortality. This condition is under-diagnosed and\nunder-treated. In clinical practice, AS is diagnosed with expert review of\ntransthoracic echocardiography, which produces dozens of ultrasound images of\nthe heart. Only some of these views show the aortic valve. To automate\nscreening for AS, deep networks must learn to mimic a human expert's ability to\nidentify views of the aortic valve then aggregate across these relevant images\nto produce a study-level diagnosis. We find previous approaches to AS detection\nyield insufficient accuracy due to relying on inflexible averages across\nimages. We further find that off-the-shelf attention-based multiple instance\nlearning (MIL) performs poorly. We contribute a new end-to-end MIL approach\nwith two key methodological innovations. First, a supervised attention\ntechnique guides the learned attention mechanism to favor relevant views.\nSecond, a novel self-supervised pretraining strategy applies contrastive\nlearning on the representation of the whole study instead of individual images\nas commonly done in prior literature. Experiments on an open-access dataset and\nan external validation set show that our approach yields higher accuracy while\nreducing model size.\n","authors":["Zhe Huang","Benjamin S. Wessler","Michael C. Hughes"],"pdf_url":"https://arxiv.org/pdf/2306.00003v3.pdf","comment":"Echocardiogram; multiple-instance learning; self-supervised learning;\n semi-supervised learning; medical imaging"},{"id":"http://arxiv.org/abs/2404.01655v2","updated":"2024-04-05T03:15:11Z","published":"2024-04-02T05:56:17Z","title":"FashionEngine: Interactive Generation and Editing of 3D Clothed Humans","summary":" We present FashionEngine, an interactive 3D human generation and editing\nsystem that allows us to design 3D digital humans in a way that aligns with how\nhumans interact with the world, such as natural languages, visual perceptions,\nand hand-drawing. FashionEngine automates the 3D human production with three\nkey components: 1) A pre-trained 3D human diffusion model that learns to model\n3D humans in a semantic UV latent space from 2D image training data, which\nprovides strong priors for diverse generation and editing tasks. 
2)\nMultimodality-UV Space encoding the texture appearance, shape topology, and\ntextual semantics of human clothing in a canonical UV-aligned space, which\nfaithfully aligns the user multimodal inputs with the implicit UV latent space\nfor controllable 3D human editing. The multimodality-UV space is shared across\ndifferent user inputs, such as texts, images, and sketches, which enables\nvarious joint multimodal editing tasks. 3) Multimodality-UV Aligned Sampler\nlearns to sample high-quality and diverse 3D humans from the diffusion prior\nfor multimodal user inputs. Extensive experiments validate FashionEngine's\nstate-of-the-art performance for conditional generation/editing tasks. In\naddition, we present an interactive user interface for our FashionEngine that\nenables both conditional and unconditional generation tasks, and editing tasks\nincluding pose/view/shape control, text-, image-, and sketch-driven 3D human\nediting and 3D virtual try-on, in a unified framework. Our project page is at:\nhttps://taohuumd.github.io/projects/FashionEngine.\n","authors":["Tao Hu","Fangzhou Hong","Zhaoxi Chen","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2404.01655v2.pdf","comment":"Project Page: https://taohuumd.github.io/projects/FashionEngine"},{"id":"http://arxiv.org/abs/2403.12686v3","updated":"2024-04-05T02:34:01Z","published":"2024-03-19T12:45:18Z","title":"WaterVG: Waterway Visual Grounding based on Text-Guided Vision and\n mmWave Radar","summary":" The perception of waterways based on human intent is significant for\nautonomous navigation and operations of Unmanned Surface Vehicles (USVs) in\nwater environments. Inspired by visual grounding, we introduce WaterVG, the\nfirst visual grounding dataset designed for USV-based waterway perception based\non human prompts. WaterVG encompasses prompts describing multiple targets, with\nannotations at the instance level including bounding boxes and masks. Notably,\nWaterVG includes 11,568 samples with 34,987 referred targets, whose prompts\nintegrate both visual and radar characteristics. This text-guided, two-sensor\npattern equips finer-grained text prompts with visual and radar features of the\nreferred targets. Moreover, we propose a low-power visual grounding\nmodel, Potamoi, which is a multi-task model with a well-designed Phased\nHeterogeneous Modality Fusion (PHMF) mode, including Adaptive Radar Weighting\n(ARW) and Multi-Head Slim Cross Attention (MHSCA). Specifically, ARW extracts\nthe required radar features to fuse with vision for prompt alignment. MHSCA is an\nefficient fusion module with a remarkably small parameter count and FLOPs,\nelegantly fusing scenario context captured by two sensors with linguistic\nfeatures, which performs expressively on visual grounding tasks.
Comprehensive\nexperiments and evaluations have been conducted on WaterVG, where our Potamoi\nachieves state-of-the-art performance compared with its counterparts.\n","authors":["Runwei Guan","Liye Jia","Fengyufan Yang","Shanliang Yao","Erick Purwanto","Xiaohui Zhu","Eng Gee Lim","Jeremy Smith","Ka Lok Man","Xuming Hu","Yutao Yue"],"pdf_url":"https://arxiv.org/pdf/2403.12686v3.pdf","comment":"10 pages, 10 figures"},{"id":"http://arxiv.org/abs/2404.03854v1","updated":"2024-04-05T01:17:25Z","published":"2024-04-05T01:17:25Z","title":"Mitigating Heterogeneity in Federated Multimodal Learning with\n Biomedical Vision-Language Pre-training","summary":" Vision-language pre-training (VLP) has emerged as an efficient scheme for\nmultimodal representation learning, but it requires large-scale multimodal data\nfor pre-training, which is an obstacle especially for biomedical applications.\nTo overcome the data limitation, federated learning (FL) can be a promising\nstrategy to scale up the dataset for biomedical VLP while protecting data\nprivacy. However, client data are often heterogeneous in real-world scenarios,\nand we observe that local training on heterogeneous client data would distort\nthe multimodal representation learning and lead to biased cross-modal\nalignment. To address this challenge, we propose the Federated distributional\nRobust Guidance-Based (FedRGB) learning framework for federated VLP with\nrobustness to data heterogeneity. Specifically, we utilize a guidance-based\nlocal training scheme to reduce feature distortions, and employ a\ndistribution-based min-max optimization to learn unbiased cross-modal\nalignment. The experiments on real-world datasets show our method successfully\npromotes efficient federated multimodal learning for biomedical VLP with data\nheterogeneity.\n","authors":["Zitao Shuai","Liyue Shen"],"pdf_url":"https://arxiv.org/pdf/2404.03854v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05061v2","updated":"2024-04-05T00:43:16Z","published":"2024-03-08T05:15:48Z","title":"RadarDistill: Boosting Radar-based Object Detection Performance via\n Knowledge Distillation from LiDAR Features","summary":" The inherent noisy and sparse characteristics of radar data pose challenges\nin finding effective representations for 3D object detection. In this paper, we\npropose RadarDistill, a novel knowledge distillation (KD) method, which can\nimprove the representation of radar data by leveraging LiDAR data. RadarDistill\nsuccessfully transfers desirable characteristics of LiDAR features into radar\nfeatures using three key components: Cross-Modality Alignment (CMA),\nActivation-based Feature Distillation (AFD), and Proposal-based Feature\nDistillation (PFD). CMA enhances the density of radar features by employing\nmultiple layers of dilation operations, effectively addressing the challenge of\ninefficient knowledge transfer from LiDAR to radar. AFD selectively transfers\nknowledge based on regions of the LiDAR features, with a specific focus on\nareas where activation intensity exceeds a predefined threshold. PFD similarly\nguides the radar network to selectively mimic features from the LiDAR network\nwithin the object proposals. Our comparative analyses conducted on the nuScenes\ndatasets demonstrate that RadarDistill achieves state-of-the-art (SOTA)\nperformance for the radar-only object detection task, recording 20.5% in mAP and\n43.7% in NDS.
Also, RadarDistill significantly improves the performance of the\ncamera-radar fusion model.\n","authors":["Geonho Bang","Kwangjin Choi","Jisong Kim","Dongsuk Kum","Jun Won Choi"],"pdf_url":"https://arxiv.org/pdf/2403.05061v2.pdf","comment":"Accepted to IEEE/CVF Conference on Computer Vision and Pattern\n Recognition (CVPR) 2024, 10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2404.00498v2","updated":"2024-04-05T00:09:00Z","published":"2024-03-30T23:42:23Z","title":"94% on CIFAR-10 in 3.29 Seconds on a Single GPU","summary":" CIFAR-10 is among the most widely used datasets in machine learning,\nfacilitating thousands of research projects per year. To accelerate research\nand reduce the cost of experiments, we introduce training methods for CIFAR-10\nwhich reach 94% accuracy in 3.29 seconds, 95% in 10.4 seconds, and 96% in 46.3\nseconds, when run on a single NVIDIA A100 GPU. As one factor contributing to\nthese training speeds, we propose a derandomized variant of horizontal flipping\naugmentation, which we show improves over the standard method in every case\nwhere flipping is beneficial over no flipping at all. Our code is released at\nhttps://github.com/KellerJordan/cifar10-airbench.\n","authors":["Keller Jordan"],"pdf_url":"https://arxiv.org/pdf/2404.00498v2.pdf","comment":null}]},"2024-04-08T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2404.05695v1","updated":"2024-04-08T17:26:28Z","published":"2024-04-08T17:26:28Z","title":"Humanoid-Gym: Reinforcement Learning for Humanoid Robot with Zero-Shot\n Sim2Real Transfer","summary":" Humanoid-Gym is an easy-to-use reinforcement learning (RL) framework based on\nNvidia Isaac Gym, designed to train locomotion skills for humanoid robots,\nemphasizing zero-shot transfer from simulation to the real-world environment.\nHumanoid-Gym also integrates a sim-to-sim framework from Isaac Gym to Mujoco\nthat allows users to verify the trained policies in different physical\nsimulations to ensure the robustness and generalization of the policies. This\nframework is verified by RobotEra's XBot-S (1.2-meter tall humanoid robot) and\nXBot-L (1.65-meter tall humanoid robot) in a real-world environment with\nzero-shot sim-to-real transfer. The project website and source code can be\nfound at: https://sites.google.com/view/humanoid-gym/.\n","authors":["Xinyang Gu","Yen-Jen Wang","Jianyu Chen"],"pdf_url":"https://arxiv.org/pdf/2404.05695v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12720v2","updated":"2024-04-08T16:13:24Z","published":"2024-03-19T13:29:44Z","title":"Shared Autonomy via Variable Impedance Control and Virtual Potential\n Fields for Encoding Human Demonstration","summary":" This article introduces a framework for complex human-robot collaboration\ntasks, such as the co-manufacturing of furniture. For these tasks, it is\nessential to encode tasks from human demonstration and reproduce these skills\nin a compliant and safe manner. Therefore, two key components are addressed in\nthis work: motion generation and shared autonomy. We propose a motion generator\nbased on a time-invariant potential field, capable of encoding wrench profiles,\ncomplex and closed-loop trajectories, and additionally incorporates obstacle\navoidance. Additionally, the paper addresses shared autonomy (SA) which enables\nsynergetic collaboration between human operators and robots by dynamically\nallocating authority. 
Variable impedance control (VIC) and force control are\nemployed, where impedance and wrench are adapted based on the human-robot\nautonomy factor derived from interaction forces. System passivity is ensured by\nan energy-tank based task passivation strategy. The framework's efficacy is\nvalidated through simulations and an experimental study employing a Franka\nEmika Research 3 robot. More information can be found on the project website\nhttps://shailjadav.github.io/SALADS/\n","authors":["Shail Jadav","Johannes Heidersberger","Christian Ott","Dongheui Lee"],"pdf_url":"https://arxiv.org/pdf/2403.12720v2.pdf","comment":"Accepted to ICRA 2024. More information can be found on the project\n website https://shailjadav.github.io/SALADS/"},{"id":"http://arxiv.org/abs/2404.05627v1","updated":"2024-04-08T15:59:47Z","published":"2024-04-08T15:59:47Z","title":"OtterROS: Picking and Programming an Uncrewed Surface Vessel for\n Experimental Field Robotics Research with ROS 2","summary":" There exist a wide range of options for field robotics research using ground\nand aerial mobile robots, but there are comparatively few robust and\nresearch-ready uncrewed surface vessels (USVs). This workshop paper starts with\na snapshot of USVs currently available to the research community and then\ndescribes \"OtterROS\", an open source ROS 2 solution for the Otter USV. Field\nexperiments using OtterROS are described, which highlight the utility of the\nOtter USV and the benefits of using ROS 2 in aquatic robotics research. For\nthose interested in USV research, the paper details recommended hardware to run\nOtterROS and includes an example ROS 2 package using OtterROS, removing\nunnecessary non-recurring engineering from field robotics research activities.\n","authors":["Thomas M. C. Sears","M. Riley Cooper","Sabrina R. Button","Joshua A. Marshall"],"pdf_url":"https://arxiv.org/pdf/2404.05627v1.pdf","comment":"8 pages, 6 figures. Complete package details at\n https://github.com/offroad-robotics/otter_ros. Submitted to Workshop on Field\n Robotics at ICRA 2024"},{"id":"http://arxiv.org/abs/2404.05625v1","updated":"2024-04-08T15:57:31Z","published":"2024-04-08T15:57:31Z","title":"Robust Control using Control Lyapunov Function and Hamilton-Jacobi\n Reachability","summary":" The paper presents a robust control technique that combines the Control\nLyapunov function and Hamilton-Jacobi Reachability to compute a controller and\nits Region of Attraction (ROA). The Control Lyapunov function uses a linear\nsystem model with an assumed additive uncertainty to calculate a control gain\nand the level sets of the ROA as a function of the uncertainty. Next,\nHamilton-Jacobi reachability uses the nonlinear model with the modeled\nuncertainty, which need not be additive, to compute the backward reachable set\n(BRS). Finally, by juxtaposing the level sets of the ROA with BRS, we can\ncalculate the worst-case additive disturbance and the ROA of the nonlinear\nmodel. We illustrate our approach on a 2D quadcopter tracking trajectory and a\n2D quadcopter with height and velocity regulation in simulation.\n","authors":["Chun-Ming Yang","Pranav A. 
Bhounsule"],"pdf_url":"https://arxiv.org/pdf/2404.05625v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07301v2","updated":"2024-04-08T15:47:20Z","published":"2023-08-14T17:39:44Z","title":"A Unified Masked Autoencoder with Patchified Skeletons for Motion\n Synthesis","summary":" The synthesis of human motion has traditionally been addressed through\ntask-dependent models that focus on specific challenges, such as predicting\nfuture motions or filling in intermediate poses conditioned on known key-poses.\nIn this paper, we present a novel task-independent model called UNIMASK-M,\nwhich can effectively address these challenges using a unified architecture.\nOur model obtains comparable or better performance than the state-of-the-art in\neach field. Inspired by Vision Transformers (ViTs), our UNIMASK-M model\ndecomposes a human pose into body parts to leverage the spatio-temporal\nrelationships existing in human motion. Moreover, we reformulate various\npose-conditioned motion synthesis tasks as a reconstruction problem with\ndifferent masking patterns given as input. By explicitly informing our model\nabout the masked joints, our UNIMASK-M becomes more robust to occlusions.\nExperimental results show that our model successfully forecasts human motion on\nthe Human3.6M dataset. Moreover, it achieves state-of-the-art results in motion\ninbetweening on the LaFAN1 dataset, particularly in long transition periods.\nMore information can be found on the project website\nhttps://evm7.github.io/UNIMASKM-page/\n","authors":["Esteve Valls Mascaro","Hyemin Ahn","Dongheui Lee"],"pdf_url":"https://arxiv.org/pdf/2308.07301v2.pdf","comment":"Accepted to AAAI2024. Webpage: https://evm7.github.io/UNIMASKM-page/"},{"id":"http://arxiv.org/abs/2309.16524v2","updated":"2024-04-08T15:46:09Z","published":"2023-09-28T15:34:49Z","title":"HOI4ABOT: Human-Object Interaction Anticipation for Human Intention\n Reading Collaborative roBOTs","summary":" Robots are becoming increasingly integrated into our lives, assisting us in\nvarious tasks. To ensure effective collaboration between humans and robots, it\nis essential that they understand our intentions and anticipate our actions. In\nthis paper, we propose a Human-Object Interaction (HOI) anticipation framework\nfor collaborative robots. We propose an efficient and robust transformer-based\nmodel to detect and anticipate HOIs from videos. This enhanced anticipation\nempowers robots to proactively assist humans, resulting in more efficient and\nintuitive collaborations. Our model outperforms state-of-the-art results in HOI\ndetection and anticipation in VidHOI dataset with an increase of 1.76% and\n1.04% in mAP respectively while being 15.4 times faster. We showcase the\neffectiveness of our approach through experimental results in a real robot,\ndemonstrating that the robot's ability to anticipate HOIs is key for better\nHuman-Robot Interaction. More information can be found on our project webpage:\nhttps://evm7.github.io/HOI4ABOT_page/\n","authors":["Esteve Valls Mascaro","Daniel Sliwowski","Dongheui Lee"],"pdf_url":"https://arxiv.org/pdf/2309.16524v2.pdf","comment":"Proceedings in Conference on Robot Learning 2023. 
Webpage:\n https://evm7.github.io/HOI4ABOT_page/"},{"id":"http://arxiv.org/abs/2309.05310v3","updated":"2024-04-08T15:44:31Z","published":"2023-09-11T08:55:04Z","title":"ImitationNet: Unsupervised Human-to-Robot Motion Retargeting via Shared\n Latent Space","summary":" This paper introduces a novel deep-learning approach for human-to-robot\nmotion retargeting, enabling robots to mimic human poses accurately. Contrary\nto prior deep-learning-based works, our method does not require paired\nhuman-to-robot data, which facilitates its translation to new robots. First, we\nconstruct a shared latent space between humans and robots via adaptive\ncontrastive learning that takes advantage of a proposed cross-domain similarity\nmetric between the human and robot poses. Additionally, we propose a\nconsistency term to build a common latent space that captures the similarity of\nthe poses with precision while allowing direct robot motion control from the\nlatent space. For instance, we can generate in-between motion through simple\nlinear interpolation between two projected human poses. We conduct a\ncomprehensive evaluation of robot control from diverse modalities (i.e., texts,\nRGB videos, and key poses), which facilitates robot control for non-expert\nusers. Our model outperforms existing works regarding human-to-robot\nretargeting in terms of efficiency and precision. Finally, we implemented our\nmethod in a real robot with self-collision avoidance through a whole-body\ncontroller to showcase the effectiveness of our approach. More information on\nour website https://evm7.github.io/UnsH2R/\n","authors":["Yashuai Yan","Esteve Valls Mascaro","Dongheui Lee"],"pdf_url":"https://arxiv.org/pdf/2309.05310v3.pdf","comment":"Accepted to Humanoids 2023. Website: https://evm7.github.io/UnsH2R/"},{"id":"http://arxiv.org/abs/2402.04768v2","updated":"2024-04-08T15:43:14Z","published":"2024-02-07T11:37:14Z","title":"Robot Interaction Behavior Generation based on Social Motion Forecasting\n for Human-Robot Interaction","summary":" Integrating robots into populated environments is a complex challenge that\nrequires an understanding of human social dynamics. In this work, we propose to\nmodel social motion forecasting in a shared human-robot representation space,\nwhich facilitates us to synthesize robot motions that interact with humans in\nsocial scenarios despite not observing any robot in the motion training. We\ndevelop a transformer-based architecture called ECHO, which operates in the\naforementioned shared space to predict the future motions of the agents\nencountered in social scenarios. Contrary to prior works, we reformulate the\nsocial motion problem as the refinement of the predicted individual motions\nbased on the surrounding agents, which facilitates the training while allowing\nfor single-motion forecasting when only one human is in the scene. We evaluate\nour model in multi-person and human-robot motion forecasting tasks and obtain\nstate-of-the-art performance by a large margin while being efficient and\nperforming in real-time. Additionally, our qualitative results showcase the\neffectiveness of our approach in generating human-robot interaction behaviors\nthat can be controlled via text commands. Webpage: https://evm7.github.io/ECHO/\n","authors":["Esteve Valls Mascaro","Yashuai Yan","Dongheui Lee"],"pdf_url":"https://arxiv.org/pdf/2402.04768v2.pdf","comment":"Accepted at ICRA 2024. 
Webpage: https://evm7.github.io/ECHO/"},{"id":"http://arxiv.org/abs/2404.05582v1","updated":"2024-04-08T14:57:16Z","published":"2024-04-08T14:57:16Z","title":"Learning Prehensile Dexterity by Imitating and Emulating State-only\n Observations","summary":" When humans learn physical skills (e.g., learn to play tennis), we tend to\nfirst observe and learn what an expert is doing. But this is often\ninsufficient. Therefore, we subsequently engage in practice, where we try to\nemulate the expert. Inspired by this observation, we introduce Combining\nIMitation and Emulation for Motion Refinement (CIMER) -- a two-stage framework\nto learn dexterous prehensile manipulation skills from state-only observations.\nCIMER's first stage involves imitation: simultaneously encode the complex\ninterdependent motions of the robot hand and the object in a structured\ndynamical system. This results in a reactive motion generation policy that\nprovides a reasonable motion prior, but lacks the ability to reason about\ncontact effects due to the lack of action labels. The second stage involves\nemulation: learn a motion refinement policy to make adjustments to the motion\nprior of the robot hand such that the desired object motion is reenacted. CIMER\nis both task-agnostic (no task-specific reward design or shaping) and\nintervention-free (no need for additional teleoperated or labeled\ndemonstrations). Detailed experiments reveal that i) Imitation alone is\ninsufficient, but adding emulation drastically improves performance, ii) CIMER\noutperforms existing methods in terms of sample efficiency and the ability to\ngenerate realistic and stable motions, iii) CIMER can either zero-shot\ngeneralize or learn to adapt to novel objects from the YCB dataset, even\noutperforming expert policies trained with action labels in most cases.\n","authors":["Yunhai Han","Zhenyang Chen","Harish Ravichandar"],"pdf_url":"https://arxiv.org/pdf/2404.05582v1.pdf","comment":"Under review by RA-L"},{"id":"http://arxiv.org/abs/2404.05581v1","updated":"2024-04-08T14:56:56Z","published":"2024-04-08T14:56:56Z","title":"Design and Simulation of Time-energy Optimal Anti-swing Trajectory\n Planner for Autonomous Tower Cranes","summary":" For autonomous crane lifting, optimal trajectories of the crane are required\nas reference inputs to the crane controller to facilitate feedforward control.\nReducing the unactuated payload motion is a crucial issue for under-actuated\ntower cranes with spherical pendulum dynamics. The planned trajectory should be\noptimal in terms of both operating time and energy consumption, to facilitate\noptimum output spending optimum effort. This article proposes an anti-swing\ntower crane trajectory planner that can provide time-energy optimal solutions\nfor the Computer-Aided Lift Planning (CALP) system developed at Nanyang\nTechnological University, which facilitates collision-free lifting path\nplanning of robotized tower cranes in autonomous construction sites. The\ncurrent work introduces a trajectory planning module to the system that\nutilizes the geometric outputs from the path planning module and optimally\nscales them with time information. Firstly, analyzing the non-linear dynamics\nof the crane operations, the tower crane is established as differentially flat.\nSubsequently, the multi-objective trajectory optimization problems for all the\ncrane operations are formulated in the flat output space through consideration\nof the mechanical and safety constraints. 
Two multi-objective evolutionary\nalgorithms, namely Non-dominated Sorting Genetic Algorithm (NSGA-II) and\nGeneralized Differential Evolution 3 (GDE3), are extensively compared via\nstatistical measures based on the closeness of solutions to the Pareto front,\ndistribution of solutions in the solution space and the runtime, to select the\noptimization engine of the planner. Finally, the crane operation trajectories\nare obtained via the corresponding planned flat output trajectories. Studies\nsimulating real-world lifting scenarios are conducted to verify the\neffectiveness and reliability of the proposed module of the lift planning\nsystem.\n","authors":["Souravik Dutta","Yiyu Cai"],"pdf_url":"https://arxiv.org/pdf/2404.05581v1.pdf","comment":"18 pages, 12 figures, 9 tables"},{"id":"http://arxiv.org/abs/2309.12089v2","updated":"2024-04-08T14:39:56Z","published":"2023-09-21T13:58:26Z","title":"HiCRISP: An LLM-based Hierarchical Closed-Loop Robotic Intelligent\n Self-Correction Planner","summary":" The integration of Large Language Models (LLMs) into robotics has\nrevolutionized human-robot interactions and autonomous task planning. However,\nthese systems are often unable to self-correct during the task execution, which\nhinders their adaptability in dynamic real-world environments. To address this\nissue, we present a Hierarchical Closed-loop Robotic Intelligent\nSelf-correction Planner (HiCRISP), an innovative framework that enables robots\nto correct errors within individual steps during the task execution. HiCRISP\nactively monitors and adapts the task execution process, addressing both\nhigh-level planning and low-level action errors. Extensive benchmark\nexperiments, encompassing virtual and real-world scenarios, showcase HiCRISP's\nexceptional performance, positioning it as a promising solution for robotic\ntask planning with LLMs.\n","authors":["Chenlin Ming","Jiacheng Lin","Pangkit Fong","Han Wang","Xiaoming Duan","Jianping He"],"pdf_url":"https://arxiv.org/pdf/2309.12089v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.04729v2","updated":"2024-04-08T14:30:36Z","published":"2024-03-07T18:31:32Z","title":"Stretchable Pneumatic Sleeve for Adaptable, Low-Displacement Anchoring\n in Exosuits","summary":" Despite recent advances in wearable technology, interfacing movement\nassistance devices with the human body remains challenging. We present a\nstretchable pneumatic sleeve that can anchor an exosuit actuator to the human\narm with a low displacement of the actuator's mounting point relative to the\nbody during operation. Our sleeve has the potential to serve as an adaptable\nattachment mechanism for exosuits, since it can adjust its pressure to only\ncompress the arm as much as needed to transmit the applied exosuit forces\nwithout a large displacement. We discuss the design of our sleeve, which is\nmade of fabric pneumatic artificial muscle (fPAM) actuators formed into bands.\nWe quantify the performance of nine fPAM bands of various lengths and widths,\nas well as three sleeves (an fPAM sleeve, a series pouch motor (SPM) sleeve as\nin previous literature, and an off the shelf hook and loop sleeve), through the\nmeasurement of the compressing force as a function of pressure and the\nlocalized pulling force that can be resisted as a function of both pressure and\nmounting point displacement. Our experimental results show that fPAM bands with\nsmaller resting length and/or larger resting width produce higher forces. 
Also,\nwhen inflated, an fPAM sleeve that has equivalent dimensions to the SPM sleeve\nwhile fully stretched has similar performance to the SPM sleeve. While\ninflated, both pneumatic sleeves decrease the mounting point displacement\ncompared to the hook and loop sleeve. Compared to the SPM sleeve, the fPAM\nsleeve is able to hold larger internal pressure before bursting, increasing its\npossible force range. Also, when not inflated, the fPAM sleeve resists the\npulling force well, indicating its ability to provide anchoring when not\nactuated.\n","authors":["Katalin Schaffer","Ultan Fallon","Margaret M. Coad"],"pdf_url":"https://arxiv.org/pdf/2403.04729v2.pdf","comment":"7th IEEE-RAS International Conference on Soft Robotics (RoboSoft\n 2024) Supplementary video: https://youtu.be/9orz3NzMXT4?si=ZCjG72tS_2rSeFhJ"},{"id":"http://arxiv.org/abs/2404.05535v1","updated":"2024-04-08T14:03:33Z","published":"2024-04-08T14:03:33Z","title":"Robust STL Control Synthesis under Maximal Disturbance Sets","summary":" This work addresses maximally robust control synthesis under unknown\ndisturbances. We consider a general nonlinear system, subject to a Signal\nTemporal Logic (STL) specification, and wish to jointly synthesize the maximal\npossible disturbance bounds and the corresponding controllers that ensure the\nSTL specification is satisfied under these bounds. Many works have considered\nSTL satisfaction under given bounded disturbances. Yet, to the authors' best\nknowledge, this is the first work that aims to maximize the permissible\ndisturbance set and find the corresponding controllers that ensure satisfying\nthe STL specification with maximum disturbance robustness. We extend the notion\nof disturbance-robust semantics for STL, which is a property of a\nspecification, dynamical system, and controller, and provide an algorithm to\nget the maximal disturbance robust controllers satisfying an STL specification\nusing Hamilton-Jacobi reachability. We show its soundness and provide a\nsimulation example with an Autonomous Underwater Vehicle (AUV).\n","authors":["Joris Verhagen","Lars Lindemann","Jana Tumova"],"pdf_url":"https://arxiv.org/pdf/2404.05535v1.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2404.05505v1","updated":"2024-04-08T13:27:07Z","published":"2024-04-08T13:27:07Z","title":"Taming Transformers for Realistic Lidar Point Cloud Generation","summary":" Diffusion Models (DMs) have achieved State-Of-The-Art (SOTA) results in the\nLidar point cloud generation task, benefiting from their stable training and\niterative refinement during sampling. However, DMs often fail to realistically\nmodel Lidar raydrop noise due to their inherent denoising process. To retain\nthe strength of iterative sampling while enhancing the generation of raydrop\nnoise, we introduce LidarGRIT, a generative model that uses auto-regressive\ntransformers to iteratively sample the range images in the latent space rather\nthan image space. Furthermore, LidarGRIT utilises VQ-VAE to separately decode\nrange images and raydrop masks. Our results show that LidarGRIT achieves\nsuperior performance compared to SOTA models on KITTI-360 and KITTI odometry\ndatasets. 
Code available at:https://github.com/hamedhaghighi/LidarGRIT.\n","authors":["Hamed Haghighi","Amir Samadi","Mehrdad Dianati","Valentina Donzella","Kurt Debattista"],"pdf_url":"https://arxiv.org/pdf/2404.05505v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05452v1","updated":"2024-04-08T12:36:47Z","published":"2024-04-08T12:36:47Z","title":"A Hessian for Gaussian Mixture Likelihoods in Nonlinear Least Squares","summary":" This paper proposes a novel Hessian approximation for Maximum a Posteriori\nestimation problems in robotics involving Gaussian mixture likelihoods. The\nproposed Hessian leads to better convergence properties. Previous approaches\nmanipulate the Gaussian mixture likelihood into a form that allows the problem\nto be represented as a nonlinear least squares (NLS) problem. However, they\nresult in an inaccurate Hessian approximation due to additional nonlinearities\nthat are not accounted for in NLS solvers. The proposed Hessian approximation\nis derived by setting the Hessians of the Gaussian mixture component errors to\nzero, which is the same starting point as for the Gauss-Newton Hessian\napproximation for NLS, and using the chain rule to account for additional\nnonlinearities. The proposed Hessian approximation is more accurate, resulting\nin improved convergence properties that are demonstrated on simulated and\nreal-world experiments. A method to maintain compatibility with existing\nsolvers, such as ceres, is also presented. Accompanying software and\nsupplementary material can be found at\nhttps://github.com/decargroup/hessian_sum_mixtures.\n","authors":["Vassili Korotkine","Mitchell Cohen","James Richard Forbes"],"pdf_url":"https://arxiv.org/pdf/2404.05452v1.pdf","comment":"8 pages, 2 figures. Submitted to IEEE Robotics and Automation Letters"},{"id":"http://arxiv.org/abs/2404.05444v1","updated":"2024-04-08T12:26:06Z","published":"2024-04-08T12:26:06Z","title":"The Open Autonomy Safety Case Framework","summary":" A system safety case is a compelling, comprehensible, and valid argument\nabout the satisfaction of the safety goals of a given system operating in a\ngiven environment supported by convincing evidence. Since the publication of UL\n4600 in 2020, safety cases have become a best practice for measuring, managing,\nand communicating the safety of autonomous vehicles (AVs). Although UL 4600\nprovides guidance on how to build the safety case for an AV, the complexity of\nAVs and their operating environments, the novelty of the used technology, the\nneed for complying with various regulations and technical standards, and for\naddressing cybersecurity concerns and ethical considerations make the\ndevelopment of safety cases for AVs challenging. To this end, safety case\nframeworks have been proposed that bring strategies, argument templates, and\nother guidance together to support the development of a safety case. 
This paper\nintroduces the Open Autonomy Safety Case Framework, developed over years of\nwork with the autonomous vehicle industry, as a roadmap for how AVs can be\ndeployed safely and responsibly.\n","authors":["Michael Wagner","Carmen Carlan"],"pdf_url":"https://arxiv.org/pdf/2404.05444v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05423v1","updated":"2024-04-08T11:43:40Z","published":"2024-04-08T11:43:40Z","title":"Residual Chain Prediction for Autonomous Driving Path Planning","summary":" In the rapidly evolving field of autonomous driving systems, the refinement\nof path planning algorithms is paramount for navigating vehicles through\ndynamic environments, particularly in complex urban scenarios. Traditional path\nplanning algorithms, which are heavily reliant on static rules and manually\ndefined parameters, often fall short in such contexts, highlighting the need\nfor more adaptive, learning-based approaches. Among these, behavior cloning\nemerges as a noteworthy strategy for its simplicity and efficiency, especially\nwithin the realm of end-to-end path planning. However, behavior cloning faces\nchallenges, such as covariate shift when employing traditional Manhattan\ndistance as the metric. Addressing this, our study introduces the novel concept\nof Residual Chain Loss. Residual Chain Loss dynamically adjusts the loss\ncalculation process to enhance the temporal dependency and accuracy of\npredicted path points, significantly improving the model's performance without\nadditional computational overhead. Through testing on the nuScenes dataset, we\nunderscore the method's substantial advancements in addressing covariate shift,\nfacilitating dynamic loss adjustments, and ensuring seamless integration with\nend-to-end path planning frameworks. Our findings highlight the potential of\nResidual Chain Loss to revolutionize planning component of autonomous driving\nsystems, marking a significant step forward in the quest for level 5 autonomous\ndriving system.\n","authors":["Liguo Zhou","Yirui Zhou","Huaming Liu","Alois Knoll"],"pdf_url":"https://arxiv.org/pdf/2404.05423v1.pdf","comment":"6 pages, 2 figures"},{"id":"http://arxiv.org/abs/2310.04781v3","updated":"2024-04-08T10:39:34Z","published":"2023-10-07T11:47:58Z","title":"Unifying Foundation Models with Quadrotor Control for Visual Tracking\n Beyond Object Categories","summary":" Visual control enables quadrotors to adaptively navigate using real-time\nsensory data, bridging perception with action. Yet, challenges persist,\nincluding generalization across scenarios, maintaining reliability, and\nensuring real-time responsiveness. This paper introduces a perception framework\ngrounded in foundation models for universal object detection and tracking,\nmoving beyond specific training categories. Integral to our approach is a\nmulti-layered tracker integrated with the foundation detector, ensuring\ncontinuous target visibility, even when faced with motion blur, abrupt light\nshifts, and occlusions. Complementing this, we introduce a model-free\ncontroller tailored for resilient quadrotor visual tracking. Our system\noperates efficiently on limited hardware, relying solely on an onboard camera\nand an inertial measurement unit. Through extensive validation in diverse\nchallenging indoor and outdoor environments, we demonstrate our system's\neffectiveness and adaptability. 
In conclusion, our research represents a step\nforward in quadrotor visual tracking, moving from task-specific methods to more\nversatile and adaptable operations.\n","authors":["Alessandro Saviolo","Pratyaksh Rao","Vivek Radhakrishnan","Jiuhong Xiao","Giuseppe Loianno"],"pdf_url":"https://arxiv.org/pdf/2310.04781v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03336v2","updated":"2024-04-08T09:56:28Z","published":"2024-04-04T10:04:44Z","title":"Scaling Population-Based Reinforcement Learning with GPU Accelerated\n Simulation","summary":" In recent years, deep reinforcement learning (RL) has shown its effectiveness\nin solving complex continuous control tasks like locomotion and dexterous\nmanipulation. However, this comes at the cost of an enormous amount of\nexperience required for training, exacerbated by the sensitivity of learning\nefficiency and the policy performance to hyperparameter selection, which often\nrequires numerous trials of time-consuming experiments. This work introduces a\nPopulation-Based Reinforcement Learning (PBRL) approach that exploits a\nGPU-accelerated physics simulator to enhance the exploration capabilities of RL\nby concurrently training multiple policies in parallel. The PBRL framework is\napplied to three state-of-the-art RL algorithms -- PPO, SAC, and DDPG --\ndynamically adjusting hyperparameters based on the performance of learning\nagents. The experiments are performed on four challenging tasks in Isaac Gym --\nAnymal Terrain, Shadow Hand, Humanoid, Franka Nut Pick -- by analyzing the\neffect of population size and mutation mechanisms for hyperparameters. The\nresults show that PBRL agents achieve superior performance, in terms of\ncumulative reward, compared to non-evolutionary baseline agents. The trained\nagents are finally deployed in the real world for a Franka Nut Pick task,\ndemonstrating successful sim-to-real transfer. Code and videos of the learned\npolicies are available on our project website.\n","authors":["Asad Ali Shahid","Yashraj Narang","Vincenzo Petrone","Enrico Ferrentino","Ankur Handa","Dieter Fox","Marco Pavone","Loris Roveda"],"pdf_url":"https://arxiv.org/pdf/2404.03336v2.pdf","comment":"Submitted for publication to IEEE Robotics and Automation Letters\n (RA-L)"},{"id":"http://arxiv.org/abs/2404.05351v1","updated":"2024-04-08T09:38:40Z","published":"2024-04-08T09:38:40Z","title":"Semi-Supervised Novelty Detection for Precise Ultra-Wideband Error\n Signal Prediction","summary":" Ultra-Wideband (UWB) technology is an emerging low-cost solution for\nlocalization in a generic environment. However, UWB signal can be affected by\nsignal reflections and non-line-of-sight (NLoS) conditions between anchors;\nhence, in a broader sense, the specific geometry of the environment and the\ndisposition of obstructing elements in the map may drastically hinder the\nreliability of UWB for precise robot localization. This work aims to mitigate\nthis problem by learning a map-specific characterization of the UWB quality\nsignal with a fingerprint semi-supervised novelty detection methodology. An\nunsupervised autoencoder neural network is trained on nominal UWB map\nconditions, and then it is used to predict errors derived from the introduction\nof perturbing novelties in the environment. This work poses a step change in\nthe understanding of UWB localization and its reliability in evolving\nenvironmental conditions. 
The resulting performance of the proposed method is\nproved by fine-grained experiments obtained with a visual tracking ground\ntruth.\n","authors":["Umberto Albertin","Alessandro Navone","Mauro Martini","Marcello Chiaberge"],"pdf_url":"https://arxiv.org/pdf/2404.05351v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05343v1","updated":"2024-04-08T09:29:34Z","published":"2024-04-08T09:29:34Z","title":"Non-linear Model Predictive Control for Multi-task GPS-free Autonomous\n Navigation in Vineyards","summary":" Autonomous navigation is the foundation of agricultural robots. This paper\nfocuses on developing an advanced autonomous navigation system for a rover\noperating within row-based crops. A position-agnostic system is proposed to\naddress the challenging situation when standard localization methods, like GPS,\nfail due to unfavorable weather or obstructed signals. This breakthrough is\nespecially vital in densely vegetated regions, including areas covered by thick\ntree canopies or pergola vineyards. This work proposed a novel system that\nleverages a single RGB-D camera and a Non-linear Model Predictive Control\nstrategy to navigate through entire rows, adapting to various crop spacing. The\npresented solution demonstrates versatility in handling diverse crop densities,\nenvironmental factors, and multiple navigation tasks to support agricultural\nactivities at an extremely cost-effective implementation. Experimental\nvalidation in simulated and real vineyards underscores the system's robustness\nand competitiveness in both standard row traversal and target objects approach.\n","authors":["Matteo Sperti","Marco Ambrosio","Mauro Martini","Alessandro Navone","Andrea Ostuni","Marcello Chiaberge"],"pdf_url":"https://arxiv.org/pdf/2404.05343v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05338v1","updated":"2024-04-08T09:26:31Z","published":"2024-04-08T09:26:31Z","title":"GPS-free Autonomous Navigation in Cluttered Tree Rows with Deep Semantic\n Segmentation","summary":" Segmentation-based autonomous navigation has recently been presented as an\nappealing approach to guiding robotic platforms through crop rows without\nrequiring perfect GPS localization. Nevertheless, current techniques are\nrestricted to situations where the distinct separation between the plants and\nthe sky allows for the identification of the row's center. However, tall, dense\nvegetation, such as high tree rows and orchards, is the primary cause of GPS\nsignal blockage. In this study, we increase the overall robustness and\nadaptability of the control algorithm by extending the segmentation-based\nrobotic guiding to those cases where canopies and branches occlude the sky and\nprevent the utilization of GPS and earlier approaches. An efficient Deep Neural\nNetwork architecture has been used to address semantic segmentation, performing\nthe training with synthetic data only. 
Numerous vineyards and tree fields have\nundergone extensive testing in both simulation and real-world to show the\nsolution's competitive benefits.\n","authors":["Alessandro Navone","Mauro Martini","Marco Ambrosio","Andrea Ostuni","Simone Angarano","Marcello Chiaberge"],"pdf_url":"https://arxiv.org/pdf/2404.05338v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05318v1","updated":"2024-04-08T09:08:59Z","published":"2024-04-08T09:08:59Z","title":"Stochastic Online Optimization for Cyber-Physical and Robotic Systems","summary":" We propose a novel gradient-based online optimization framework for solving\nstochastic programming problems that frequently arise in the context of\ncyber-physical and robotic systems. Our problem formulation accommodates\nconstraints that model the evolution of a cyber-physical system, which has, in\ngeneral, a continuous state and action space, is nonlinear, and where the state\nis only partially observed. We also incorporate an approximate model of the\ndynamics as prior knowledge into the learning process and show that even rough\nestimates of the dynamics can significantly improve the convergence of our\nalgorithms. Our online optimization framework encompasses both gradient descent\nand quasi-Newton methods, and we provide a unified convergence analysis of our\nalgorithms in a non-convex setting. We also characterize the impact of modeling\nerrors in the system dynamics on the convergence rate of the algorithms.\nFinally, we evaluate our algorithms in simulations of a flexible beam, a\nfour-legged walking robot, and in real-world experiments with a ping-pong\nplaying robot.\n","authors":["Hao Ma","Melanie Zeilinger","Michael Muehlebach"],"pdf_url":"https://arxiv.org/pdf/2404.05318v1.pdf","comment":"46 pages, 16 figures"},{"id":"http://arxiv.org/abs/2404.05309v1","updated":"2024-04-08T08:57:32Z","published":"2024-04-08T08:57:32Z","title":"CLIPping the Limits: Finding the Sweet Spot for Relevant Images in\n Automated Driving Systems Perception Testing","summary":" Perception systems, especially cameras, are the eyes of automated driving\nsystems. Ensuring that they function reliably and robustly is therefore an\nimportant building block in the automation of vehicles. There are various\napproaches to test the perception of automated driving systems. Ultimately,\nhowever, it always comes down to the investigation of the behavior of\nperception systems under specific input data. Camera images are a crucial part\nof the input data. Image data sets are therefore collected for the testing of\nautomated driving systems, but it is non-trivial to find specific images in\nthese data sets. Thanks to recent developments in neural networks, there are\nnow methods for sorting the images in a data set according to their similarity\nto a prompt in natural language. In order to further automate the provision of\nsearch results, we make a contribution by automating the threshold definition\nin these sorted results and returning only the images relevant to the prompt as\na result. Our focus is on preventing false positives and false negatives\nequally. 
It is also important that our method is robust and in the case that\nour assumptions are not fulfilled, we provide a fallback solution.\n","authors":["Philipp Rigoll","Laurenz Adolph","Lennart Ries","Eric Sax"],"pdf_url":"https://arxiv.org/pdf/2404.05309v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05307v1","updated":"2024-04-08T08:53:54Z","published":"2024-04-08T08:53:54Z","title":"Human Detection from 4D Radar Data in Low-Visibility Field Conditions","summary":" Autonomous driving technology is increasingly being used on public roads and\nin industrial settings such as mines. While it is essential to detect\npedestrians, vehicles, or other obstacles, adverse field conditions negatively\naffect the performance of classical sensors such as cameras or lidars. Radar,\non the other hand, is a promising modality that is less affected by, e.g.,\ndust, smoke, water mist or fog. In particular, modern 4D imaging radars provide\ntarget responses across the range, vertical angle, horizontal angle and Doppler\nvelocity dimensions. We propose TMVA4D, a CNN architecture that leverages this\n4D radar modality for semantic segmentation. The CNN is trained to distinguish\nbetween the background and person classes based on a series of 2D projections\nof the 4D radar data that include the elevation, azimuth, range, and Doppler\nvelocity dimensions. We also outline the process of compiling a novel dataset\nconsisting of data collected in industrial settings with a car-mounted 4D radar\nand describe how the ground-truth labels were generated from reference thermal\nimages. Using TMVA4D on this dataset, we achieve an mIoU score of 78.2% and an\nmDice score of 86.1%, evaluated on the two classes background and person\n","authors":["Mikael Skog","Oleksandr Kotlyar","Vladimír Kubelka","Martin Magnusson"],"pdf_url":"https://arxiv.org/pdf/2404.05307v1.pdf","comment":"Submitted to Radar in Robotics workshop at ICRA 2024"},{"id":"http://arxiv.org/abs/2404.03427v2","updated":"2024-04-08T08:40:11Z","published":"2024-04-04T13:13:47Z","title":"GMMCalib: Extrinsic Calibration of LiDAR Sensors using GMM-based Joint\n Registration","summary":" State-of-the-art LiDAR calibration frameworks mainly use non-probabilistic\nregistration methods such as Iterative Closest Point (ICP) and its variants.\nThese methods suffer from biased results due to their pair-wise registration\nprocedure as well as their sensitivity to initialization and parameterization.\nThis often leads to misalignments in the calibration process. Probabilistic\nregistration methods compensate for these drawbacks by specifically modeling\nthe probabilistic nature of the observations. This paper presents GMMCalib, an\nautomatic target-based extrinsic calibration approach for multi-LiDAR systems.\nUsing an implementation of a Gaussian Mixture Model (GMM)-based registration\nmethod that allows joint registration of multiple point clouds, this\ndata-driven approach is compared to ICP algorithms. We perform simulation\nexperiments using the digital twin of the EDGAR research vehicle and validate\nthe results in a real-world environment. We also address the local minima\nproblem of local registration methods for extrinsic sensor calibration and use\na distance-based metric to evaluate the calibration results. Our results show\nthat an increase in robustness against sensor miscalibrations can be achieved\nby using GMM-based registration algorithms. 
The code is open source and\navailable on GitHub.\n","authors":["Ilir Tahiraj","Felix Fent","Philipp Hafemann","Egon Ye","Markus Lienkamp"],"pdf_url":"https://arxiv.org/pdf/2404.03427v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05295v1","updated":"2024-04-08T08:32:39Z","published":"2024-04-08T08:32:39Z","title":"Online Learning of Joint-Muscle Mapping Using Vision in Tendon-driven\n Musculoskeletal Humanoids","summary":" The body structures of tendon-driven musculoskeletal humanoids are complex,\nand accurate modeling is difficult, because they are made by imitating the body\nstructures of human beings. For this reason, we have not been able to move them\naccurately like ordinary humanoids driven by actuators in each axis, and large\ninternal muscle tension and slack of tendon wires have emerged by the model\nerror between its geometric model and the actual robot. Therefore, we construct\na joint-muscle mapping (JMM) using a neural network (NN), which expresses a\nnonlinear relationship between joint angles and muscle lengths, and aim to move\ntendon-driven musculoskeletal humanoids accurately by updating the JMM online\nfrom data of the actual robot. In this study, the JMM is updated online by\nusing the vision of the robot so that it moves to the correct position (Vision\nUpdater). Also, we execute another update to modify muscle antagonisms\ncorrectly (Antagonism Updater). By using these two updaters, the error between\nthe target and actual joint angles decrease to about 40% in 5 minutes, and we\nshow through a manipulation experiment that the tendon-driven musculoskeletal\nhumanoid Kengoro becomes able to move as intended. This novel system can adapt\nto the state change and growth of robots, because it updates the JMM online\nsuccessively.\n","authors":["Kento Kawaharazuka","Shogo Makino","Masaya Kawamura","Yuki Asano","Kei Okada","Masayuki Inaba"],"pdf_url":"https://arxiv.org/pdf/2404.05295v1.pdf","comment":"Accepted at IEEE Robotics and Automation Letters, 2018"},{"id":"http://arxiv.org/abs/2404.05293v1","updated":"2024-04-08T08:31:28Z","published":"2024-04-08T08:31:28Z","title":"Long-time Self-body Image Acquisition and its Application to the Control\n of Musculoskeletal Structures","summary":" The tendon-driven musculoskeletal humanoid has many benefits that human\nbeings have, but the modeling of its complex muscle and bone structures is\ndifficult and conventional model-based controls cannot realize intended\nmovements. Therefore, a learning control mechanism that acquires nonlinear\nrelationships between joint angles, muscle tensions, and muscle lengths from\nthe actual robot is necessary. In this study, we propose a system which runs\nthe learning control mechanism for a long time to keep the self-body image of\nthe musculoskeletal humanoid correct at all times. Also, we show that the\nmusculoskeletal humanoid can conduct position control, torque control, and\nvariable stiffness control using this self-body image. 
We conduct a long-time\nself-body image acquisition experiment lasting 3 hours, evaluate variable\nstiffness control using the self-body image, etc., and discuss the superiority\nand practicality of the self-body image acquisition of musculoskeletal\nstructures, comprehensively.\n","authors":["Kento Kawaharazuka","Kei Tsuzuki","Shogo Makino","Moritaka Onitsuka","Yuki Asano","Kei Okada","Koji Kawasaki","Masayuki Inaba"],"pdf_url":"https://arxiv.org/pdf/2404.05293v1.pdf","comment":"Accepted at IEEE Robotics and Automation Letters, 2019"},{"id":"http://arxiv.org/abs/2404.05291v1","updated":"2024-04-08T08:29:00Z","published":"2024-04-08T08:29:00Z","title":"Long-horizon Locomotion and Manipulation on a Quadrupedal Robot with\n Large Language Models","summary":" We present a large language model (LLM) based system to empower quadrupedal\nrobots with problem-solving abilities for long-horizon tasks beyond short-term\nmotions. Long-horizon tasks for quadrupeds are challenging since they require\nboth a high-level understanding of the semantics of the problem for task\nplanning and a broad range of locomotion and manipulation skills to interact\nwith the environment. Our system builds a high-level reasoning layer with large\nlanguage models, which generates hybrid discrete-continuous plans as robot code\nfrom task descriptions. It comprises multiple LLM agents: a semantic planner\nfor sketching a plan, a parameter calculator for predicting arguments in the\nplan, and a code generator to convert the plan into executable robot code. At\nthe low level, we adopt reinforcement learning to train a set of motion\nplanning and control skills to unleash the flexibility of quadrupeds for rich\nenvironment interactions. Our system is tested on long-horizon tasks that are\ninfeasible to complete with one single skill. Simulation and real-world\nexperiments show that it successfully figures out multi-step strategies and\ndemonstrates non-trivial behaviors, including building tools or notifying a\nhuman for help.\n","authors":["Yutao Ouyang","Jinhan Li","Yunfei Li","Zhongyu Li","Chao Yu","Koushil Sreenath","Yi Wu"],"pdf_url":"https://arxiv.org/pdf/2404.05291v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05286v1","updated":"2024-04-08T08:24:09Z","published":"2024-04-08T08:24:09Z","title":"Online Self-body Image Acquisition Considering Changes in Muscle Routes\n Caused by Softness of Body Tissue for Tendon-driven Musculoskeletal Humanoids","summary":" Tendon-driven musculoskeletal humanoids have many benefits in terms of the\nflexible spine, multiple degrees of freedom, and variable stiffness. At the\nsame time, because of its body complexity, there are problems in\ncontrollability. First, due to the large difference between the actual robot\nand its geometric model, it cannot move as intended and large internal muscle\ntension may emerge. Second, movements which do not appear as changes in muscle\nlengths may emerge, because of the muscle route changes caused by softness of\nbody tissue. To solve these problems, we construct two models: ideal\njoint-muscle model and muscle-route change model, using a neural network. We\ninitialize these models by a man-made geometric model and update them online\nusing the sensor information of the actual robot. 
We validate that the\ntendon-driven musculoskeletal humanoid Kengoro is able to obtain a correct\nself-body image through several experiments.\n","authors":["Kento Kawaharazuka","Shogo Makino","Masaya Kawamura","Ayaka Fujii","Yuki Asano","Kei Okada","Masayuki Inaba"],"pdf_url":"https://arxiv.org/pdf/2404.05286v1.pdf","comment":"Accepted at IROS2018"},{"id":"http://arxiv.org/abs/2401.08298v2","updated":"2024-04-08T08:10:17Z","published":"2024-01-16T11:50:54Z","title":"Online Elasticity Estimation and Material Sorting Using Standard Robot\n Grippers","summary":" Standard robot grippers are not designed for material recognition. We\nexperimentally evaluated the accuracy with which material properties can be\nestimated through object compression by two standard parallel jaw grippers and\na force/torque sensor mounted at the robot wrist, with a professional biaxial\ncompression device used as reference. Gripper effort versus position curves\nwere obtained and transformed into stress/strain curves. The modulus of\nelasticity was estimated at different strain points and the effect of multiple\ncompression cycles (precycling), compression speed, and the gripper surface\narea on estimation was studied. Viscoelasticity was estimated using the energy\nabsorbed in a compression/decompression cycle, the Kelvin-Voigt, and\nHunt-Crossley models. We found that: (1) slower compression speeds improved\nelasticity estimation, while precycling or surface area did not; (2) the robot\ngrippers, even after calibration, were found to have a limited capability of\ndelivering accurate estimates of absolute values of Young's modulus and\nviscoelasticity; (3) relative ordering of material characteristics was largely\nconsistent across different grippers; (4) despite the nonlinear characteristics\nof deformable objects, fitting linear stress/strain approximations led to more\nstable results than local estimates of Young's modulus; (5) the Hunt-Crossley\nmodel worked best to estimate viscoelasticity, from a single object\ncompression. A two-dimensional space formed by elasticity and viscoelasticity\nestimates obtained from a single grasp is advantageous for the discrimination\nof the object material properties. We demonstrated the applicability of our\nfindings in a mock single stream recycling scenario, where plastic, paper, and\nmetal objects were correctly separated from a single grasp, even when\ncompressed at different locations on the object. The data and code are publicly\navailable.\n","authors":["Shubhan P. Patni","Pavel Stoudek","Hynek Chlup","Matej Hoffmann"],"pdf_url":"https://arxiv.org/pdf/2401.08298v2.pdf","comment":"22 pages, 17 figures"},{"id":"http://arxiv.org/abs/2404.05262v1","updated":"2024-04-08T07:51:20Z","published":"2024-04-08T07:51:20Z","title":"Robust Anthropomorphic Robotic Manipulation through Biomimetic\n Distributed Compliance","summary":" The impressive capabilities of humans to robustly perform manipulation relies\non compliant interactions, enabled through the structure and materials\nspatially distributed in our hands. We propose by mimicking this distributed\ncompliance in an anthropomorphic robotic hand, the open-loop manipulation\nrobustness increases and observe the emergence of human-like behaviours. To\nachieve this, we introduce the ADAPT Hand equipped with tunable compliance\nthroughout the skin, fingers, and the wrist. 
Through extensive automated\npick-and-place tests, we show the grasping robustness closely mirrors an\nestimated geometric theoretical limit, while `stress-testing' the robot hand to\nperform 800+ grasps. Finally, 24 items with largely varying geometries are\ngrasped in a constrained environment with a success rate of 93\\%. We\ndemonstrate the hand-object self-organization behavior underlines this extreme\nrobustness, where the hand automatically exhibits different grasp types\ndepending on object geometries. Furthermore, the robot grasp type mimics a\nnatural human grasp with a direct similarity of 68\\%.\n","authors":["Kai Junge","Josie Hughes"],"pdf_url":"https://arxiv.org/pdf/2404.05262v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04071v2","updated":"2024-04-08T07:47:08Z","published":"2024-04-05T12:55:04Z","title":"High-Frequency Capacitive Sensing for Electrohydraulic Soft Actuators","summary":" The need for compliant and proprioceptive actuators has grown more evident in\npursuing more adaptable and versatile robotic systems. Hydraulically Amplified\nSelf-Healing Electrostatic (HASEL) actuators offer distinctive advantages with\ntheir inherent softness and flexibility, making them promising candidates for\nvarious robotic tasks, including delicate interactions with humans and animals,\nbiomimetic locomotion, prosthetics, and exoskeletons. This has resulted in a\ngrowing interest in the capacitive self-sensing capabilities of HASEL actuators\nto create miniature displacement estimation circuitry that does not require\nexternal sensors. However, achieving HASEL self-sensing for actuation\nfrequencies above 1 Hz and with miniature high-voltage power supplies has\nremained limited. In this paper, we introduce the F-HASEL actuator, which adds\nan additional electrode pair used exclusively for capacitive sensing to a\nPeano-HASEL actuator. We demonstrate displacement estimation of the F-HASEL\nduring high-frequency actuation up to 20 Hz and during external loading using\nminiaturized circuitry comprised of low-cost off-the-shelf components and a\nminiature high-voltage power supply. Finally, we propose a circuitry to\nestimate the displacement of multiple F-HASELs and demonstrate it in a wearable\napplication to track joint rotations of a virtual reality user in real-time.\n","authors":["Michel R. Vogt","Maximilian Eberlein","Clemens C. Christoph","Felix Baumann","Fabrice Bourquin","Wim Wende","Fabio Schaub","Amirhossein Kazemipour","Robert K. Katzschmann"],"pdf_url":"https://arxiv.org/pdf/2404.04071v2.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2403.18236v2","updated":"2024-04-08T07:43:30Z","published":"2024-03-27T03:53:30Z","title":"Multi-AGV Path Planning Method via Reinforcement Learning and Particle\n Filters","summary":" The Reinforcement Learning (RL) algorithm, renowned for its robust learning\ncapability and search stability, has garnered significant attention and found\nextensive application in Automated Guided Vehicle (AGV) path planning. However,\nRL planning algorithms encounter challenges stemming from the substantial\nvariance of neural networks caused by environmental instability and significant\nfluctuations in system structure. These challenges manifest in slow convergence\nspeed and low learning efficiency. 
To tackle this issue, this paper presents\nthe Particle Filter-Double Deep Q-Network (PF-DDQN) approach, which\nincorporates the Particle Filter (PF) into multi-AGV reinforcement learning\npath planning. The PF-DDQN method leverages the imprecise weight values of the\nnetwork as state values to formulate the state space equation. Through the\niterative fusion process of neural networks and particle filters, the DDQN\nmodel is optimized to acquire the optimal true weight values, thus enhancing\nthe algorithm's efficiency. The proposed method's effectiveness and superiority\nare validated through numerical simulations. Overall, the simulation results\ndemonstrate that the proposed algorithm surpasses the traditional DDQN\nalgorithm in terms of path planning superiority and training time indicators by\n92.62% and 76.88%, respectively. In conclusion, the PF-DDQN method addresses\nthe challenges encountered by RL planning algorithms in AGV path planning. By\nintegrating the Particle Filter and optimizing the DDQN model, the proposed\nmethod achieves enhanced efficiency and outperforms the traditional DDQN\nalgorithm in terms of path planning superiority and training time indicators.\n","authors":["Shao Shuo"],"pdf_url":"https://arxiv.org/pdf/2403.18236v2.pdf","comment":"The literature cited in the third article is not marked"},{"id":"http://arxiv.org/abs/2404.05249v1","updated":"2024-04-08T07:25:25Z","published":"2024-04-08T07:25:25Z","title":"SAFE-GIL: SAFEty Guided Imitation Learning","summary":" Behavior Cloning is a popular approach to Imitation Learning, in which a\nrobot observes an expert supervisor and learns a control policy. However,\nbehavior cloning suffers from the \"compounding error\" problem - the policy\nerrors compound as it deviates from the expert demonstrations and might lead to\ncatastrophic system failures, limiting its use in safety-critical applications.\nOn-policy data aggregation methods are able to address this issue at the cost\nof rolling out and repeated training of the imitation policy, which can be\ntedious and computationally prohibitive. We propose SAFE-GIL, an off-policy\nbehavior cloning method that guides the expert via adversarial disturbance\nduring data collection. The algorithm abstracts the imitation error as an\nadversarial disturbance in the system dynamics, injects it during data\ncollection to expose the expert to safety critical states, and collects\ncorrective actions. Our method biases training to more closely replicate expert\nbehavior in safety-critical states and allows more variance in less critical\nstates. We compare our method with several behavior cloning techniques and\nDAgger on autonomous navigation and autonomous taxiing tasks and show higher\ntask success and safety, especially in low data regimes where the likelihood of\nerror is higher, at a slight drop in the performance.\n","authors":["Yusuf Umut Ciftci","Zeyuan Feng","Somil Bansal"],"pdf_url":"https://arxiv.org/pdf/2404.05249v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05242v1","updated":"2024-04-08T07:11:43Z","published":"2024-04-08T07:11:43Z","title":"Collision-Free Trajectory Optimization in Cluttered Environments with\n Sums-of-Squares Programming","summary":" In this work, we propose a trajectory optimization approach for robot\nnavigation in cluttered 3D environments. We represent the robot's geometry as a\nsemialgebraic set defined by polynomial inequalities such that robots with\ngeneral shapes can be suitably characterized. 
To address the robot navigation\ntask in obstacle-dense environments, we exploit the free space directly to\nconstruct a sequence of free regions, and allocate each waypoint on the\ntrajectory to a specific region. Then, we incorporate a uniform scaling factor\nfor each free region, and formulate a Sums-of-Squares (SOS) optimization\nproblem that renders the containment relationship between the robot and the\nfree space computationally tractable. The SOS optimization problem is further\nreformulated to a semidefinite program (SDP), and the collision-free\nconstraints are shown to be equivalent to limiting the scaling factor along the\nentire trajectory. In this context, the robot at a specific configuration is\ntailored to stay within the free region. Next, to solve the trajectory\noptimization problem with the proposed safety constraints (which are implicitly\ndependent on the robot configurations), we derive the analytical solution to\nthe gradient of the minimum scaling factor with respect to the robot\nconfiguration. As a result, this seamlessly facilitates the use of\ngradient-based methods in efficient solving of the trajectory optimization\nproblem. Through a series of simulations and real-world experiments, the\nproposed trajectory optimization approach is validated in various challenging\nscenarios, and the results demonstrate its effectiveness in generating\ncollision-free trajectories in dense and intricate environments populated with\nobstacles.\n","authors":["Yulin Li","Chunxin Zheng","Kai Chen","Yusen Xie","Xindong Tang","Michael Yu Wang","Jun Ma"],"pdf_url":"https://arxiv.org/pdf/2404.05242v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05203v1","updated":"2024-04-08T05:10:35Z","published":"2024-04-08T05:10:35Z","title":"MeSA-DRL: Memory-Enhanced Deep Reinforcement Learning for Advanced\n Socially Aware Robot Navigation in Crowded Environments","summary":" Autonomous navigation capabilities play a critical role in service robots\noperating in environments where human interactions are pivotal, due to the\ndynamic and unpredictable nature of these environments. However, the\nvariability in human behavior presents a substantial challenge for robots in\npredicting and anticipating movements, particularly in crowded scenarios. To\naddress this issue, a memory-enabled deep reinforcement learning framework is\nproposed for autonomous robot navigation in diverse pedestrian scenarios. The\nproposed framework leverages long-term memory to retain essential information\nabout the surroundings and model sequential dependencies effectively. The\nimportance of human-robot interactions is also encoded to assign higher\nattention to these interactions. A global planning mechanism is incorporated\ninto the memory-enabled architecture. Additionally, a multi-term reward system\nis designed to prioritize and encourage long-sighted robot behaviors by\nincorporating dynamic warning zones. 
Simultaneously, it promotes smooth\ntrajectories and minimizes the time taken to reach the robot's desired goal.\nExtensive simulation experiments show that the suggested approach outperforms\nrepresentative state-of-the-art methods, showcasing its ability to a navigation\nefficiency and safety in real-world scenarios.\n","authors":["Mannan Saeed Muhammad","Estrella Montero"],"pdf_url":"https://arxiv.org/pdf/2404.05203v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05187v1","updated":"2024-04-08T04:27:36Z","published":"2024-04-08T04:27:36Z","title":"LGSDF: Continual Global Learning of Signed Distance Fields Aided by\n Local Updating","summary":" Implicit reconstruction of ESDF (Euclidean Signed Distance Field) involves\ntraining a neural network to regress the signed distance from any point to the\nnearest obstacle, which has the advantages of lightweight storage and\ncontinuous querying. However, existing algorithms usually rely on conflicting\nraw observations as training data, resulting in poor map performance. In this\npaper, we propose LGSDF, an ESDF continual Global learning algorithm aided by\nLocal updating. At the front end, axis-aligned grids are dynamically updated by\npre-processed sensor observations, where incremental fusion alleviates\nestimation error caused by limited viewing directions. At the back end, a\nrandomly initialized implicit ESDF neural network performs continual\nself-supervised learning guided by these grids to generate smooth and\ncontinuous maps. The results on multiple scenes show that LGSDF can construct\nmore accurate ESDF maps and meshes compared with SOTA (State Of The Art)\nexplicit and implicit mapping algorithms. The source code of LGSDF is publicly\navailable at https://github.com/BIT-DYN/LGSDF.\n","authors":["Yufeng Yue","Yinan Deng","Jiahui Wang","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2404.05187v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2011.00685v3","updated":"2024-04-08T04:06:33Z","published":"2020-11-02T02:15:20Z","title":"Fast Biconnectivity Restoration in Multi-Robot Systems for Robust\n Communication Maintenance","summary":" Maintaining a robust communication network plays an important role in the\nsuccess of a multi-robot team jointly performing an optimization task. A key\ncharacteristic of a robust multi-robot system is the ability to repair the\ncommunication topology itself in the case of robot failure. In this paper, we\nfocus on the Fast Biconnectivity Restoration (FBR) problem, which aims to\nrepair a connected network to make it biconnected as fast as possible, where a\nbiconnected network is a communication topology that cannot be disconnected by\nremoving one node. We develop a Quadratically Constrained Program (QCP)\nformulation of the FBR problem, which provides a way to optimally solve the\nproblem. We also propose an approximation algorithm for the FBR problem based\non graph theory. 
By conducting empirical studies, we demonstrate that our\nproposed approximation algorithm performs close to the optimal while\nsignificantly outperforming the existing solutions.\n","authors":["Md Ishat-E-Rabban","Guangyao Shi","Pratap Tokekar"],"pdf_url":"https://arxiv.org/pdf/2011.00685v3.pdf","comment":"updated author affiliation, fixed typos, added references"},{"id":"http://arxiv.org/abs/2310.07070v2","updated":"2024-04-08T03:13:39Z","published":"2023-10-10T23:34:59Z","title":"D2M2N: Decentralized Differentiable Memory-Enabled Mapping and\n Navigation for Multiple Robots","summary":" Recently, a number of learning-based models have been proposed for\nmulti-robot navigation. However, these models lack memory and only rely on the\ncurrent observations of the robot to plan their actions. They are unable to\nleverage past observations to plan better paths, especially in complex\nenvironments. In this work, we propose a fully differentiable and decentralized\nmemory-enabled architecture for multi-robot navigation and mapping called\nD2M2N. D2M2N maintains a compact representation of the environment to remember\npast observations and uses Value Iteration Network for complex navigation. We\nconduct extensive experiments to show that D2M2N significantly outperforms the\nstate-of-the-art model in complex mapping and navigation task.\n","authors":["Md Ishat-E-Rabban","Pratap Tokekar"],"pdf_url":"https://arxiv.org/pdf/2310.07070v2.pdf","comment":"7 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2404.05164v1","updated":"2024-04-08T03:08:59Z","published":"2024-04-08T03:08:59Z","title":"Rendering-Enhanced Automatic Image-to-Point Cloud Registration for\n Roadside Scenes","summary":" Prior point cloud provides 3D environmental context, which enhances the\ncapabilities of monocular camera in downstream vision tasks, such as 3D object\ndetection, via data fusion. However, the absence of accurate and automated\nregistration methods for estimating camera extrinsic parameters in roadside\nscene point clouds notably constrains the potential applications of roadside\ncameras. This paper proposes a novel approach for the automatic registration\nbetween prior point clouds and images from roadside scenes. The main idea\ninvolves rendering photorealistic grayscale views taken at specific\nperspectives from the prior point cloud with the help of their features like\nRGB or intensity values. These generated views can reduce the modality\ndifferences between images and prior point clouds, thereby improve the\nrobustness and accuracy of the registration results. Particularly, we specify\nan efficient algorithm, named neighbor rendering, for the rendering process.\nThen we introduce a method for automatically estimating the initial guess using\nonly rough guesses of camera's position. At last, we propose a procedure for\niteratively refining the extrinsic parameters by minimizing the reprojection\nerror for line features extracted from both generated and camera images using\nSegment Anything Model (SAM). We assess our method using a self-collected\ndataset, comprising eight cameras strategically positioned throughout the\nuniversity campus. 
Experiments demonstrate our method's capability to\nautomatically align prior point cloud with roadside camera image, achieving a\nrotation accuracy of 0.202 degrees and a translation precision of 0.079m.\nFurthermore, we validate our approach's effectiveness in visual applications by\nsubstantially improving monocular 3D object detection performance.\n","authors":["Yu Sheng","Lu Zhang","Xingchen Li","Yifan Duan","Yanyong Zhang","Yu Zhang","Jianmin Ji"],"pdf_url":"https://arxiv.org/pdf/2404.05164v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.08393v3","updated":"2024-04-08T02:57:55Z","published":"2023-11-14T18:53:28Z","title":"MVSA-Net: Multi-View State-Action Recognition for Robust and Deployable\n Trajectory Generation","summary":" The learn-from-observation (LfO) paradigm is a human-inspired mode for a\nrobot to learn to perform a task simply by watching it being performed. LfO can\nfacilitate robot integration on factory floors by minimizing disruption and\nreducing tedious programming. A key component of the LfO pipeline is a\ntransformation of the depth camera frames to the corresponding task state and\naction pairs, which are then relayed to learning techniques such as imitation\nor inverse reinforcement learning for understanding the task parameters. While\nseveral existing computer vision models analyze videos for activity\nrecognition, SA-Net specifically targets robotic LfO from RGB-D data. However,\nSA-Net and many other models analyze frame data captured from a single\nviewpoint. Their analysis is therefore highly sensitive to occlusions of the\nobserved task, which are frequent in deployments. An obvious way of reducing\nocclusions is to simultaneously observe the task from multiple viewpoints and\nsynchronously fuse the multiple streams in the model. Toward this, we present\nmulti-view SA-Net, which generalizes the SA-Net model to allow the perception\nof multiple viewpoints of the task activity, integrate them, and better\nrecognize the state and action in each frame. Performance evaluations on two\ndistinct domains establish that MVSA-Net recognizes the state-action pairs\nunder occlusion more accurately compared to single-view MVSA-Net and other\nbaselines. Our ablation studies further evaluate its performance under\ndifferent ambient conditions and establish the contribution of the architecture\ncomponents. As such, MVSA-Net offers a significantly more robust and deployable\nstate-action trajectory generation compared to previous methods.\n","authors":["Ehsan Asali","Prashant Doshi","Jin Sun"],"pdf_url":"https://arxiv.org/pdf/2311.08393v3.pdf","comment":"Presented at Deployable AI Workshop at AAAI-2024 and 'Towards\n Reliable and Deployable Learning-Based Robotic Systems' Workshop at CoRL2023"},{"id":"http://arxiv.org/abs/2403.03954v3","updated":"2024-04-08T02:46:38Z","published":"2024-03-06T18:58:49Z","title":"3D Diffusion Policy: Generalizable Visuomotor Policy Learning via Simple\n 3D Representations","summary":" Imitation learning provides an efficient way to teach robots dexterous\nskills; however, learning complex skills robustly and generalizablely usually\nconsumes large amounts of human demonstrations. To tackle this challenging\nproblem, we present 3D Diffusion Policy (DP3), a novel visual imitation\nlearning approach that incorporates the power of 3D visual representations into\ndiffusion policies, a class of conditional action generative models. 
The core\ndesign of DP3 is the utilization of a compact 3D visual representation,\nextracted from sparse point clouds with an efficient point encoder. In our\nexperiments involving 72 simulation tasks, DP3 successfully handles most tasks\nwith just 10 demonstrations and surpasses baselines with a 24.2% relative\nimprovement. In 4 real robot tasks, DP3 demonstrates precise control with a\nhigh success rate of 85%, given only 40 demonstrations of each task, and shows\nexcellent generalization abilities in diverse aspects, including space,\nviewpoint, appearance, and instance. Interestingly, in real robot experiments,\nDP3 rarely violates safety requirements, in contrast to baseline methods which\nfrequently do, necessitating human intervention. Our extensive evaluation\nhighlights the critical importance of 3D representations in real-world robot\nlearning. Videos, code, and data are available on\nhttps://3d-diffusion-policy.github.io .\n","authors":["Yanjie Ze","Gu Zhang","Kangning Zhang","Chenyuan Hu","Muhan Wang","Huazhe Xu"],"pdf_url":"https://arxiv.org/pdf/2403.03954v3.pdf","comment":"Videos, code, and data: https://3d-diffusion-policy.github.io"},{"id":"http://arxiv.org/abs/2404.05151v1","updated":"2024-04-08T02:25:56Z","published":"2024-04-08T02:25:56Z","title":"STITCH: Augmented Dexterity for Suture Throws Including Thread\n Coordination and Handoffs","summary":" We present STITCH: an augmented dexterity pipeline that performs Suture\nThrows Including Thread Coordination and Handoffs. STITCH iteratively performs\nneedle insertion, thread sweeping, needle extraction, suture cinching, needle\nhandover, and needle pose correction with failure recovery policies. We\nintroduce a novel visual 6D needle pose estimation framework using a stereo\ncamera pair and new suturing motion primitives. We compare STITCH to baselines,\nincluding a proprioception-only and a policy without visual servoing. In\nphysical experiments across 15 trials, STITCH achieves an average of 2.93\nsutures without human intervention and 4.47 sutures with human intervention.\nSee https://sites.google.com/berkeley.edu/stitch for code and supplemental\nmaterials.\n","authors":["Kush Hari","Hansoul Kim","Will Panitch","Kishore Srinivas","Vincent Schorp","Karthik Dharmarajan","Shreya Ganti","Tara Sadjadpour","Ken Goldberg"],"pdf_url":"https://arxiv.org/pdf/2404.05151v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05139v1","updated":"2024-04-08T01:38:43Z","published":"2024-04-08T01:38:43Z","title":"Better Monocular 3D Detectors with LiDAR from the Past","summary":" Accurate 3D object detection is crucial to autonomous driving. Though\nLiDAR-based detectors have achieved impressive performance, the high cost of\nLiDAR sensors precludes their widespread adoption in affordable vehicles.\nCamera-based detectors are cheaper alternatives but often suffer inferior\nperformance compared to their LiDAR-based counterparts due to inherent depth\nambiguities in images. In this work, we seek to improve monocular 3D detectors\nby leveraging unlabeled historical LiDAR data. Specifically, at inference time,\nwe assume that the camera-based detectors have access to multiple unlabeled\nLiDAR scans from past traversals at locations of interest (potentially from\nother high-end vehicles equipped with LiDAR sensors). 
Under this setup, we\nproposed a novel, simple, and end-to-end trainable framework, termed\nAsyncDepth, to effectively extract relevant features from asynchronous LiDAR\ntraversals of the same location for monocular 3D detectors. We show consistent\nand significant performance gain (up to 9 AP) across multiple state-of-the-art\nmodels and datasets with a negligible additional latency of 9.66 ms and a small\nstorage cost.\n","authors":["Yurong You","Cheng Perng Phoo","Carlos Andres Diaz-Ruiz","Katie Z Luo","Wei-Lun Chao","Mark Campbell","Bharath Hariharan","Kilian Q Weinberger"],"pdf_url":"https://arxiv.org/pdf/2404.05139v1.pdf","comment":"Accepted by ICRA 2022. The code can be found at\n https://github.com/YurongYou/AsyncDepth"},{"id":"http://arxiv.org/abs/2404.05134v1","updated":"2024-04-08T01:25:38Z","published":"2024-04-08T01:25:38Z","title":"LLM-BT: Performing Robotic Adaptive Tasks based on Large Language Models\n and Behavior Trees","summary":" Large Language Models (LLMs) have been widely utilized to perform complex\nrobotic tasks. However, handling external disturbances during tasks is still an\nopen challenge. This paper proposes a novel method to achieve robotic adaptive\ntasks based on LLMs and Behavior Trees (BTs). It utilizes ChatGPT to reason the\ndescriptive steps of tasks. In order to enable ChatGPT to understand the\nenvironment, semantic maps are constructed by an object recognition algorithm.\nThen, we design a Parser module based on Bidirectional Encoder Representations\nfrom Transformers (BERT) to parse these steps into initial BTs. Subsequently, a\nBTs Update algorithm is proposed to expand the initial BTs dynamically to\ncontrol robots to perform adaptive tasks. Different from other LLM-based\nmethods for complex robotic tasks, our method outputs variable BTs that can add\nand execute new actions according to environmental changes, which is robust to\nexternal disturbances. Our method is validated with simulation in different\npractical scenarios.\n","authors":["Haotian Zhou","Yunhan Lin","Longwu Yan","Jihong Zhu","Huasong Min"],"pdf_url":"https://arxiv.org/pdf/2404.05134v1.pdf","comment":"7 pages, 11figures, WILL PUBLISHED ON ICRA 2024"},{"id":"http://arxiv.org/abs/2404.05120v1","updated":"2024-04-08T00:45:35Z","published":"2024-04-08T00:45:35Z","title":"Rollbot: a Spherical Robot Driven by a Single Actuator","summary":" Here we present Rollbot, the first spherical robot capable of controllably\nmaneuvering on 2D plane with a single actuator. Rollbot rolls on the ground in\ncircular pattern and controls its motion by changing the curvature of the\ntrajectory through accelerating and decelerating its single motor and attached\nmass. We present the theoretical analysis, design, and control of Rollbot, and\ndemonstrate its ability to move in a controllable circular pattern and follow\nwaypoints.\n","authors":["Jingxian Wang","Michael Rubenstein"],"pdf_url":"https://arxiv.org/pdf/2404.05120v1.pdf","comment":"Submission to IROS 2024"},{"id":"http://arxiv.org/abs/2404.05888v1","updated":"2024-04-08T22:01:28Z","published":"2024-04-08T22:01:28Z","title":"A Realistic Surgical Simulator for Non-Rigid and Contact-Rich\n Manipulation in Surgeries with the da Vinci Research Kit","summary":" Realistic real-time surgical simulators play an increasingly important role\nin surgical robotics research, such as surgical robot learning and automation,\nand surgical skills assessment. 
Although there are a number of existing\nsurgical simulators for research, they generally lack the ability to simulate\nthe diverse types of objects and contact-rich manipulation tasks typically\npresent in surgeries, such as tissue cutting and blood suction. In this work,\nwe introduce CRESSim, a realistic surgical simulator based on PhysX 5 for the\nda Vinci Research Kit (dVRK) that enables simulating various contact-rich\nsurgical tasks involving different surgical instruments, soft tissue, and body\nfluids. The real-world dVRK console and the master tool manipulator (MTM)\nrobots are incorporated into the system to allow for teleoperation through\nvirtual reality (VR). To showcase the advantages and potentials of the\nsimulator, we present three examples of surgical tasks, including tissue\ngrasping and deformation, blood suction, and tissue cutting. These tasks are\nperformed using the simulated surgical instruments, including the large needle\ndriver, suction irrigator, and curved scissor, through VR-based teleoperation.\n","authors":["Yafei Ou","Sadra Zargarzadeh","Paniz Sedighi","Mahdi Tavakoli"],"pdf_url":"https://arxiv.org/pdf/2404.05888v1.pdf","comment":"7 pages, 21st International Conference on Ubiquitous Robots (UR\n 2024), accepted"},{"id":"http://arxiv.org/abs/2404.05887v1","updated":"2024-04-08T21:58:25Z","published":"2024-04-08T21:58:25Z","title":"On the Fly Robotic-Assisted Medical Instrument Planning and Execution\n Using Mixed Reality","summary":" Robotic-assisted medical systems (RAMS) have gained significant attention for\ntheir advantages in alleviating surgeons' fatigue and improving patients'\noutcomes. These systems comprise a range of human-computer interactions,\nincluding medical scene monitoring, anatomical target planning, and robot\nmanipulation. However, despite its versatility and effectiveness, RAMS demands\nexpertise in robotics, leading to a high learning cost for the operator. In\nthis work, we introduce a novel framework using mixed reality technologies to\nease the use of RAMS. The proposed framework achieves real-time planning and\nexecution of medical instruments by providing 3D anatomical image overlay,\nhuman-robot collision detection, and robot programming interface. These\nfeatures, integrated with an easy-to-use calibration method for head-mounted\ndisplay, improve the effectiveness of human-robot interactions. To assess the\nfeasibility of the framework, two medical applications are presented in this\nwork: 1) coil placement during transcranial magnetic stimulation and 2) drill\nand injector device positioning during femoroplasty. Results from these use\ncases demonstrate its potential to extend to a wider range of medical\nscenarios.\n","authors":["Letian Ai","Yihao Liu","Mehran Armand","Amir Kheradmand","Alejandro Martin-Gomez"],"pdf_url":"https://arxiv.org/pdf/2404.05887v1.pdf","comment":"This paper has been accepted to IEEE ICRA 2024 as a contributed paper"},{"id":"http://arxiv.org/abs/2404.05884v1","updated":"2024-04-08T21:48:36Z","published":"2024-04-08T21:48:36Z","title":"GBEC: Geometry-Based Hand-Eye Calibration","summary":" Hand-eye calibration is the problem of solving the transformation from the\nend-effector of a robot to the sensor attached to it. Commonly employed\ntechniques, such as AXXB or AXZB formulations, rely on regression methods that\nrequire collecting pose data from different robot configurations, which can\nproduce low accuracy and repeatability. 
However, the derived transformation\nshould solely depend on the geometry of the end-effector and the sensor\nattachment. We propose Geometry-Based End-Effector Calibration (GBEC) that\nenhances the repeatability and accuracy of the derived transformation compared\nto traditional hand-eye calibrations. To demonstrate improvements, we apply the\napproach to two different robot-assisted procedures: Transcranial Magnetic\nStimulation (TMS) and femoroplasty. We also discuss the generalizability of\nGBEC for camera-in-hand and marker-in-hand sensor mounting methods. In the\nexperiments, we perform GBEC between the robot end-effector and an optical\ntracker's rigid body marker attached to the TMS coil or femoroplasty drill\nguide. Previous research documents low repeatability and accuracy of the\nconventional methods for robot-assisted TMS hand-eye calibration. When compared\nto some existing methods, the proposed method relies solely on the geometry of\nthe flange and the pose of the rigid-body marker, making it independent of\nworkspace constraints or robot accuracy, without sacrificing the orthogonality\nof the rotation matrix. Our results validate the accuracy and applicability of\nthe approach, providing a new and generalizable methodology for obtaining the\ntransformation from the end-effector to a sensor.\n","authors":["Yihao Liu","Jiaming Zhang","Zhangcong She","Amir Kheradmand","Mehran Armand"],"pdf_url":"https://arxiv.org/pdf/2404.05884v1.pdf","comment":"This paper has been accepted to IEEE ICRA 2024 as a contributed paper"},{"id":"http://arxiv.org/abs/2303.04857v3","updated":"2024-04-08T21:11:25Z","published":"2023-03-08T19:48:43Z","title":"Breaking Symmetries Leads to Diverse Quadrupedal Gaits","summary":" Symmetry manifests itself in legged locomotion in a variety of ways. No\nmatter where a legged system begins to move periodically, the torso and limbs\ncoordinate with each other's movements in a similar manner. Also, in many gaits\nobserved in nature, the legs on both sides of the torso move in exactly the\nsame way, sometimes they are just half a period out of phase. Furthermore, when\nsome animals move forward and backward, their movements are strikingly similar\nas if the time had been reversed. This work aims to generalize these phenomena\nand propose formal definitions of symmetries in legged locomotion using group\ntheory terminology. Symmetries in some common quadrupedal gaits such as\npronking, bounding, half-bounding, and galloping have been discussed. Moreover,\na spring-mass model has been used to demonstrate how breaking symmetries can\nalter gaits in a legged system. Studying the symmetries may provide insight\ninto which gaits may be suitable for a particular robotic design, or may enable\nroboticists to design more agile and efficient robot controllers by using\ncertain gaits.\n","authors":["Jiayu Ding","Zhenyu Gan"],"pdf_url":"https://arxiv.org/pdf/2303.04857v3.pdf","comment":"Please refer to the published version to cite this paper"},{"id":"http://arxiv.org/abs/2404.05870v1","updated":"2024-04-08T21:08:13Z","published":"2024-04-08T21:08:13Z","title":"CoBT: Collaborative Programming of Behaviour Trees from One\n Demonstration for Robot Manipulation","summary":" Mass customization and shorter manufacturing cycles are becoming more\nimportant among small and medium-sized companies. However, classical industrial\nrobots struggle to cope with product variation and dynamic environments. 
In\nthis paper, we present CoBT, a collaborative programming by demonstration\nframework for generating reactive and modular behavior trees. CoBT relies on a\nsingle demonstration and a combination of data-driven machine learning methods\nwith logic-based declarative learning to learn a task, thus eliminating the\nneed for programming expertise or long development times. The proposed\nframework is experimentally validated on 7 manipulation tasks and we show that\nCoBT achieves approx. 93% success rate overall with an average of 7.5s\nprogramming time. We conduct a pilot study with non-expert users to provide\nfeedback regarding the usability of CoBT.\n","authors":["Aayush Jain","Philip Long","Valeria Villani","John D. Kelleher","Maria Chiara Leva"],"pdf_url":"https://arxiv.org/pdf/2404.05870v1.pdf","comment":"Accepted for presentation at IEEE ICRA 2024"},{"id":"http://arxiv.org/abs/2311.13081v2","updated":"2024-04-08T21:04:51Z","published":"2023-11-22T01:06:45Z","title":"Learning to Fly in Seconds","summary":" Learning-based methods, particularly Reinforcement Learning (RL), hold great\npromise for streamlining deployment, enhancing performance, and achieving\ngeneralization in the control of autonomous multirotor aerial vehicles. Deep RL\nhas been able to control complex systems with impressive fidelity and agility\nin simulation but the simulation-to-reality transfer often brings a\nhard-to-bridge reality gap. Moreover, RL is commonly plagued by prohibitively\nlong training times. In this work, we propose a novel asymmetric\nactor-critic-based architecture coupled with a highly reliable RL-based\ntraining paradigm for end-to-end quadrotor control. We show how curriculum\nlearning and a highly optimized simulator enhance sample complexity and lead to\nfast training times. To precisely discuss the challenges related to\nlow-level/end-to-end multirotor control, we also introduce a taxonomy that\nclassifies the existing levels of control abstractions as well as\nnon-linearities and domain parameters. Our framework enables\nSimulation-to-Reality (Sim2Real) transfer for direct RPM control after only 18\nseconds of training on a consumer-grade laptop as well as its deployment on\nmicrocontrollers to control a multirotor under real-time guarantees. Finally,\nour solution exhibits competitive performance in trajectory tracking, as\ndemonstrated through various experimental comparisons with existing\nstate-of-the-art control solutions using a real Crazyflie nano quadrotor. We\nopen source the code including a very fast multirotor dynamics simulator that\ncan simulate about 5 months of flight per second on a laptop GPU. The fast\ntraining times and deployment to a cheap, off-the-shelf quadrotor lower the\nbarriers to entry and help democratize the research and development of these\nsystems.\n","authors":["Jonas Eschmann","Dario Albani","Giuseppe Loianno"],"pdf_url":"https://arxiv.org/pdf/2311.13081v2.pdf","comment":"Accepted for publication in IEEE Robotics and Automation Letters\n (RA-L)"},{"id":"http://arxiv.org/abs/2404.05858v1","updated":"2024-04-08T20:42:10Z","published":"2024-04-08T20:42:10Z","title":"A Neuromorphic Approach to Obstacle Avoidance in Robot Manipulation","summary":" Neuromorphic computing mimics computational principles of the brain in\n$\\textit{silico}$ and motivates research into event-based vision and spiking\nneural networks (SNNs). 
Event cameras (ECs) exclusively capture local intensity\nchanges and offer superior power consumption, response latencies, and dynamic\nranges. SNNs replicate biological neuronal dynamics and have demonstrated\npotential as alternatives to conventional artificial neural networks (ANNs),\nsuch as in reducing energy expenditure and inference time in visual\nclassification. Nevertheless, these novel paradigms remain scarcely explored\noutside the domain of aerial robots.\n To investigate the utility of brain-inspired sensing and data processing, we\ndeveloped a neuromorphic approach to obstacle avoidance on a camera-equipped\nmanipulator. Our approach adapts high-level trajectory plans with reactive\nmaneuvers by processing emulated event data in a convolutional SNN, decoding\nneural activations into avoidance motions, and adjusting plans using a dynamic\nmotion primitive. We conducted experiments with a Kinova Gen3 arm performing\nsimple reaching tasks that involve obstacles in sets of distinct task scenarios\nand in comparison to a non-adaptive baseline.\n Our neuromorphic approach facilitated reliable avoidance of imminent\ncollisions in simulated and real-world experiments, where the baseline\nconsistently failed. Trajectory adaptations had low impacts on safety and\npredictability criteria. Among the notable SNN properties were the correlation\nof computations with the magnitude of perceived motions and a robustness to\ndifferent event emulation methods. Tests with a DAVIS346 EC showed similar\nperformance, validating our experimental event emulation. Our results motivate\nincorporating SNN learning, utilizing neuromorphic processors, and further\nexploring the potential of neuromorphic methods.\n","authors":["Ahmed Faisal Abdelrahman","Matias Valdenegro-Toro","Maren Bennewitz","Paul G. Plöger"],"pdf_url":"https://arxiv.org/pdf/2404.05858v1.pdf","comment":"35 pages, accepted at IJRR, authors' version"},{"id":"http://arxiv.org/abs/2302.14445v2","updated":"2024-04-08T17:15:58Z","published":"2023-02-28T09:45:51Z","title":"Embedded light-weight approach for safe landing in populated areas","summary":" Landing safety is a challenge heavily engaging the research community\nrecently, due to the increasing interest in applications availed by aerial\nvehicles. In this paper, we propose a landing safety pipeline based on state of\nthe art object detectors and OctoMap. First, a point cloud of surface obstacles\nis generated, which is then inserted in an OctoMap. The unoccupied areas are\nidentified, thus resulting in a list of safe landing points. Due to the low\ninference time achieved by state of the art object detectors and the efficient\npoint cloud manipulation using OctoMap, it is feasible to deploy our approach\non low-weight embedded systems. The proposed pipeline has been evaluated\nin many simulation scenarios, varying in people density, number, and movement.\nSimulations were executed with an Nvidia Jetson Nano in the loop to confirm the\npipeline's performance and robustness on low computing power hardware.
The\nexperiments yielded promising results with a 95% success rate.\n","authors":["Tilemahos Mitroudas","Vasiliki Balaska","Athanasios Psomoulis","Antonios Gasteratos"],"pdf_url":"https://arxiv.org/pdf/2302.14445v2.pdf","comment":"outdated research item"},{"id":"http://arxiv.org/abs/2404.05223v1","updated":"2024-04-08T06:36:42Z","published":"2024-04-08T06:36:42Z","title":"ITA-ECBS: A Bounded-Suboptimal Algorithm for Combined Target-Assignment\n and Path-Finding Problem","summary":" Multi-Agent Path Finding (MAPF), i.e., finding collision-free paths for\nmultiple robots, plays a critical role in many applications. Sometimes,\nassigning a specific target to each agent also presents a challenge. The\nCombined Target-Assignment and Path-Finding (TAPF) problem, a variant of MAPF,\nrequires simultaneously assigning targets to agents and planning collision-free\npaths. Several algorithms, including CBM, CBS-TA, and ITA-CBS, can optimally\nsolve the TAPF problem, with ITA-CBS being the leading method in terms of flowtime.\nHowever, the only existing suboptimal method, ECBS-TA, is derived from CBS-TA\nrather than ITA-CBS, and adapting the optimal ITA-CBS method to its\nbounded-suboptimal variant is a challenge due to the variability of target\nassignment solutions in different search nodes. We introduce ITA-ECBS as the\nfirst bounded-suboptimal variant of ITA-CBS. ITA-ECBS employs focal search to\nenhance efficiency and determines target assignments based on a new lower bound\nmatrix. We show that ITA-ECBS outperforms the baseline method ECBS-TA in 87.42%\nof 54,033 test cases.\n","authors":["Yimin Tang","Sven Koenig","Jiaoyang Li"],"pdf_url":"https://arxiv.org/pdf/2404.05223v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05338v1","updated":"2024-04-08T09:26:31Z","published":"2024-04-08T09:26:31Z","title":"GPS-free Autonomous Navigation in Cluttered Tree Rows with Deep Semantic\n Segmentation","summary":" Segmentation-based autonomous navigation has recently been presented as an\nappealing approach to guiding robotic platforms through crop rows without\nrequiring perfect GPS localization. Nevertheless, current techniques are\nrestricted to situations where the distinct separation between the plants and\nthe sky allows for the identification of the row's center. However, tall, dense\nvegetation, such as high tree rows and orchards, is the primary cause of GPS\nsignal blockage. In this study, we increase the overall robustness and\nadaptability of the control algorithm by extending the segmentation-based\nrobotic guiding to those cases where canopies and branches occlude the sky and\nprevent the utilization of GPS and earlier approaches. An efficient Deep Neural\nNetwork architecture has been used to address semantic segmentation, performing\nthe training with synthetic data only. Numerous vineyards and tree fields have\nundergone extensive testing in both simulation and the real world to show the\nsolution's competitive benefits.\n","authors":["Alessandro Navone","Mauro Martini","Marco Ambrosio","Andrea Ostuni","Simone Angarano","Marcello Chiaberge"],"pdf_url":"https://arxiv.org/pdf/2404.05338v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2304.08988"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.05729v1","updated":"2024-04-08T17:59:46Z","published":"2024-04-08T17:59:46Z","title":"Finding Visual Task Vectors","summary":" Visual Prompting is a technique for teaching models to perform a visual task\nvia in-context examples, without any additional training.
In this work, we\nanalyze the activations of MAE-VQGAN, a recent Visual Prompting model, and find\ntask vectors, activations that encode task-specific information. Equipped with\nthis insight, we demonstrate that it is possible to identify the task vectors\nand use them to guide the network towards performing different tasks without\nproviding any input-output examples. To find task vectors, we compute the\naverage intermediate activations per task and use the REINFORCE algorithm to\nsearch for the subset of task vectors. The resulting task vectors guide the\nmodel towards performing a task better than the original model without the need\nfor input-output examples.\n","authors":["Alberto Hojel","Yutong Bai","Trevor Darrell","Amir Globerson","Amir Bar"],"pdf_url":"https://arxiv.org/pdf/2404.05729v1.pdf","comment":"https://github.com/alhojel/visual_task_vectors"},{"id":"http://arxiv.org/abs/2404.05726v1","updated":"2024-04-08T17:59:24Z","published":"2024-04-08T17:59:24Z","title":"MA-LMM: Memory-Augmented Large Multimodal Model for Long-Term Video\n Understanding","summary":" With the success of large language models (LLMs), integrating the vision\nmodel into LLMs to build vision-language foundation models has gained much more\ninterest recently. However, existing LLM-based large multimodal models (e.g.,\nVideo-LLaMA, VideoChat) can only take in a limited number of frames for short\nvideo understanding. In this study, we mainly focus on designing an efficient\nand effective model for long-term video understanding. Instead of trying to\nprocess more frames simultaneously like most existing work, we propose to\nprocess videos in an online manner and store past video information in a memory\nbank. This allows our model to reference historical video content for long-term\nanalysis without exceeding LLMs' context length constraints or GPU memory\nlimits. Our memory bank can be seamlessly integrated into current multimodal\nLLMs in an off-the-shelf manner. We conduct extensive experiments on various\nvideo understanding tasks, such as long-video understanding, video question\nanswering, and video captioning, and our model can achieve state-of-the-art\nperformances across multiple datasets. Code available at\nhttps://boheumd.github.io/MA-LMM/.\n","authors":["Bo He","Hengduo Li","Young Kyun Jang","Menglin Jia","Xuefei Cao","Ashish Shah","Abhinav Shrivastava","Ser-Nam Lim"],"pdf_url":"https://arxiv.org/pdf/2404.05726v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.05719v1","updated":"2024-04-08T17:55:44Z","published":"2024-04-08T17:55:44Z","title":"Ferret-UI: Grounded Mobile UI Understanding with Multimodal LLMs","summary":" Recent advancements in multimodal large language models (MLLMs) have been\nnoteworthy, yet, these general-domain MLLMs often fall short in their ability\nto comprehend and interact effectively with user interface (UI) screens. In\nthis paper, we present Ferret-UI, a new MLLM tailored for enhanced\nunderstanding of mobile UI screens, equipped with referring, grounding, and\nreasoning capabilities. Given that UI screens typically exhibit a more\nelongated aspect ratio and contain smaller objects of interest (e.g., icons,\ntexts) than natural images, we incorporate \"any resolution\" on top of Ferret to\nmagnify details and leverage enhanced visual features. Specifically, each\nscreen is divided into 2 sub-images based on the original aspect ratio (i.e.,\nhorizontal division for portrait screens and vertical division for landscape\nscreens). 
Both sub-images are encoded separately before being sent to LLMs. We\nmeticulously gather training samples from an extensive range of elementary UI\ntasks, such as icon recognition, find text, and widget listing. These samples\nare formatted for instruction-following with region annotations to facilitate\nprecise referring and grounding. To augment the model's reasoning ability, we\nfurther compile a dataset for advanced tasks, including detailed description,\nperception/interaction conversations, and function inference. After training on\nthe curated datasets, Ferret-UI exhibits outstanding comprehension of UI\nscreens and the capability to execute open-ended instructions. For model\nevaluation, we establish a comprehensive benchmark encompassing all the\naforementioned tasks. Ferret-UI excels not only beyond most open-source UI\nMLLMs, but also surpasses GPT-4V on all the elementary UI tasks.\n","authors":["Keen You","Haotian Zhang","Eldon Schoop","Floris Weers","Amanda Swearngin","Jeffrey Nichols","Yinfei Yang","Zhe Gan"],"pdf_url":"https://arxiv.org/pdf/2404.05719v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05717v1","updated":"2024-04-08T17:52:29Z","published":"2024-04-08T17:52:29Z","title":"SwapAnything: Enabling Arbitrary Object Swapping in Personalized Visual\n Editing","summary":" Effective editing of personal content holds a pivotal role in enabling\nindividuals to express their creativity, weaving captivating narratives within\ntheir visual stories, and elevate the overall quality and impact of their\nvisual content. Therefore, in this work, we introduce SwapAnything, a novel\nframework that can swap any objects in an image with personalized concepts\ngiven by the reference, while keeping the context unchanged. Compared with\nexisting methods for personalized subject swapping, SwapAnything has three\nunique advantages: (1) precise control of arbitrary objects and parts rather\nthan the main subject, (2) more faithful preservation of context pixels, (3)\nbetter adaptation of the personalized concept to the image. First, we propose\ntargeted variable swapping to apply region control over latent feature maps and\nswap masked variables for faithful context preservation and initial semantic\nconcept swapping. Then, we introduce appearance adaptation, to seamlessly adapt\nthe semantic concept into the original image in terms of target location,\nshape, style, and content during the image generation process. Extensive\nresults on both human and automatic evaluation demonstrate significant\nimprovements of our approach over baseline methods on personalized swapping.\nFurthermore, SwapAnything shows its precise and faithful swapping abilities\nacross single object, multiple objects, partial object, and cross-domain\nswapping tasks. SwapAnything also achieves great performance on text-based\nswapping and tasks beyond swapping such as object insertion.\n","authors":["Jing Gu","Yilin Wang","Nanxuan Zhao","Wei Xiong","Qing Liu","Zhifei Zhang","He Zhang","Jianming Zhang","HyunJoon Jung","Xin Eric Wang"],"pdf_url":"https://arxiv.org/pdf/2404.05717v1.pdf","comment":"18 pages, 16 figures, 3 tables"},{"id":"http://arxiv.org/abs/2311.04071v4","updated":"2024-04-08T17:49:58Z","published":"2023-11-07T15:35:56Z","title":"Energy-Calibrated VAE with Test Time Free Lunch","summary":" In this paper, we propose a novel generative model that utilizes a\nconditional Energy-Based Model (EBM) for enhancing Variational Autoencoder\n(VAE), termed Energy-Calibrated VAE (EC-VAE). 
Specifically, VAEs often suffer\nfrom blurry generated samples due to the lack of a tailored training on the\nsamples generated in the generative direction. On the other hand, EBMs can\ngenerate high-quality samples but require expensive Markov Chain Monte Carlo\n(MCMC) sampling. To address these issues, we introduce a conditional EBM for\ncalibrating the generative direction of VAE during training, without requiring\nit for the generation at test time. In particular, we train EC-VAE upon both\nthe input data and the calibrated samples with adaptive weight to enhance\nefficacy while avoiding MCMC sampling at test time. Furthermore, we extend the\ncalibration idea of EC-VAE to variational learning and normalizing flows, and\napply EC-VAE to an additional application of zero-shot image restoration via\nneural transport prior and range-null theory. We evaluate the proposed method\nwith two applications, including image generation and zero-shot image\nrestoration, and the experimental results show that our method achieves\ncompetitive performance over single-step non-adversarial generation. Our code\nis available at https://github.com/DJ-LYH/EC-VAE.\n","authors":["Yihong Luo","Siya Qiu","Xingjian Tao","Yujun Cai","Jing Tang"],"pdf_url":"https://arxiv.org/pdf/2311.04071v4.pdf","comment":"Revision. Code is available at https://github.com/DJ-LYH/EC-VAE"},{"id":"http://arxiv.org/abs/2404.05705v1","updated":"2024-04-08T17:42:08Z","published":"2024-04-08T17:42:08Z","title":"Learning 3D-Aware GANs from Unposed Images with Template Feature Field","summary":" Collecting accurate camera poses of training images has been shown to well\nserve the learning of 3D-aware generative adversarial networks (GANs) yet can\nbe quite expensive in practice. This work targets learning 3D-aware GANs from\nunposed images, for which we propose to perform on-the-fly pose estimation of\ntraining images with a learned template feature field (TeFF). Concretely, in\naddition to a generative radiance field as in previous approaches, we ask the\ngenerator to also learn a field from 2D semantic features while sharing the\ndensity from the radiance field. Such a framework allows us to acquire a\ncanonical 3D feature template leveraging the dataset mean discovered by the\ngenerative model, and further efficiently estimate the pose parameters on real\ndata. Experimental results on various challenging datasets demonstrate the\nsuperiority of our approach over state-of-the-art alternatives from both the\nqualitative and the quantitative perspectives.\n","authors":["Xinya Chen","Hanlei Guo","Yanrui Bin","Shangzhan Zhang","Yuanbo Yang","Yue Wang","Yujun Shen","Yiyi Liao"],"pdf_url":"https://arxiv.org/pdf/2404.05705v1.pdf","comment":"https://XDimlab.github.io/TeFF"},{"id":"http://arxiv.org/abs/2404.05693v1","updated":"2024-04-08T17:18:30Z","published":"2024-04-08T17:18:30Z","title":"Evaluating the Efficacy of Cut-and-Paste Data Augmentation in Semantic\n Segmentation for Satellite Imagery","summary":" Satellite imagery is crucial for tasks like environmental monitoring and\nurban planning. Typically, it relies on semantic segmentation or Land Use Land\nCover (LULC) classification to categorize each pixel. Despite the advancements\nbrought about by Deep Neural Networks (DNNs), their performance in segmentation\ntasks is hindered by challenges such as limited availability of labeled data,\nclass imbalance and the inherent variability and complexity of satellite\nimages. 
In order to mitigate those issues, our study explores the effectiveness\nof a Cut-and-Paste augmentation technique for semantic segmentation in\nsatellite images. We adapt this augmentation, which usually requires labeled\ninstances, to the case of semantic segmentation. By leveraging the connected\ncomponents in the semantic segmentation labels, we extract instances that are\nthen randomly pasted during training. Using the DynamicEarthNet dataset and a\nU-Net model for evaluation, we found that this augmentation significantly\nenhances the mIoU score on the test set from 37.9 to 44.1. This finding\nhighlights the potential of the Cut-and-Paste augmentation to improve the\ngeneralization capabilities of semantic segmentation models in satellite\nimagery.\n","authors":["Ionut M. Motoi","Leonardo Saraceni","Daniele Nardi","Thomas A. Ciarfuglia"],"pdf_url":"https://arxiv.org/pdf/2404.05693v1.pdf","comment":"Accepted for publication in IEEE 2024 International Geoscience &\n Remote Sensing Symposium (IGARSS 2024)"},{"id":"http://arxiv.org/abs/2404.05687v1","updated":"2024-04-08T17:10:45Z","published":"2024-04-08T17:10:45Z","title":"Retrieval-Augmented Open-Vocabulary Object Detection","summary":" Open-vocabulary object detection (OVD) has been studied with Vision-Language\nModels (VLMs) to detect novel objects beyond the pre-trained categories.\nPrevious approaches improve the generalization ability to expand the knowledge\nof the detector, using 'positive' pseudo-labels with additional 'class' names,\ne.g., sock, iPod, and alligator. To extend the previous methods in two aspects,\nwe propose Retrieval-Augmented Losses and visual Features (RALF). Our method\nretrieves related 'negative' classes and augments loss functions. Also, visual\nfeatures are augmented with 'verbalized concepts' of classes, e.g., worn on the\nfeet, handheld music player, and sharp teeth. Specifically, RALF consists of\ntwo modules: Retrieval Augmented Losses (RAL) and Retrieval-Augmented visual\nFeatures (RAF). RAL constitutes two losses reflecting the semantic similarity\nwith negative vocabularies. In addition, RAF augments visual features with the\nverbalized concepts from a large language model (LLM). Our experiments\ndemonstrate the effectiveness of RALF on COCO and LVIS benchmark datasets. We\nachieve improvement up to 3.4 box AP$_{50}^{\\text{N}}$ on novel categories of\nthe COCO dataset and 3.6 mask AP$_{\\text{r}}$ gains on the LVIS dataset. Code\nis available at https://github.com/mlvlab/RALF .\n","authors":["Jooyeon Kim","Eulrang Cho","Sehyung Kim","Hyunwoo J. Kim"],"pdf_url":"https://arxiv.org/pdf/2404.05687v1.pdf","comment":"Accepted paper at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.05680v1","updated":"2024-04-08T16:58:31Z","published":"2024-04-08T16:58:31Z","title":"SphereHead: Stable 3D Full-head Synthesis with Spherical Tri-plane\n Representation","summary":" While recent advances in 3D-aware Generative Adversarial Networks (GANs) have\naided the development of near-frontal view human face synthesis, the challenge\nof comprehensively synthesizing a full 3D head viewable from all angles still\npersists. Although PanoHead proves the possibilities of using a large-scale\ndataset with images of both frontal and back views for full-head synthesis, it\noften causes artifacts for back views. Based on our in-depth analysis, we found\nthe reasons are mainly twofold. 
First, from network architecture perspective,\nwe found each plane in the utilized tri-plane/tri-grid representation space\ntends to confuse the features from both sides, causing \"mirroring\" artifacts\n(e.g., the glasses appear in the back). Second, from data supervision aspect,\nwe found that existing discriminator training in 3D GANs mainly focuses on the\nquality of the rendered image itself, and does not care much about its\nplausibility with the perspective from which it was rendered. This makes it\npossible to generate \"face\" in non-frontal views, due to its easiness to fool\nthe discriminator. In response, we propose SphereHead, a novel tri-plane\nrepresentation in the spherical coordinate system that fits the human head's\ngeometric characteristics and efficiently mitigates many of the generated\nartifacts. We further introduce a view-image consistency loss for the\ndiscriminator to emphasize the correspondence of the camera parameters and the\nimages. The combination of these efforts results in visually superior outcomes\nwith significantly fewer artifacts. Our code and dataset are publicly available\nat https://lhyfst.github.io/spherehead.\n","authors":["Heyuan Li","Ce Chen","Tianhao Shi","Yuda Qiu","Sizhe An","Guanying Chen","Xiaoguang Han"],"pdf_url":"https://arxiv.org/pdf/2404.05680v1.pdf","comment":"project page: https://lhyfst.github.io/spherehead"},{"id":"http://arxiv.org/abs/2312.07425v2","updated":"2024-04-08T16:56:17Z","published":"2023-12-12T16:48:53Z","title":"Deep Internal Learning: Deep Learning from a Single Input","summary":" Deep learning, in general, focuses on training a neural network from large\nlabeled datasets. Yet, in many cases there is value in training a network just\nfrom the input at hand. This is particularly relevant in many signal and image\nprocessing problems where training data is scarce and diversity is large on the\none hand, and on the other, there is a lot of structure in the data that can be\nexploited. Using this information is the key to deep internal-learning\nstrategies, which may involve training a network from scratch using a single\ninput or adapting an already trained network to a provided input example at\ninference time. This survey paper aims at covering deep internal-learning\ntechniques that have been proposed in the past few years for these two\nimportant directions. While our main focus will be on image processing\nproblems, most of the approaches that we survey are derived for general signals\n(vectors with recurring patterns that can be distinguished from noise) and are\ntherefore applicable to other modalities.\n","authors":["Tom Tirer","Raja Giryes","Se Young Chun","Yonina C. Eldar"],"pdf_url":"https://arxiv.org/pdf/2312.07425v2.pdf","comment":"Accepted to IEEE Signal Processing Magazine"},{"id":"http://arxiv.org/abs/2404.05675v1","updated":"2024-04-08T16:56:05Z","published":"2024-04-08T16:56:05Z","title":"Normalizing Flows on the Product Space of SO(3) Manifolds for\n Probabilistic Human Pose Modeling","summary":" Normalizing flows have proven their efficacy for density estimation in\nEuclidean space, but their application to rotational representations, crucial\nin various domains such as robotics or human pose modeling, remains\nunderexplored. Probabilistic models of the human pose can benefit from\napproaches that rigorously consider the rotational nature of human joints. 
For\nthis purpose, we introduce HuProSO3, a normalizing flow model that operates on\na high-dimensional product space of SO(3) manifolds, modeling the joint\ndistribution for human joints with three degrees of freedom. HuProSO3's\nadvantage over state-of-the-art approaches is demonstrated through its superior\nmodeling accuracy in three different applications and its capability to\nevaluate the exact likelihood. This work not only addresses the technical\nchallenge of learning densities on SO(3) manifolds, but it also has broader\nimplications for domains where the probabilistic regression of correlated 3D\nrotations is of importance.\n","authors":["Olaf Dünkel","Tim Salzmann","Florian Pfaff"],"pdf_url":"https://arxiv.org/pdf/2404.05675v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.05674v1","updated":"2024-04-08T16:55:49Z","published":"2024-04-08T16:55:49Z","title":"MoMA: Multimodal LLM Adapter for Fast Personalized Image Generation","summary":" In this paper, we present MoMA: an open-vocabulary, training-free\npersonalized image model that boasts flexible zero-shot capabilities. As\nfoundational text-to-image models rapidly evolve, the demand for robust\nimage-to-image translation grows. Addressing this need, MoMA specializes in\nsubject-driven personalized image generation. Utilizing an open-source,\nMultimodal Large Language Model (MLLM), we train MoMA to serve a dual role as\nboth a feature extractor and a generator. This approach effectively synergizes\nreference image and text prompt information to produce valuable image features,\nfacilitating an image diffusion model. To better leverage the generated\nfeatures, we further introduce a novel self-attention shortcut method that\nefficiently transfers image features to an image diffusion model, improving the\nresemblance of the target object in generated images. Remarkably, as a\ntuning-free plug-and-play module, our model requires only a single reference\nimage and outperforms existing methods in generating images with high detail\nfidelity, enhanced identity-preservation and prompt faithfulness. Our work is\nopen-source, thereby providing universal access to these advancements.\n","authors":["Kunpeng Song","Yizhe Zhu","Bingchen Liu","Qing Yan","Ahmed Elgammal","Xiao Yang"],"pdf_url":"https://arxiv.org/pdf/2404.05674v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05673v1","updated":"2024-04-08T16:55:39Z","published":"2024-04-08T16:55:39Z","title":"CoReS: Orchestrating the Dance of Reasoning and Segmentation","summary":" The reasoning segmentation task, which demands a nuanced comprehension of\nintricate queries to accurately pinpoint object regions, is attracting\nincreasing attention. However, Multi-modal Large Language Models (MLLM) often\nfind it difficult to accurately localize the objects described in complex\nreasoning contexts. We believe that the act of reasoning segmentation should\nmirror the cognitive stages of human visual search, where each step is a\nprogressive refinement of thought toward the final object. Thus we introduce\nthe Chains of Reasoning and Segmenting (CoReS) and find this top-down visual\nhierarchy indeed enhances the visual search process. Specifically, we propose a\ndual-chain structure that generates multi-modal, chain-like outputs to aid the\nsegmentation process. Furthermore, to steer the MLLM's outputs into this\nintended hierarchy, we incorporate in-context inputs as guidance. 
Extensive\nexperiments demonstrate the superior performance of our CoReS, which surpasses\nthe state-of-the-art method by 7.1\\% on the ReasonSeg dataset. The code will be\nreleased at https://github.com/baoxiaoyi/CoReS.\n","authors":["Xiaoyi Bao","Siyang Sun","Shuailei Ma","Kecheng Zheng","Yuxin Guo","Guosheng Zhao","Yun Zheng","Xingang Wang"],"pdf_url":"https://arxiv.org/pdf/2404.05673v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05669v1","updated":"2024-04-08T16:52:21Z","published":"2024-04-08T16:52:21Z","title":"NAF-DPM: A Nonlinear Activation-Free Diffusion Probabilistic Model for\n Document Enhancement","summary":" Real-world documents may suffer various forms of degradation, often resulting\nin lower accuracy in optical character recognition (OCR) systems. Therefore, a\ncrucial preprocessing step is essential to eliminate noise while preserving\ntext and key features of documents. In this paper, we propose NAF-DPM, a novel\ngenerative framework based on a diffusion probabilistic model (DPM) designed to\nrestore the original quality of degraded documents. While DPMs are recognized\nfor their high-quality generated images, they are also known for their large\ninference time. To mitigate this problem we provide the DPM with an efficient\nnonlinear activation-free (NAF) network and we employ as a sampler a fast\nsolver of ordinary differential equations, which can converge in a few\niterations. To better preserve text characters, we introduce an additional\ndifferentiable module based on convolutional recurrent neural networks,\nsimulating the behavior of an OCR system during training. Experiments conducted\non various datasets showcase the superiority of our approach, achieving\nstate-of-the-art performance in terms of pixel-level and perceptual similarity\nmetrics. Furthermore, the results demonstrate a notable character error\nreduction made by OCR systems when transcribing real-world document images\nenhanced by our framework. Code and pre-trained models are available at\nhttps://github.com/ispamm/NAF-DPM.\n","authors":["Giordano Cicchetti","Danilo Comminiello"],"pdf_url":"https://arxiv.org/pdf/2404.05669v1.pdf","comment":"Under review at IEEE Transactions on Pattern Analysis and Machine\n Intelligence"},{"id":"http://arxiv.org/abs/2404.05667v1","updated":"2024-04-08T16:51:33Z","published":"2024-04-08T16:51:33Z","title":"AlignZeg: Mitigating Objective Misalignment for Zero-shot Semantic\n Segmentation","summary":" A serious issue that harms the performance of zero-shot visual recognition is\nnamed objective misalignment, i.e., the learning objective prioritizes\nimproving the recognition accuracy of seen classes rather than unseen classes,\nwhile the latter is the true target to pursue. This issue becomes more\nsignificant in zero-shot image segmentation because the stronger (i.e.,\npixel-level) supervision brings a larger gap between seen and unseen classes.\nTo mitigate it, we propose a novel architecture named AlignZeg, which embodies\na comprehensive improvement of the segmentation pipeline, including proposal\nextraction, classification, and correction, to better fit the goal of zero-shot\nsegmentation. (1) Mutually-Refined Proposal Extraction. AlignZeg harnesses a\nmutual interaction between mask queries and visual features, facilitating\ndetailed class-agnostic mask proposal extraction. (2) Generalization-Enhanced\nProposal Classification. 
AlignZeg introduces synthetic data and incorporates\nmultiple background prototypes to allocate a more generalizable feature space.\n(3) Predictive Bias Correction. During the inference stage, AlignZeg uses a\nclass indicator to find potential unseen class proposals followed by a\nprediction postprocess to correct the prediction bias. Experiments demonstrate\nthat AlignZeg markedly enhances zero-shot semantic segmentation, as shown by an\naverage 3.8% increase in hIoU, primarily attributed to a 7.1% improvement in\nidentifying unseen classes, and we further validate that the improvement comes\nfrom alleviating the objective misalignment issue.\n","authors":["Jiannan Ge","Lingxi Xie","Hongtao Xie","Pandeng Li","Xiaopeng Zhang","Yongdong Zhang","Qi Tian"],"pdf_url":"https://arxiv.org/pdf/2404.05667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05666v1","updated":"2024-04-08T16:51:19Z","published":"2024-04-08T16:51:19Z","title":"YaART: Yet Another ART Rendering Technology","summary":" In the rapidly progressing field of generative models, the development of\nefficient and high-fidelity text-to-image diffusion systems represents a\nsignificant frontier. This study introduces YaART, a novel production-grade\ntext-to-image cascaded diffusion model aligned to human preferences using\nReinforcement Learning from Human Feedback (RLHF). During the development of\nYaART, we especially focus on the choices of the model and training dataset\nsizes, the aspects that were not systematically investigated for text-to-image\ncascaded diffusion models before. In particular, we comprehensively analyze how\nthese choices affect both the efficiency of the training process and the\nquality of the generated images, which are highly important in practice.\nFurthermore, we demonstrate that models trained on smaller datasets of\nhigher-quality images can successfully compete with those trained on larger\ndatasets, establishing a more efficient scenario of diffusion models training.\nFrom the quality perspective, YaART is consistently preferred by users over\nmany existing state-of-the-art models.\n","authors":["Sergey Kastryulin","Artem Konev","Alexander Shishenya","Eugene Lyapustin","Artem Khurshudov","Alexander Tselousov","Nikita Vinokurov","Denis Kuznedelev","Alexander Markovich","Grigoriy Livshits","Alexey Kirillov","Anastasiia Tabisheva","Liubov Chubarova","Marina Kaminskaia","Alexander Ustyuzhanin","Artemii Shvetsov","Daniil Shlenskii","Valerii Startsev","Dmitrii Kornilov","Mikhail Romanov","Artem Babenko","Sergei Ovcharenko","Valentin Khrulkov"],"pdf_url":"https://arxiv.org/pdf/2404.05666v1.pdf","comment":"Prompts and additional information are available on the project page,\n see https://ya.ru/ai/art/paper-yaart-v1"},{"id":"http://arxiv.org/abs/2404.05662v1","updated":"2024-04-08T16:46:25Z","published":"2024-04-08T16:46:25Z","title":"BinaryDM: Towards Accurate Binarization of Diffusion Model","summary":" With the advancement of diffusion models (DMs) and the substantially\nincreased computational requirements, quantization emerges as a practical\nsolution to obtain compact and efficient low-bit DMs. However, the highly\ndiscrete representation leads to severe accuracy degradation, hindering the\nquantization of diffusion models to ultra-low bit-widths. In this paper, we\npropose BinaryDM, a novel accurate quantization-aware training approach to push\nthe weights of diffusion models towards the limit of 1-bit. 
Firstly, we present\na Learnable Multi-basis Binarizer (LMB) to recover the representations\ngenerated by the binarized DM, which improves the information in details of\nrepresentations crucial to the DM. Secondly, a Low-rank Representation\nMimicking (LRM) is applied to enhance the binarization-aware optimization of\nthe DM, alleviating the optimization direction ambiguity caused by fine-grained\nalignment. Moreover, a progressive initialization strategy is applied to\ntraining DMs to avoid convergence difficulties. Comprehensive experiments\ndemonstrate that BinaryDM achieves significant accuracy and efficiency gains\ncompared to SOTA quantization methods of DMs under ultra-low bit-widths. As the\nfirst binarization method for diffusion models, BinaryDM achieves impressive\n16.0 times FLOPs and 27.1 times storage savings with 1-bit weight and 4-bit\nactivation, showcasing its substantial advantages and potential for deploying\nDMs on resource-limited scenarios.\n","authors":["Xingyu Zheng","Haotong Qin","Xudong Ma","Mingyuan Zhang","Haojie Hao","Jiakai Wang","Zixiang Zhao","Jinyang Guo","Xianglong Liu"],"pdf_url":"https://arxiv.org/pdf/2404.05662v1.pdf","comment":"The code will soon be available at\n https://github.com/Xingyu-Zheng/BinaryDM"},{"id":"http://arxiv.org/abs/2404.05661v1","updated":"2024-04-08T16:46:07Z","published":"2024-04-08T16:46:07Z","title":"Automatic Controllable Colorization via Imagination","summary":" We propose a framework for automatic colorization that allows for iterative\nediting and modifications. The core of our framework lies in an imagination\nmodule: by understanding the content within a grayscale image, we utilize a\npre-trained image generation model to generate multiple images that contain the\nsame content. These images serve as references for coloring, mimicking the\nprocess of human experts. As the synthesized images can be imperfect or\ndifferent from the original grayscale image, we propose a Reference Refinement\nModule to select the optimal reference composition. Unlike most previous\nend-to-end automatic colorization algorithms, our framework allows for\niterative and localized modifications of the colorization results because we\nexplicitly model the coloring samples. Extensive experiments demonstrate the\nsuperiority of our framework over existing automatic colorization algorithms in\neditability and flexibility. Project page:\nhttps://xy-cong.github.io/imagine-colorization.\n","authors":["Xiaoyan Cong","Yue Wu","Qifeng Chen","Chenyang Lei"],"pdf_url":"https://arxiv.org/pdf/2404.05661v1.pdf","comment":"CVPR 2024. Project page:\n https://xy-cong.github.io/imagine-colorization"},{"id":"http://arxiv.org/abs/2404.05657v1","updated":"2024-04-08T16:40:15Z","published":"2024-04-08T16:40:15Z","title":"MLP Can Be A Good Transformer Learner","summary":" Self-attention mechanism is the key of the Transformer but often criticized\nfor its computation demands. Previous token pruning works motivate their\nmethods from the view of computation redundancy but still need to load the full\nnetwork and require same memory costs. This paper introduces a novel strategy\nthat simplifies vision transformers and reduces computational load through the\nselective removal of non-essential attention layers, guided by entropy\nconsiderations. We identify that regarding the attention layer in bottom\nblocks, their subsequent MLP layers, i.e. two feed-forward layers, can elicit\nthe same entropy quantity. 
Meanwhile, the accompanying MLPs are under-exploited\nsince they exhibit smaller feature entropy compared to those MLPs in the top\nblocks. Therefore, we propose to integrate the uninformative attention layers\ninto their subsequent counterparts by degenerating them into identical mapping,\nyielding only MLP in certain transformer blocks. Experimental results on\nImageNet-1k show that the proposed method can remove 40% of the attention layers of\nDeiT-B, improving throughput and memory bound without performance compromise.\nCode is available at https://github.com/sihaoevery/lambda_vit.\n","authors":["Sihao Lin","Pumeng Lyu","Dongrui Liu","Tao Tang","Xiaodan Liang","Andy Song","Xiaojun Chang"],"pdf_url":"https://arxiv.org/pdf/2404.05657v1.pdf","comment":"efficient transformer"},{"id":"http://arxiv.org/abs/2404.05641v1","updated":"2024-04-08T16:21:22Z","published":"2024-04-08T16:21:22Z","title":"3D-COCO: extension of MS-COCO dataset for image detection and 3D\n reconstruction modules","summary":" We introduce 3D-COCO, an extension of the original MS-COCO dataset providing\n3D models and 2D-3D alignment annotations. 3D-COCO was designed to achieve\ncomputer vision tasks such as 3D reconstruction or image detection configurable\nwith textual, 2D image, and 3D CAD model queries. We complete the existing\nMS-COCO dataset with 28K 3D models collected on ShapeNet and Objaverse. By\nusing an IoU-based method, we match each MS-COCO annotation with the best 3D\nmodels to provide a 2D-3D alignment. The open-source nature of 3D-COCO is a\npremiere that should pave the way for new research on 3D-related topics. The\ndataset and its source code are available at\nhttps://kalisteo.cea.fr/index.php/coco3d-object-detection-and-reconstruction/\n","authors":["Maxence Bideaux","Alice Phe","Mohamed Chaouch","Bertrand Luvison","Quoc-Cuong Pham"],"pdf_url":"https://arxiv.org/pdf/2404.05641v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06908v2","updated":"2024-04-08T16:16:56Z","published":"2024-03-11T17:00:27Z","title":"FreGS: 3D Gaussian Splatting with Progressive Frequency Regularization","summary":" 3D Gaussian splatting has achieved very impressive performance in real-time\nnovel view synthesis. However, it often suffers from over-reconstruction during\nGaussian densification where high-variance image regions are covered by a few\nlarge Gaussians only, leading to blur and artifacts in the rendered images. We\ndesign a progressive frequency regularization (FreGS) technique to tackle the\nover-reconstruction issue within the frequency space. Specifically, FreGS\nperforms coarse-to-fine Gaussian densification by exploiting low-to-high\nfrequency components that can be easily extracted with low-pass and high-pass\nfilters in the Fourier space. By minimizing the discrepancy between the\nfrequency spectrum of the rendered image and the corresponding ground truth, it\nachieves high-quality Gaussian densification and alleviates the\nover-reconstruction of Gaussian splatting effectively. Experiments over\nmultiple widely adopted benchmarks (e.g., Mip-NeRF360, Tanks-and-Temples and\nDeep Blending) show that FreGS achieves superior novel view synthesis and\noutperforms the state-of-the-art consistently.\n","authors":["Jiahui Zhang","Fangneng Zhan","Muyu Xu","Shijian Lu","Eric Xing"],"pdf_url":"https://arxiv.org/pdf/2403.06908v2.pdf","comment":"Accepted by CVPR 2024. 
Project website:\n https://rogeraigc.github.io/FreGS-Page/"},{"id":"http://arxiv.org/abs/2403.15238v2","updated":"2024-04-08T16:14:45Z","published":"2024-03-22T14:32:02Z","title":"WEEP: A method for spatial interpretation of weakly supervised CNN\n models in computational pathology","summary":" Deep learning enables the modelling of high-resolution histopathology\nwhole-slide images (WSI). Weakly supervised learning of tile-level data is\ntypically applied for tasks where labels only exist on the patient or WSI level\n(e.g. patient outcomes or histological grading). In this context, there is a\nneed for improved spatial interpretability of predictions from such models. We\npropose a novel method, Wsi rEgion sElection aPproach (WEEP), for model\ninterpretation. It provides a principled yet straightforward way to establish\nthe spatial area of WSI required for assigning a particular prediction label.\nWe demonstrate WEEP on a binary classification task in the area of breast\ncancer computational pathology. WEEP is easy to implement, is directly\nconnected to the model-based decision process, and offers information relevant\nto both research and diagnostic applications.\n","authors":["Abhinav Sharma","Bojing Liu","Mattias Rantalainen"],"pdf_url":"https://arxiv.org/pdf/2403.15238v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05626v1","updated":"2024-04-08T15:59:29Z","published":"2024-04-08T15:59:29Z","title":"Learning a Category-level Object Pose Estimator without Pose Annotations","summary":" 3D object pose estimation is a challenging task. Previous works always\nrequire thousands of object images with annotated poses for learning the 3D\npose correspondence, which is laborious and time-consuming for labeling. In\nthis paper, we propose to learn a category-level 3D object pose estimator\nwithout pose annotations. Instead of using manually annotated images, we\nleverage diffusion models (e.g., Zero-1-to-3) to generate a set of images under\ncontrolled pose differences and propose to learn our object pose estimator with\nthose images. Directly using the original diffusion model leads to images with\nnoisy poses and artifacts. To tackle this issue, firstly, we exploit an image\nencoder, which is learned from a specially designed contrastive pose learning,\nto filter the unreasonable details and extract image feature maps.\nAdditionally, we propose a novel learning strategy that allows the model to\nlearn object poses from those generated image sets without knowing the\nalignment of their canonical poses. Experimental results show that our method\nhas the capability of category-level object pose estimation from a single shot\nsetting (as pose definition), while significantly outperforming other\nstate-of-the-art methods on the few-shot category-level object pose estimation\nbenchmarks.\n","authors":["Fengrui Tian","Yaoyao Liu","Adam Kortylewski","Yueqi Duan","Shaoyi Du","Alan Yuille","Angtian Wang"],"pdf_url":"https://arxiv.org/pdf/2404.05626v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05621v1","updated":"2024-04-08T15:51:21Z","published":"2024-04-08T15:51:21Z","title":"MULTIFLOW: Shifting Towards Task-Agnostic Vision-Language Pruning","summary":" While excellent in transfer learning, Vision-Language models (VLMs) come with\nhigh computational costs due to their large number of parameters. 
To address\nthis issue, removing parameters via model pruning is a viable solution.\nHowever, existing techniques for VLMs are task-specific, and thus require\npruning the network from scratch for each new task of interest. In this work,\nwe explore a new direction: Task-Agnostic Vision-Language Pruning (TA-VLP).\nGiven a pretrained VLM, the goal is to find a unique pruned counterpart\ntransferable to multiple unknown downstream tasks. In this challenging setting,\nthe transferable representations already encoded in the pretrained model are a\nkey aspect to preserve. Thus, we propose Multimodal Flow Pruning (MULTIFLOW), a\nfirst, gradient-free, pruning framework for TA-VLP where: (i) the importance of\na parameter is expressed in terms of its magnitude and its information flow, by\nincorporating the saliency of the neurons it connects; and (ii) pruning is\ndriven by the emergent (multimodal) distribution of the VLM parameters after\npretraining. We benchmark eight state-of-the-art pruning algorithms in the\ncontext of TA-VLP, experimenting with two VLMs, three vision-language tasks,\nand three pruning ratios. Our experimental results show that MULTIFLOW\noutperforms recent sophisticated, combinatorial competitors in the vast\nmajority of the cases, paving the way towards addressing TA-VLP. The code is\npublicly available at https://github.com/FarinaMatteo/multiflow.\n","authors":["Matteo Farina","Massimiliano Mancini","Elia Cunegatti","Gaowen Liu","Giovanni Iacca","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2404.05621v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2302.08274v3","updated":"2024-04-08T15:48:50Z","published":"2023-02-16T13:06:39Z","title":"Robust Human Motion Forecasting using Transformer-based Model","summary":" Comprehending human motion is a fundamental challenge for developing\nHuman-Robot Collaborative applications. Computer vision researchers have\naddressed this field by only focusing on reducing error in predictions, but not\ntaking into account the requirements to facilitate its implementation in\nrobots. In this paper, we propose a new model based on Transformer that\nsimultaneously deals with real-time 3D human motion forecasting in the\nshort and long term. Our 2-Channel Transformer (2CH-TR) is able to efficiently\nexploit the spatio-temporal information of a shortly observed sequence (400ms)\nand generates a competitive accuracy against the current state-of-the-art.\n2CH-TR stands out for the efficient performance of the Transformer, being\nlighter and faster than its competitors. In addition, our model is tested in\nconditions where the human motion is severely occluded, demonstrating its\nrobustness in reconstructing and predicting 3D human motion in a highly noisy\nenvironment. Our experiment results show that the proposed 2CH-TR outperforms\nthe ST-Transformer, which is another state-of-the-art model based on the\nTransformer, in terms of reconstruction and prediction under the same\nconditions of input prefix. Our model reduces the mean squared error\nof ST-Transformer by 8.89% in short-term prediction and by 2.57% in long-term prediction\non the Human3.6M dataset with a 400ms input prefix. Webpage:\nhttps://evm7.github.io/2CHTR-page/\n","authors":["Esteve Valls Mascaro","Shuo Ma","Hyemin Ahn","Dongheui Lee"],"pdf_url":"https://arxiv.org/pdf/2302.08274v3.pdf","comment":"Accepted to IROS2022. 
Webpage: https://evm7.github.io/2CHTR-page/"},{"id":"http://arxiv.org/abs/2308.07301v2","updated":"2024-04-08T15:47:20Z","published":"2023-08-14T17:39:44Z","title":"A Unified Masked Autoencoder with Patchified Skeletons for Motion\n Synthesis","summary":" The synthesis of human motion has traditionally been addressed through\ntask-dependent models that focus on specific challenges, such as predicting\nfuture motions or filling in intermediate poses conditioned on known key-poses.\nIn this paper, we present a novel task-independent model called UNIMASK-M,\nwhich can effectively address these challenges using a unified architecture.\nOur model obtains comparable or better performance than the state-of-the-art in\neach field. Inspired by Vision Transformers (ViTs), our UNIMASK-M model\ndecomposes a human pose into body parts to leverage the spatio-temporal\nrelationships existing in human motion. Moreover, we reformulate various\npose-conditioned motion synthesis tasks as a reconstruction problem with\ndifferent masking patterns given as input. By explicitly informing our model\nabout the masked joints, our UNIMASK-M becomes more robust to occlusions.\nExperimental results show that our model successfully forecasts human motion on\nthe Human3.6M dataset. Moreover, it achieves state-of-the-art results in motion\ninbetweening on the LaFAN1 dataset, particularly in long transition periods.\nMore information can be found on the project website\nhttps://evm7.github.io/UNIMASKM-page/\n","authors":["Esteve Valls Mascaro","Hyemin Ahn","Dongheui Lee"],"pdf_url":"https://arxiv.org/pdf/2308.07301v2.pdf","comment":"Accepted to AAAI2024. Webpage: https://evm7.github.io/UNIMASKM-page/"},{"id":"http://arxiv.org/abs/2309.16524v2","updated":"2024-04-08T15:46:09Z","published":"2023-09-28T15:34:49Z","title":"HOI4ABOT: Human-Object Interaction Anticipation for Human Intention\n Reading Collaborative roBOTs","summary":" Robots are becoming increasingly integrated into our lives, assisting us in\nvarious tasks. To ensure effective collaboration between humans and robots, it\nis essential that they understand our intentions and anticipate our actions. In\nthis paper, we propose a Human-Object Interaction (HOI) anticipation framework\nfor collaborative robots. We propose an efficient and robust transformer-based\nmodel to detect and anticipate HOIs from videos. This enhanced anticipation\nempowers robots to proactively assist humans, resulting in more efficient and\nintuitive collaborations. Our model outperforms state-of-the-art results in HOI\ndetection and anticipation in VidHOI dataset with an increase of 1.76% and\n1.04% in mAP respectively while being 15.4 times faster. We showcase the\neffectiveness of our approach through experimental results in a real robot,\ndemonstrating that the robot's ability to anticipate HOIs is key for better\nHuman-Robot Interaction. More information can be found on our project webpage:\nhttps://evm7.github.io/HOI4ABOT_page/\n","authors":["Esteve Valls Mascaro","Daniel Sliwowski","Dongheui Lee"],"pdf_url":"https://arxiv.org/pdf/2309.16524v2.pdf","comment":"Proceedings in Conference on Robot Learning 2023. 
Webpage:\n https://evm7.github.io/HOI4ABOT_page/"},{"id":"http://arxiv.org/abs/2402.04768v2","updated":"2024-04-08T15:43:14Z","published":"2024-02-07T11:37:14Z","title":"Robot Interaction Behavior Generation based on Social Motion Forecasting\n for Human-Robot Interaction","summary":" Integrating robots into populated environments is a complex challenge that\nrequires an understanding of human social dynamics. In this work, we propose to\nmodel social motion forecasting in a shared human-robot representation space,\nwhich enables us to synthesize robot motions that interact with humans in\nsocial scenarios despite not observing any robot in the motion training. We\ndevelop a transformer-based architecture called ECHO, which operates in the\naforementioned shared space to predict the future motions of the agents\nencountered in social scenarios. Contrary to prior works, we reformulate the\nsocial motion problem as the refinement of the predicted individual motions\nbased on the surrounding agents, which facilitates the training while allowing\nfor single-motion forecasting when only one human is in the scene. We evaluate\nour model in multi-person and human-robot motion forecasting tasks and obtain\nstate-of-the-art performance by a large margin while being efficient and\nperforming in real-time. Additionally, our qualitative results showcase the\neffectiveness of our approach in generating human-robot interaction behaviors\nthat can be controlled via text commands. Webpage: https://evm7.github.io/ECHO/\n","authors":["Esteve Valls Mascaro","Yashuai Yan","Dongheui Lee"],"pdf_url":"https://arxiv.org/pdf/2402.04768v2.pdf","comment":"Accepted at ICRA 2024. Webpage: https://evm7.github.io/ECHO/"},{"id":"http://arxiv.org/abs/2404.05607v1","updated":"2024-04-08T15:29:46Z","published":"2024-04-08T15:29:46Z","title":"A Training-Free Plug-and-Play Watermark Framework for Stable Diffusion","summary":" Nowadays, the family of Stable Diffusion (SD) models has gained prominence\nfor its high-quality outputs and scalability. This has also raised security\nconcerns on social media, as malicious users can create and disseminate harmful\ncontent. Existing approaches involve training components or entire SDs to embed\na watermark in generated images for traceability and responsibility\nattribution. However, in the era of AI-generated content (AIGC), the rapid\niteration of SDs renders retraining with watermark models costly. To address\nthis, we propose a training-free plug-and-play watermark framework for SDs.\nWithout modifying any components of SDs, we embed diverse watermarks in the\nlatent space, adapting to the denoising process. Our experimental findings\nreveal that our method effectively harmonizes image quality and watermark\ninvisibility. Furthermore, it performs robustly under various attacks. We have\nalso validated that our method generalizes to multiple versions of SDs, even\nwithout retraining the watermark model.\n","authors":["Guokai Zhang","Lanjun Wang","Yuting Su","An-An Liu"],"pdf_url":"https://arxiv.org/pdf/2404.05607v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05606v1","updated":"2024-04-08T15:25:50Z","published":"2024-04-08T15:25:50Z","title":"Learning Topology Uniformed Face Mesh by Volume Rendering for Multi-view\n Reconstruction","summary":" Face meshes in consistent topology serve as the foundation for many\nface-related applications, such as 3DMM constrained face reconstruction and\nexpression retargeting. 
Traditional methods commonly acquire topology uniformed\nface meshes by two separate steps: multi-view stereo (MVS) to reconstruct\nshapes followed by non-rigid registration to align topology, but struggle with\nhandling noise and non-Lambertian surfaces. Recently, neural volume rendering\ntechniques have rapidly evolved and shown great advantages in 3D\nreconstruction or novel view synthesis. Our goal is to leverage the superiority\nof neural volume rendering for multi-view reconstruction of face meshes with\nconsistent topology. We propose a mesh volume rendering method that enables\ndirectly optimizing mesh geometry while preserving topology, and learning\nimplicit features to model complex facial appearance from multi-view images.\nThe key innovation lies in spreading sparse mesh features into the surrounding\nspace to simulate the radiance field required for volume rendering, which\nfacilitates backpropagation of gradients from images to mesh geometry and\nimplicit appearance features. Our proposed feature spreading module exhibits\ndeformation invariance, enabling photorealistic rendering seamlessly after mesh\nediting. We conduct experiments on a multi-view face image dataset to evaluate\nthe reconstruction and implement an application for photorealistic rendering of\nanimated face meshes.\n","authors":["Yating Wang","Ran Yi","Ke Fan","Jinkun Hao","Jiangbo Lu","Lizhuang Ma"],"pdf_url":"https://arxiv.org/pdf/2404.05606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05603v1","updated":"2024-04-08T15:22:38Z","published":"2024-04-08T15:22:38Z","title":"Self-Explainable Affordance Learning with Embodied Caption","summary":" In the field of visual affordance learning, previous methods mainly used\nabundant images or videos that delineate human behavior patterns to identify\naction possibility regions for object manipulation, with a variety of\napplications in robotic tasks. However, they encounter a main challenge of\naction ambiguity, illustrated by the vagueness of whether to beat or carry a\ndrum, and the complexities involved in processing intricate scenes. Moreover,\nit is important for human intervention to rectify robot errors in time. To\naddress these issues, we introduce Self-Explainable Affordance learning (SEA)\nwith embodied caption. This innovation enables robots to articulate their\nintentions and bridge the gap between explainable vision-language caption and\nvisual affordance learning. Due to the lack of an appropriate dataset, we unveil a\npioneering dataset and metrics tailored for this task, which integrates images,\nheatmaps, and embodied captions. Furthermore, we propose a novel model to\neffectively combine affordance grounding with self-explanation in a simple but\nefficient manner. Extensive quantitative and qualitative experiments\ndemonstrate our method's effectiveness.\n","authors":["Zhipeng Zhang","Zhimin Wei","Guolei Sun","Peng Wang","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2404.05603v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00722v3","updated":"2024-04-08T15:15:56Z","published":"2024-03-31T15:34:45Z","title":"DRCT: Saving Image Super-resolution away from Information Bottleneck","summary":" In recent years, Vision Transformer-based applications to low-level vision\ntasks have achieved widespread success. Unlike CNN-based models, Transformers\nare more adept at capturing long-range dependencies, enabling the\nreconstruction of images utilizing information from non-local areas. 
In the\ndomain of super-resolution, Swin-transformer-based approaches have become\nmainstream due to their capacity to capture global spatial information and\ntheir shifting-window attention mechanism that facilitates the interchange of\ninformation between different windows. Many researchers have enhanced image\nquality and network efficiency by expanding the receptive field or designing\ncomplex networks, yielding commendable results. However, we observed that\nspatial information tends to diminish during the forward propagation process\ndue to increased depth, leading to a loss of spatial information and,\nconsequently, limiting the model's potential. To address this, we propose the\nDense-residual-connected Transformer (DRCT), aimed at mitigating the loss of\nspatial information through dense-residual connections between layers, thereby\nunleashing the model's potential and enhancing performance. Experiment results\nindicate that our approach is not only straightforward but also achieves\nremarkable efficiency, surpassing state-of-the-art methods and performing\ncommendably at NTIRE2024.\n","authors":["Chih-Chung Hsu","Chia-Ming Lee","Yi-Shiuan Chou"],"pdf_url":"https://arxiv.org/pdf/2404.00722v3.pdf","comment":"NTIRE 2024 Image Super-resolution (x4)"},{"id":"http://arxiv.org/abs/2404.05595v1","updated":"2024-04-08T15:14:20Z","published":"2024-04-08T15:14:20Z","title":"UniFL: Improve Stable Diffusion via Unified Feedback Learning","summary":" Diffusion models have revolutionized the field of image generation, leading\nto the proliferation of high-quality models and diverse downstream\napplications. However, despite these significant advancements, the current\ncompetitive solutions still suffer from several limitations, including inferior\nvisual quality, a lack of aesthetic appeal, and inefficient inference, without\na comprehensive solution in sight. To address these challenges, we present\nUniFL, a unified framework that leverages feedback learning to enhance\ndiffusion models comprehensively. UniFL stands out as a universal, effective,\nand generalizable solution applicable to various diffusion models, such as\nSD1.5 and SDXL. Notably, UniFL incorporates three key components: perceptual\nfeedback learning, which enhances visual quality; decoupled feedback learning,\nwhich improves aesthetic appeal; and adversarial feedback learning, which\noptimizes inference speed. In-depth experiments and extensive user studies\nvalidate the superior performance of our proposed method in enhancing both the\nquality of generated models and their acceleration. For instance, UniFL\nsurpasses ImageReward by 17% user preference in terms of generation quality and\noutperforms LCM and SDXL Turbo by 57% and 20% in 4-step inference. Moreover, we\nhave verified the efficacy of our approach in downstream tasks, including Lora,\nControlNet, and AnimateDiff.\n","authors":["Jiacheng Zhang","Jie Wu","Yuxi Ren","Xin Xia","Huafeng Kuang","Pan Xie","Jiashi Li","Xuefeng Xiao","Weilin Huang","Min Zheng","Lean Fu","Guanbin Li"],"pdf_url":"https://arxiv.org/pdf/2404.05595v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05584v1","updated":"2024-04-08T14:59:53Z","published":"2024-04-08T14:59:53Z","title":"Neural Cellular Automata for Lightweight, Robust and Explainable\n Classification of White Blood Cell Images","summary":" Diagnosis of hematological malignancies depends on accurate identification of\nwhite blood cells in peripheral blood smears. 
Deep learning techniques are\nemerging as a viable solution to scale and optimize this process by automatic\nidentification of cells in laboratories. However, these techniques face several\nchallenges such as limited generalizability, sensitivity to domain shifts and\nlack of explainability. Here, we introduce a novel approach based on\nneural cellular automata (NCA) for white blood cell classification. We test our\napproach on three datasets of white blood cell images and show that we achieve\ncompetitive performance compared to conventional methods. Our NCA-based method\nis significantly smaller in terms of parameters and exhibits robustness to\ndomain shifts. Furthermore, the architecture is inherently explainable,\nproviding insights into the decision process for each classification, helping\nexperts understand and validate model predictions. Results demonstrate that NCA\nnot only can be used for image classification, but also addresses key challenges\nof conventional methods, indicating a high potential for applicability in\nclinical practice.\n","authors":["Michael Deutges","Ario Sadafi","Nassir Navab","Carsten Marr"],"pdf_url":"https://arxiv.org/pdf/2404.05584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05583v1","updated":"2024-04-08T14:58:52Z","published":"2024-04-08T14:58:52Z","title":"Towards More General Video-based Deepfake Detection through Facial\n Feature Guided Adaptation for Foundation Model","summary":" With the rise of deep learning, generative models have enabled the creation\nof highly realistic synthetic images, presenting challenges due to their\npotential misuse. While research in Deepfake detection has grown rapidly in\nresponse, many detection methods struggle with unseen Deepfakes generated by\nnew synthesis techniques. To address this generalisation challenge, we propose\na novel Deepfake detection approach by adapting the rich information encoded\ninside Foundation Models, specifically using the image encoder from CLIP, which\nhas demonstrated strong zero-shot capability for downstream tasks. Inspired by\nrecent advances in parameter-efficient fine-tuning, we propose a novel\nside-network-based decoder to extract spatial and temporal cues from the given\nvideo clip, with Facial Component Guidance (FCG) to encourage the spatial\nfeatures to include features of key facial parts for more robust and general\nDeepfake detection. Through extensive cross-dataset evaluations, our approach\nexhibits superior effectiveness in identifying unseen Deepfake samples,\nachieving notable performance improvements even with limited training samples\nand manipulation types. Our model secures an average performance enhancement of\n0.9% AUROC in cross-dataset assessments compared with state-of-the-art methods,\nestablishing a significant lead with a 4.4% improvement on the challenging DFDC\ndataset.\n","authors":["Yue-Hua Han","Tai-Ming Huang","Shu-Tzu Lo","Po-Han Huang","Kai-Lung Hua","Jun-Cheng Chen"],"pdf_url":"https://arxiv.org/pdf/2404.05583v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05580v1","updated":"2024-04-08T14:56:26Z","published":"2024-04-08T14:56:26Z","title":"Responsible Visual Editing","summary":" With recent advancements in visual synthesis, there is a growing risk of\nencountering images with detrimental effects, such as hate, discrimination, or\nprivacy violations. Research on transforming harmful images into\nresponsible ones remains unexplored. 
In this paper, we formulate a new task,\nresponsible visual editing, which entails modifying specific concepts within an\nimage to render it more responsible while minimizing changes. However, the\nconcept that needs to be edited is often abstract, making it challenging to\nlocate what needs to be modified and plan how to modify it. To tackle these\nchallenges, we propose a Cognitive Editor (CoEditor) that harnesses the large\nmultimodal model through a two-stage cognitive process: (1) a perceptual\ncognitive process to focus on what needs to be modified and (2) a behavioral\ncognitive process to strategize how to modify. To mitigate the negative\nimplications of harmful images on research, we create a transparent and public\ndataset, AltBear, which expresses harmful information using teddy bears instead\nof humans. Experiments demonstrate that CoEditor can effectively comprehend\nabstract concepts within complex scenes and significantly surpass the\nperformance of baseline models for responsible visual editing. We find that the\nAltBear dataset corresponds well to the harmful content found in real images,\noffering a consistent experimental evaluation, thereby providing a safer\nbenchmark for future research. Moreover, CoEditor also shows great results in\ngeneral editing. We release our code and dataset at\nhttps://github.com/kodenii/Responsible-Visual-Editing.\n","authors":["Minheng Ni","Yeli Shen","Lei Zhang","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.05580v1.pdf","comment":"24 pages, 12 figures"},{"id":"http://arxiv.org/abs/2404.05579v1","updated":"2024-04-08T14:55:35Z","published":"2024-04-08T14:55:35Z","title":"Robust Data Pruning: Uncovering and Overcoming Implicit Bias","summary":" In the era of exceptionally data-hungry models, careful selection of the\ntraining data is essential to mitigate the extensive costs of deep learning.\nData pruning offers a solution by removing redundant or uninformative samples\nfrom the dataset, which yields faster convergence and improved neural scaling\nlaws. However, little is known about its impact on classification bias of the\ntrained models. We conduct the first systematic study of this effect and reveal\nthat existing data pruning algorithms can produce highly biased classifiers. At\nthe same time, we argue that random data pruning with appropriate class ratios\nhas potential to improve the worst-class performance. We propose a\n\"fairness-aware\" approach to pruning and empirically demonstrate its\nperformance on standard computer vision benchmarks. In sharp contrast to\nexisting algorithms, our proposed method continues improving robustness at a\ntolerable drop of average performance as we prune more from the datasets. We\npresent theoretical analysis of the classification risk in a mixture of\nGaussians to further motivate our algorithm and support our findings.\n","authors":["Artem Vysogorets","Kartik Ahuja","Julia Kempe"],"pdf_url":"https://arxiv.org/pdf/2404.05579v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05578v1","updated":"2024-04-08T14:54:54Z","published":"2024-04-08T14:54:54Z","title":"Social-MAE: Social Masked Autoencoder for Multi-person Motion\n Representation Learning","summary":" For a complete comprehension of multi-person scenes, it is essential to go\nbeyond basic tasks like detection and tracking. Higher-level tasks, such as\nunderstanding the interactions and social activities among individuals, are\nalso crucial. 
Progress towards models that can fully understand scenes\ninvolving multiple people is hindered by a lack of sufficient annotated data\nfor such high-level tasks. To address this challenge, we introduce Social-MAE,\na simple yet effective transformer-based masked autoencoder framework for\nmulti-person human motion data. The framework uses masked modeling to pre-train\nthe encoder to reconstruct masked human joint trajectories, enabling it to\nlearn generalizable and data efficient representations of motion in human\ncrowded scenes. Social-MAE comprises a transformer as the MAE encoder and a\nlighter-weight transformer as the MAE decoder which operates on multi-person\njoints' trajectory in the frequency domain. After the reconstruction task, the\nMAE decoder is replaced with a task-specific decoder and the model is\nfine-tuned end-to-end for a variety of high-level social tasks. Our proposed\nmodel combined with our pre-training approach achieves the state-of-the-art\nresults on various high-level social tasks, including multi-person pose\nforecasting, social grouping, and social action understanding. These\nimprovements are demonstrated across four popular multi-person datasets\nencompassing both human 2D and 3D body pose.\n","authors":["Mahsa Ehsanpour","Ian Reid","Hamid Rezatofighi"],"pdf_url":"https://arxiv.org/pdf/2404.05578v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16741v2","updated":"2024-04-08T14:42:15Z","published":"2024-01-30T04:39:32Z","title":"MESA: Matching Everything by Segmenting Anything","summary":" Feature matching is a crucial task in the field of computer vision, which\ninvolves finding correspondences between images. Previous studies achieve\nremarkable performance using learning-based feature comparison. However, the\npervasive presence of matching redundancy between images gives rise to\nunnecessary and error-prone computations in these methods, imposing limitations\non their accuracy. To address this issue, we propose MESA, a novel approach to\nestablish precise area (or region) matches for efficient matching redundancy\nreduction. MESA first leverages the advanced image understanding capability of\nSAM, a state-of-the-art foundation model for image segmentation, to obtain\nimage areas with implicit semantic. Then, a multi-relational graph is proposed\nto model the spatial structure of these areas and construct their scale\nhierarchy. Based on graphical models derived from the graph, the area matching\nis reformulated as an energy minimization task and effectively resolved.\nExtensive experiments demonstrate that MESA yields substantial precision\nimprovement for multiple point matchers in indoor and outdoor downstream tasks,\ne.g. +13.61% for DKM in indoor pose estimation.\n","authors":["Yesheng Zhang","Xu Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.16741v2.pdf","comment":"CVPR24"},{"id":"http://arxiv.org/abs/2312.01068v2","updated":"2024-04-08T14:33:12Z","published":"2023-12-02T08:34:22Z","title":"DPHMs: Diffusion Parametric Head Models for Depth-based Tracking","summary":" We introduce Diffusion Parametric Head Models (DPHMs), a generative model\nthat enables robust volumetric head reconstruction and tracking from monocular\ndepth sequences. While recent volumetric head models, such as NPHMs, can now\nexcel in representing high-fidelity head geometries, tracking and\nreconstructing heads from real-world single-view depth sequences remains very\nchallenging, as the fitting to partial and noisy observations is\nunderconstrained. 
To tackle these challenges, we propose a latent\ndiffusion-based prior to regularize volumetric head reconstruction and\ntracking. This prior-based regularizer effectively constrains the identity and\nexpression codes to lie on the underlying latent manifold which represents\nplausible head shapes. To evaluate the effectiveness of the diffusion-based\nprior, we collect a dataset of monocular Kinect sequences consisting of various\ncomplex facial expression motions and rapid transitions. We compare our method\nto state-of-the-art tracking methods and demonstrate improved head identity\nreconstruction as well as robust expression tracking.\n","authors":["Jiapeng Tang","Angela Dai","Yinyu Nie","Lev Markhasin","Justus Thies","Matthias Niessner"],"pdf_url":"https://arxiv.org/pdf/2312.01068v2.pdf","comment":"CVPR 2024; homepage: https://tangjiapeng.github.io/projects/DPHMs/"},{"id":"http://arxiv.org/abs/2404.05559v1","updated":"2024-04-08T14:30:42Z","published":"2024-04-08T14:30:42Z","title":"TIM: A Time Interval Machine for Audio-Visual Action Recognition","summary":" Diverse actions give rise to rich audio-visual signals in long videos. Recent\nworks showcase that the two modalities of audio and video exhibit different\ntemporal extents of events and distinct labels. We address the interplay\nbetween the two modalities in long videos by explicitly modelling the temporal\nextents of audio and visual events. We propose the Time Interval Machine (TIM)\nwhere a modality-specific time interval poses as a query to a transformer\nencoder that ingests a long video input. The encoder then attends to the\nspecified interval, as well as the surrounding context in both modalities, in\norder to recognise the ongoing action.\n We test TIM on three long audio-visual video datasets: EPIC-KITCHENS,\nPerception Test, and AVE, reporting state-of-the-art (SOTA) for recognition. On\nEPIC-KITCHENS, we beat previous SOTA that utilises LLMs and significantly\nlarger pre-training by 2.9% top-1 action recognition accuracy. Additionally, we\nshow that TIM can be adapted for action detection, using dense multi-scale\ninterval queries, outperforming SOTA on EPIC-KITCHENS-100 for most metrics, and\nshowing strong performance on the Perception Test. Our ablations show the\ncritical role of integrating the two modalities and modelling their time\nintervals in achieving this performance. Code and models at:\nhttps://github.com/JacobChalk/TIM\n","authors":["Jacob Chalk","Jaesung Huh","Evangelos Kazakos","Andrew Zisserman","Dima Damen"],"pdf_url":"https://arxiv.org/pdf/2404.05559v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2307.06206v2","updated":"2024-04-08T14:26:52Z","published":"2023-07-12T14:52:21Z","title":"SepVAE: a contrastive VAE to separate pathological patterns from healthy\n ones","summary":" Contrastive Analysis VAE (CA-VAEs) is a family of Variational auto-encoders\n(VAEs) that aims at separating the common factors of variation between a\nbackground dataset (BG) (i.e., healthy subjects) and a target dataset (TG)\n(i.e., patients) from the ones that only exist in the target dataset. To do so,\nthese methods separate the latent space into a set of salient features (i.e.,\nproper to the target dataset) and a set of common features (i.e., exist in both\ndatasets). Currently, all models fail to prevent the sharing of information\nbetween latent spaces effectively and to capture all salient factors of\nvariation. 
To this end, we introduce two crucial regularization losses: a\ndisentangling term between common and salient representations and a\nclassification term between background and target samples in the salient space.\nWe show a better performance than previous CA-VAEs methods on three medical\napplications and a natural images dataset (CelebA). Code and datasets are\navailable on GitHub https://github.com/neurospin-projects/2023_rlouiset_sepvae.\n","authors":["Robin Louiset","Edouard Duchesnay","Antoine Grigis","Benoit Dufumier","Pietro Gori"],"pdf_url":"https://arxiv.org/pdf/2307.06206v2.pdf","comment":"Workshop on Interpretable ML in Healthcare at International\n Conference on Machine Learning (ICML), Honolulu, Hawaii, USA. 2023"},{"id":"http://arxiv.org/abs/2308.16018v4","updated":"2024-04-08T14:09:27Z","published":"2023-08-30T13:20:54Z","title":"SiT-MLP: A Simple MLP with Point-wise Topology Feature Learning for\n Skeleton-based Action Recognition","summary":" Graph convolution networks (GCNs) have achieved remarkable performance in\nskeleton-based action recognition. However, previous GCN-based methods rely on\nelaborate human priors excessively and construct complex feature aggregation\nmechanisms, which limits the generalizability and effectiveness of networks. To\nsolve these problems, we propose a novel Spatial Topology Gating Unit (STGU),\nan MLP-based variant without extra priors, to capture the co-occurrence\ntopology features that encode the spatial dependency across all joints. In\nSTGU, to learn the point-wise topology features, a new gate-based feature\ninteraction mechanism is introduced to activate the features point-to-point by\nthe attention map generated from the input sample. Based on the STGU, we\npropose the first MLP-based model, SiT-MLP, for skeleton-based action\nrecognition in this work. Compared with previous methods on three large-scale\ndatasets, SiT-MLP achieves competitive performance. In addition, SiT-MLP\nreduces the parameters significantly with favorable results. The code will be\navailable at https://github.com/BUPTSJZhang/SiT?MLP.\n","authors":["Shaojie Zhang","Jianqin Yin","Yonghao Dang","Jiajun Fu"],"pdf_url":"https://arxiv.org/pdf/2308.16018v4.pdf","comment":"Accepted by IEEE TCSVT 2024"},{"id":"http://arxiv.org/abs/2312.07526v2","updated":"2024-04-08T13:40:43Z","published":"2023-12-12T18:55:29Z","title":"RTMO: Towards High-Performance One-Stage Real-Time Multi-Person Pose\n Estimation","summary":" Real-time multi-person pose estimation presents significant challenges in\nbalancing speed and precision. While two-stage top-down methods slow down as\nthe number of people in the image increases, existing one-stage methods often\nfail to simultaneously deliver high accuracy and real-time performance. This\npaper introduces RTMO, a one-stage pose estimation framework that seamlessly\nintegrates coordinate classification by representing keypoints using dual 1-D\nheatmaps within the YOLO architecture, achieving accuracy comparable to\ntop-down methods while maintaining high speed. We propose a dynamic coordinate\nclassifier and a tailored loss function for heatmap learning, specifically\ndesigned to address the incompatibilities between coordinate classification and\ndense prediction models. RTMO outperforms state-of-the-art one-stage pose\nestimators, achieving 1.1% higher AP on COCO while operating about 9 times\nfaster with the same backbone. 
Our largest model, RTMO-l, attains 74.8% AP on\nCOCO val2017 and 141 FPS on a single V100 GPU, demonstrating its efficiency and\naccuracy. The code and models are available at\nhttps://github.com/open-mmlab/mmpose/tree/main/projects/rtmo.\n","authors":["Peng Lu","Tao Jiang","Yining Li","Xiangtai Li","Kai Chen","Wenming Yang"],"pdf_url":"https://arxiv.org/pdf/2312.07526v2.pdf","comment":"Accepted at CVPR 2024. Project page:\n https://github.com/open-mmlab/mmpose/tree/main/projects/rtmo"},{"id":"http://arxiv.org/abs/2404.05519v1","updated":"2024-04-08T13:40:01Z","published":"2024-04-08T13:40:01Z","title":"Investigating the Effectiveness of Cross-Attention to Unlock Zero-Shot\n Editing of Text-to-Video Diffusion Models","summary":" With recent advances in image and video diffusion models for content\ncreation, a plethora of techniques have been proposed for customizing their\ngenerated content. In particular, manipulating the cross-attention layers of\nText-to-Image (T2I) diffusion models has shown great promise in controlling the\nshape and location of objects in the scene. Transferring image-editing\ntechniques to the video domain, however, is extremely challenging as object\nmotion and temporal consistency are difficult to capture accurately. In this\nwork, we take a first look at the role of cross-attention in Text-to-Video\n(T2V) diffusion models for zero-shot video editing. While one-shot models have\nshown potential in controlling motion and camera movement, we demonstrate\nzero-shot control over object shape, position and movement in T2V models. We\nshow that despite the limitations of current T2V models, cross-attention\nguidance can be a promising approach for editing videos.\n","authors":["Saman Motamed","Wouter Van Gansbeke","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2404.05519v1.pdf","comment":"Generative Models for Computer Vision CVPR 2024 Workshop"},{"id":"http://arxiv.org/abs/2404.05518v1","updated":"2024-04-08T13:39:12Z","published":"2024-04-08T13:39:12Z","title":"DepthMOT: Depth Cues Lead to a Strong Multi-Object Tracker","summary":" Accurately distinguishing each object is a fundamental goal of Multi-object\ntracking (MOT) algorithms. However, achieving this goal remains\nchallenging, primarily due to: (i) For crowded scenes with occluded objects,\nthe high overlap of object bounding boxes leads to confusion among closely\nlocated objects. Nevertheless, humans naturally perceive the depth of elements\nin a scene when observing 2D videos. Inspired by this, even though the bounding\nboxes of objects are close on the camera plane, we can differentiate them in\nthe depth dimension, thereby establishing a 3D perception of the objects. (ii)\nFor videos with rapid and irregular camera motion, abrupt changes in object\npositions can result in ID switches. However, if the camera pose is known, we\ncan compensate for the errors in linear motion models. In this paper, we\npropose \\textit{DepthMOT}, which achieves: (i) detecting objects and estimating the\nscene depth map \\textit{end-to-end}, and (ii) compensating for irregular camera motion\nvia camera pose estimation. Extensive experiments demonstrate the superior\nperformance of DepthMOT on the VisDrone-MOT and UAVDT datasets. 
The code will be\navailable at \\url{https://github.com/JackWoo0831/DepthMOT}.\n","authors":["Jiapeng Wu","Yichen Liu"],"pdf_url":"https://arxiv.org/pdf/2404.05518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05512v1","updated":"2024-04-08T13:35:14Z","published":"2024-04-08T13:35:14Z","title":"Impact of LiDAR visualisations on semantic segmentation of\n archaeological objects","summary":" Deep learning methods in LiDAR-based archaeological research often leverage\nvisualisation techniques derived from Digital Elevation Models to enhance\ncharacteristics of archaeological objects present in the images. This paper\ninvestigates the impact of visualisations on deep learning performance through\na comprehensive testing framework. The study involves the use of eight semantic\nsegmentation models to evaluate seven diverse visualisations across two study\nareas, encompassing five archaeological classes. Experimental results reveal\nthat the choice of appropriate visualisations can influence performance by up\nto 8%. Yet, pinpointing one visualisation that outperforms the others in\nsegmenting all archaeological classes proves challenging. The observed\nperformance variation, reaching up to 25% across different model\nconfigurations, underscores the importance of thoughtfully selecting model\nconfigurations and LiDAR visualisations for successfully segmenting\narchaeological objects.\n","authors":["Raveerat Jaturapitpornchai","Giulio Poggi","Gregory Sech","Ziga Kokalj","Marco Fiorucci","Arianna Traviglia"],"pdf_url":"https://arxiv.org/pdf/2404.05512v1.pdf","comment":"Accepted to IEEE International Geoscience and Remote Sensing\n Symposium 2024 (IGARSS 2024) @IEEE copyright"},{"id":"http://arxiv.org/abs/2404.05505v1","updated":"2024-04-08T13:27:07Z","published":"2024-04-08T13:27:07Z","title":"Taming Transformers for Realistic Lidar Point Cloud Generation","summary":" Diffusion Models (DMs) have achieved State-Of-The-Art (SOTA) results in the\nLidar point cloud generation task, benefiting from their stable training and\niterative refinement during sampling. However, DMs often fail to realistically\nmodel Lidar raydrop noise due to their inherent denoising process. To retain\nthe strength of iterative sampling while enhancing the generation of raydrop\nnoise, we introduce LidarGRIT, a generative model that uses auto-regressive\ntransformers to iteratively sample the range images in the latent space rather\nthan image space. Furthermore, LidarGRIT utilises VQ-VAE to separately decode\nrange images and raydrop masks. Our results show that LidarGRIT achieves\nsuperior performance compared to SOTA models on KITTI-360 and KITTI odometry\ndatasets. Code available at:https://github.com/hamedhaghighi/LidarGRIT.\n","authors":["Hamed Haghighi","Amir Samadi","Mehrdad Dianati","Valentina Donzella","Kurt Debattista"],"pdf_url":"https://arxiv.org/pdf/2404.05505v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.08077v2","updated":"2024-04-08T13:23:47Z","published":"2023-11-14T11:05:08Z","title":"Zero-Shot Segmentation of Eye Features Using the Segment Anything Model\n (SAM)","summary":" The advent of foundation models signals a new era in artificial intelligence.\nThe Segment Anything Model (SAM) is the first foundation model for image\nsegmentation. In this study, we evaluate SAM's ability to segment features from\neye images recorded in virtual reality setups. 
The increasing requirement for\nannotated eye-image datasets presents a significant opportunity for SAM to\nredefine the landscape of data annotation in gaze estimation. Our investigation\ncenters on SAM's zero-shot learning abilities and the effectiveness of prompts\nlike bounding boxes or point clicks. Our results are consistent with studies in\nother domains, demonstrating that SAM's segmentation effectiveness can be\non-par with specialized models depending on the feature, with prompts improving\nits performance, evidenced by an IoU of 93.34% for pupil segmentation in one\ndataset. Foundation models like SAM could revolutionize gaze estimation by\nenabling quick and easy image segmentation, reducing reliance on specialized\nmodels and extensive manual annotation.\n","authors":["Virmarie Maquiling","Sean Anthony Byrne","Diederick C. Niehorster","Marcus Nyström","Enkelejda Kasneci"],"pdf_url":"https://arxiv.org/pdf/2311.08077v2.pdf","comment":"14 pages, 8 figures, 1 table, Accepted to ETRA 2024: ACM Symposium on\n Eye Tracking Research & Applications"},{"id":"http://arxiv.org/abs/2311.16728v2","updated":"2024-04-08T13:17:05Z","published":"2023-11-28T12:19:00Z","title":"Photo-SLAM: Real-time Simultaneous Localization and Photorealistic\n Mapping for Monocular, Stereo, and RGB-D Cameras","summary":" The integration of neural rendering and the SLAM system recently showed\npromising results in joint localization and photorealistic view reconstruction.\nHowever, existing methods, fully relying on implicit representations, are so\nresource-hungry that they cannot run on portable devices, which deviates from\nthe original intention of SLAM. In this paper, we present Photo-SLAM, a novel\nSLAM framework with a hyper primitives map. Specifically, we simultaneously\nexploit explicit geometric features for localization and learn implicit\nphotometric features to represent the texture information of the observed\nenvironment. In addition to actively densifying hyper primitives based on\ngeometric features, we further introduce a Gaussian-Pyramid-based training\nmethod to progressively learn multi-level features, enhancing photorealistic\nmapping performance. The extensive experiments with monocular, stereo, and\nRGB-D datasets prove that our proposed system Photo-SLAM significantly\noutperforms current state-of-the-art SLAM systems for online photorealistic\nmapping, e.g., PSNR is 30% higher and rendering speed is hundreds of times\nfaster in the Replica dataset. Moreover, the Photo-SLAM can run at real-time\nspeed using an embedded platform such as Jetson AGX Orin, showing the potential\nof robotics applications.\n","authors":["Huajian Huang","Longwei Li","Hui Cheng","Sai-Kit Yeung"],"pdf_url":"https://arxiv.org/pdf/2311.16728v2.pdf","comment":"CVPR 2024. Code: https://github.com/HuajianUP/Photo-SLAM - Project\n Page: https://huajianup.github.io/research/Photo-SLAM/"},{"id":"http://arxiv.org/abs/2311.17389v2","updated":"2024-04-08T13:15:03Z","published":"2023-11-29T06:42:12Z","title":"360Loc: A Dataset and Benchmark for Omnidirectional Visual Localization\n with Cross-device Queries","summary":" Portable 360$^\\circ$ cameras are becoming a cheap and efficient tool to\nestablish large visual databases. By capturing omnidirectional views of a\nscene, these cameras could expedite building environment models that are\nessential for visual localization. However, such an advantage is often\noverlooked due to the lack of valuable datasets. 
This paper introduces a new\nbenchmark dataset, 360Loc, composed of 360$^\\circ$ images with ground truth\nposes for visual localization. We present a practical implementation of\n360$^\\circ$ mapping combining 360$^\\circ$ images with lidar data to generate\nthe ground truth 6DoF poses. 360Loc is the first dataset and benchmark that\nexplores the challenge of cross-device visual positioning, involving\n360$^\\circ$ reference frames, and query frames from pinhole, ultra-wide FoV\nfisheye, and 360$^\\circ$ cameras. We propose a virtual camera approach to\ngenerate lower-FoV query frames from 360$^\\circ$ images, which ensures a fair\ncomparison of performance among different query types in visual localization\ntasks. We also extend this virtual camera approach to feature matching-based\nand pose regression-based methods to alleviate the performance loss caused by\nthe cross-device domain gap, and evaluate its effectiveness against\nstate-of-the-art baselines. We demonstrate that omnidirectional visual\nlocalization is more robust in challenging large-scale scenes with symmetries\nand repetitive structures. These results provide new insights into 360-camera\nmapping and omnidirectional visual localization with cross-device queries.\n","authors":["Huajian Huang","Changkun Liu","Yipeng Zhu","Hui Cheng","Tristan Braud","Sai-Kit Yeung"],"pdf_url":"https://arxiv.org/pdf/2311.17389v2.pdf","comment":"CVPR 2024. Project Page: https://huajianup.github.io/research/360Loc/"},{"id":"http://arxiv.org/abs/2404.05490v1","updated":"2024-04-08T13:11:57Z","published":"2024-04-08T13:11:57Z","title":"Two-Person Interaction Augmentation with Skeleton Priors","summary":" Close and continuous interaction with rich contacts is a crucial aspect of\nhuman activities (e.g. hugging, dancing) and of interest in many domains like\nactivity recognition, motion prediction, character animation, etc. However,\nacquiring such skeletal motion is challenging. While direct motion capture is\nexpensive and slow, motion editing/generation is also non-trivial, as complex\ncontact patterns with topological and geometric constraints have to be\nretained. To this end, we propose a new deep learning method for two-body\nskeletal interaction motion augmentation, which can generate variations of\ncontact-rich interactions with varying body sizes and proportions while\nretaining the key geometric/topological relations between two bodies. Our\nsystem can learn effectively from a relatively small amount of data and\ngeneralize to drastically different skeleton sizes. Through exhaustive\nevaluation and comparison, we show it can generate high-quality motions, has\nstrong generalizability and outperforms traditional optimization-based methods\nand alternative deep learning solutions.\n","authors":["Baiyi Li","Edmond S. L. Ho","Hubert P. H. Shum","He Wang"],"pdf_url":"https://arxiv.org/pdf/2404.05490v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00226v2","updated":"2024-04-08T13:05:11Z","published":"2024-03-30T02:56:54Z","title":"Design as Desired: Utilizing Visual Question Answering for Multimodal\n Pre-training","summary":" Multimodal pre-training demonstrates its potential in the medical domain,\nwhich learns medical visual representations from paired medical reports.\nHowever, many pre-training tasks require extra annotations from clinicians, and\nmost of them fail to explicitly guide the model to learn the desired features\nof different pathologies. 
To the best of our knowledge, we are the first to\nutilize Visual Question Answering (VQA) for multimodal pre-training to guide\nthe framework focusing on targeted pathological features. In this work, we\nleverage descriptions in medical reports to design multi-granular\nquestion-answer pairs associated with different diseases, which assist the\nframework in pre-training without requiring extra annotations from experts. We\nalso propose a novel pre-training framework with a quasi-textual feature\ntransformer, a module designed to transform visual features into a\nquasi-textual space closer to the textual domain via a contrastive learning\nstrategy. This narrows the vision-language gap and facilitates modality\nalignment. Our framework is applied to four downstream tasks: report\ngeneration, classification, segmentation, and detection across five datasets.\nExtensive experiments demonstrate the superiority of our framework compared to\nother state-of-the-art methods. Our code will be released upon acceptance.\n","authors":["Tongkun Su","Jun Li","Xi Zhang","Haibo Jin","Hao Chen","Qiong Wang","Faqin Lv","Baoliang Zhao","Yin Hu"],"pdf_url":"https://arxiv.org/pdf/2404.00226v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01941v3","updated":"2024-04-08T12:51:35Z","published":"2024-04-02T13:33:31Z","title":"LPSNet: End-to-End Human Pose and Shape Estimation with Lensless Imaging","summary":" Human pose and shape (HPS) estimation with lensless imaging is not only\nbeneficial to privacy protection but also can be used in covert surveillance\nscenarios due to the small size and simple structure of this device. However,\nthis task presents significant challenges due to the inherent ambiguity of the\ncaptured measurements and lacks effective methods for directly estimating human\npose and shape from lensless data. In this paper, we propose the first\nend-to-end framework to recover 3D human poses and shapes from lensless\nmeasurements to our knowledge. We specifically design a multi-scale lensless\nfeature decoder to decode the lensless measurements through the optically\nencoded mask for efficient feature extraction. We also propose a double-head\nauxiliary supervision mechanism to improve the estimation accuracy of human\nlimb ends. Besides, we establish a lensless imaging system and verify the\neffectiveness of our method on various datasets acquired by our lensless\nimaging system.\n","authors":["Haoyang Ge","Qiao Feng","Hailong Jia","Xiongzheng Li","Xiangjun Yin","You Zhou","Jingyu Yang","Kun Li"],"pdf_url":"https://arxiv.org/pdf/2404.01941v3.pdf","comment":"Accepted to CVPR 2024. More results available at\n https://cic.tju.edu.cn/faculty/likun/projects/LPSNet"},{"id":"http://arxiv.org/abs/2306.14227v2","updated":"2024-04-08T12:50:51Z","published":"2023-06-25T12:15:44Z","title":"A ground-based dataset and a diffusion model for on-orbit low-light\n image enhancement","summary":" On-orbit service is important for maintaining the sustainability of space\nenvironment. Space-based visible camera is an economical and lightweight sensor\nfor situation awareness during on-orbit service. However, it can be easily\naffected by the low illumination environment. Recently, deep learning has\nachieved remarkable success in image enhancement of natural images, but seldom\napplied in space due to the data bottleneck. In this article, we first propose\na dataset of the Beidou Navigation Satellite for on-orbit low-light image\nenhancement (LLIE). 
In the automatic data collection scheme, we focus on\nreducing the domain gap and improving the diversity of the dataset. We collect\nhardware-in-the-loop images based on a robotic simulation testbed imitating\nspace lighting conditions. To evenly sample poses of different orientations and\ndistances without collision, a collision-free working space and a pose-stratified\nsampling scheme are proposed. Afterwards, a novel diffusion model is proposed. To\nenhance the image contrast without over-exposure and blurring details, we\ndesign a fused attention module to highlight the structure and dark regions. Finally,\nwe compare our method with previous methods using our dataset, which indicates\nthat our method has a better capacity for on-orbit LLIE.\n","authors":["Yiman Zhu","Lu Wang","Jingyi Yuan","Yu Guo"],"pdf_url":"https://arxiv.org/pdf/2306.14227v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05468v1","updated":"2024-04-08T12:46:39Z","published":"2024-04-08T12:46:39Z","title":"Mind-to-Image: Projecting Visual Mental Imagination of the Brain from\n fMRI","summary":" The reconstruction of images observed by subjects from fMRI data collected\nduring visual stimuli has made significant strides in the past decade, thanks\nto the availability of extensive fMRI datasets and advancements in generative\nmodels for image generation. However, the application of visual reconstruction\nhas remained limited. Reconstructing visual imagination presents a greater\nchallenge, with potentially revolutionary applications ranging from aiding\nindividuals with disabilities to verifying witness accounts in court. The\nprimary hurdles in this field are the absence of data collection protocols for\nvisual imagery and the lack of datasets on the subject. Traditionally,\nfMRI-to-image relies on data collected from subjects exposed to visual stimuli,\nwhich poses issues for generating visual imagery based on the difference in\nbrain activity between visual stimulation and visual imagery. For the first\ntime, we have compiled a substantial dataset (around 6h of scans) on visual\nimagery along with a proposed data collection protocol. We then train a\nmodified version of an fMRI-to-image model and demonstrate the feasibility of\nreconstructing images from two modes of imagination: from memory and from pure\nimagination. This marks an important step towards creating a technology that\nallows direct reconstruction of visual imagery.\n","authors":["Hugo Caselles-Dupré","Charles Mellerio","Paul Hérent","Alizée Lopez-Persem","Benoit Béranger","Mathieu Soularue","Pierre Fautrel","Gauthier Vernier","Matthieu Cord"],"pdf_url":"https://arxiv.org/pdf/2404.05468v1.pdf","comment":"Pre-print to be updated"},{"id":"http://arxiv.org/abs/2404.05466v1","updated":"2024-04-08T12:44:24Z","published":"2024-04-08T12:44:24Z","title":"Enhancing Lip Reading with Multi-Scale Video and Multi-Encoder","summary":" Automatic lip-reading (ALR) aims to automatically transcribe spoken content\nfrom a speaker's silent lip motion captured in video. Current mainstream\nlip-reading approaches only use a single visual encoder to model input videos\nof a single scale. In this paper, we propose to enhance lipreading by\nincorporating multi-scale video data and multi-encoder. Specifically, we first\npropose a novel multi-scale lip extraction algorithm based on the size of the\nspeaker's face and an enhanced ResNet3D visual front-end (VFE) to extract lip\nfeatures at different scales. 
For the multi-encoder, in addition to the\nmainstream Transformer and Conformer, we also incorporate the recently proposed\nBranchformer and EBranchformer as visual encoders. In the experiments, we\nexplore the influence of different video data scales and encoders on ALR system\nperformance and fuse the texts transcribed by all ALR systems using recognizer\noutput voting error reduction (ROVER). Finally, our proposed approach placed\nsecond in the ICME 2024 ChatCLR Challenge Task 2, with a 21.52% reduction in\ncharacter error rate (CER) compared to the official baseline on the evaluation\nset.\n","authors":["He Wang","Pengcheng Guo","Xucheng Wan","Huan Zhou","Lei Xie"],"pdf_url":"https://arxiv.org/pdf/2404.05466v1.pdf","comment":"6 pages, 3 figures, submitted to ICME2024 GC-ChatCLR"},{"id":"http://arxiv.org/abs/2404.05465v1","updated":"2024-04-08T12:43:32Z","published":"2024-04-08T12:43:32Z","title":"HAMMR: HierArchical MultiModal React agents for generic VQA","summary":" Combining Large Language Models (LLMs) with external specialized tools\n(LLMs+tools) is a recent paradigm to solve multimodal tasks such as Visual\nQuestion Answering (VQA). While this approach was demonstrated to work well\nwhen optimized and evaluated for each individual benchmark, in practice it is\ncrucial for the next generation of real-world AI systems to handle a broad\nrange of multimodal problems. Therefore we pose the VQA problem from a unified\nperspective and evaluate a single system on a varied suite of VQA tasks\nincluding counting, spatial reasoning, OCR-based reasoning, visual pointing,\nexternal knowledge, and more. In this setting, we demonstrate that naively\napplying the LLM+tools approach using the combined set of all tools leads to\npoor results. This motivates us to introduce HAMMR: HierArchical MultiModal\nReact. We start from a multimodal ReAct-based system and make it hierarchical\nby enabling our HAMMR agents to call upon other specialized agents. This\nenhances the compositionality of the LLM+tools approach, which we show to be\ncritical for obtaining high accuracy on generic VQA. Concretely, on our generic\nVQA suite, HAMMR outperforms the naive LLM+tools approach by 19.5%.\nAdditionally, HAMMR achieves state-of-the-art results on this task,\noutperforming the generic standalone PaLI-X VQA model by 5.0%.\n","authors":["Lluis Castrejon","Thomas Mensink","Howard Zhou","Vittorio Ferrari","Andre Araujo","Jasper Uijlings"],"pdf_url":"https://arxiv.org/pdf/2404.05465v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05447v1","updated":"2024-04-08T12:29:46Z","published":"2024-04-08T12:29:46Z","title":"Pansharpening of PRISMA products for archaeological prospection","summary":" Hyperspectral data recorded from satellite platforms are often ill-suited for\ngeo-archaeological prospection due to low spatial resolution. The established\npotential of hyperspectral data from airborne sensors in identifying\narchaeological features has, on the other side, generated increased interest in\nenhancing hyperspectral data to achieve higher spatial resolution. This\nimprovement is crucial for detecting traces linked to sub-surface\ngeo-archaeological features and can make satellite hyperspectral acquisitions\nmore suitable for archaeological research. 
This research assesses the usability\nof pansharpened PRISMA satellite products in geo-archaeological prospections.\nThree pan-sharpening methods (GSA, MTF-GLP and HySure) are compared\nquantitatively and qualitatively and tested over the archaeological landscape\nof Aquileia (Italy). The results suggest that the application of pansharpening\ntechniques makes hyperspectral satellite imagery highly suitable, under certain\nconditions, to the identification of sub-surface archaeological features of\nsmall and large size.\n","authors":["Gregory Sech","Giulio Poggi","Marina Ljubenovic","Marco Fiorucci","Arianna Traviglia"],"pdf_url":"https://arxiv.org/pdf/2404.05447v1.pdf","comment":"Accepted to IEEE International Geoscience and Remote Sensing\n Symposium 2024 (IGARSS 2024) @IEEE copyright"},{"id":"http://arxiv.org/abs/2301.07409v2","updated":"2024-04-08T12:25:10Z","published":"2023-01-18T10:13:29Z","title":"Representing Noisy Image Without Denoising","summary":" A long-standing topic in artificial intelligence is the effective recognition\nof patterns from noisy images. In this regard, the recent data-driven paradigm\nconsiders 1) improving the representation robustness by adding noisy samples in\ntraining phase (i.e., data augmentation) or 2) pre-processing the noisy image\nby learning to solve the inverse problem (i.e., image denoising). However, such\nmethods generally exhibit inefficient process and unstable result, limiting\ntheir practical applications. In this paper, we explore a non-learning paradigm\nthat aims to derive robust representation directly from noisy images, without\nthe denoising as pre-processing. Here, the noise-robust representation is\ndesigned as Fractional-order Moments in Radon space (FMR), with also beneficial\nproperties of orthogonality and rotation invariance. Unlike earlier\ninteger-order methods, our work is a more generic design taking such classical\nmethods as special cases, and the introduced fractional-order parameter offers\ntime-frequency analysis capability that is not available in classical methods.\nFormally, both implicit and explicit paths for constructing the FMR are\ndiscussed in detail. Extensive simulation experiments and an image security\napplication are provided to demonstrate the uniqueness and usefulness of our\nFMR, especially for noise robustness, rotation invariance, and time-frequency\ndiscriminability.\n","authors":["Shuren Qi","Yushu Zhang","Chao Wang","Tao Xiang","Xiaochun Cao","Yong Xiang"],"pdf_url":"https://arxiv.org/pdf/2301.07409v2.pdf","comment":"Accepted by IEEE Transactions on Pattern Analysis and Machine\n Intelligence, 2024"},{"id":"http://arxiv.org/abs/2404.05439v1","updated":"2024-04-08T12:18:01Z","published":"2024-04-08T12:18:01Z","title":"Action-conditioned video data improves predictability","summary":" Long-term video generation and prediction remain challenging tasks in\ncomputer vision, particularly in partially observable scenarios where cameras\nare mounted on moving platforms. The interaction between observed image frames\nand the motion of the recording agent introduces additional complexities. To\naddress these issues, we introduce the Action-Conditioned Video Generation\n(ACVG) framework, a novel approach that investigates the relationship between\nactions and generated image frames through a deep dual Generator-Actor\narchitecture. 
ACVG generates video sequences conditioned on the actions of\nrobots, enabling exploration and analysis of how vision and action mutually\ninfluence one another in dynamic environments. We evaluate the framework's\neffectiveness on an indoor robot motion dataset which consists of sequences of\nimage frames along with the sequences of actions taken by the robotic agent,\nconducting a comprehensive empirical study comparing ACVG to other\nstate-of-the-art frameworks along with a detailed ablation study.\n","authors":["Meenakshi Sarkar","Debasish Ghose"],"pdf_url":"https://arxiv.org/pdf/2404.05439v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05297v2","updated":"2024-04-08T12:17:24Z","published":"2024-03-08T13:24:46Z","title":"PEEB: Part-based Image Classifiers with an Explainable and Editable\n Language Bottleneck","summary":" CLIP-based classifiers rely on the prompt containing a {class name} that is\nknown to the text encoder. Therefore, they perform poorly on new classes or the\nclasses whose names rarely appear on the Internet (e.g., scientific names of\nbirds). For fine-grained classification, we propose PEEB - an explainable and\neditable classifier to (1) express the class name into a set of text\ndescriptors that describe the visual parts of that class; and (2) match the\nembeddings of the detected parts to their textual descriptors in each class to\ncompute a logit score for classification. In a zero-shot setting where the\nclass names are unknown, PEEB outperforms CLIP by a huge margin (~10x in top-1\naccuracy). Compared to part-based classifiers, PEEB is not only the\nstate-of-the-art (SOTA) on the supervised-learning setting (88.80% and 92.20%\naccuracy on CUB-200 and Dogs-120, respectively) but also the first to enable\nusers to edit the text descriptors to form a new classifier without any\nre-training. Compared to concept bottleneck models, PEEB is also the SOTA in\nboth zero-shot and supervised-learning settings.\n","authors":["Thang M. Pham","Peijie Chen","Tin Nguyen","Seunghyun Yoon","Trung Bui","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2403.05297v2.pdf","comment":"Findings of NAACL 2024 (long paper)"},{"id":"http://arxiv.org/abs/2305.10874v3","updated":"2024-04-08T12:17:01Z","published":"2023-05-18T11:06:15Z","title":"Swap Attention in Spatiotemporal Diffusions for Text-to-Video Generation","summary":" With the explosive popularity of AI-generated content (AIGC), video\ngeneration has recently received a lot of attention. Generating videos guided\nby text instructions poses significant challenges, such as modeling the complex\nrelationship between space and time, and the lack of large-scale text-video\npaired data. Existing text-video datasets suffer from limitations in both\ncontent quality and scale, or they are not open-source, rendering them\ninaccessible for study and use. For model design, previous approaches extend\npretrained text-to-image generation models by adding temporal 1D\nconvolution/attention modules for video generation. However, these approaches\noverlook the importance of jointly modeling space and time, inevitably leading\nto temporal distortions and misalignment between texts and videos. In this\npaper, we propose a novel approach that strengthens the interaction between\nspatial and temporal perceptions. In particular, we utilize a swapped\ncross-attention mechanism in 3D windows that alternates the ``query'' role\nbetween spatial and temporal blocks, enabling mutual reinforcement for each\nother. 
Moreover, to fully unlock model capabilities for high-quality video\ngeneration and promote the development of the field, we curate a large-scale\nand open-source video dataset called HD-VG-130M. This dataset comprises 130\nmillion text-video pairs from the open-domain, ensuring high-definition,\nwidescreen and watermark-free characters. A smaller-scale yet more meticulously\ncleaned subset further enhances the data quality, aiding models in achieving\nsuperior performance. Experimental quantitative and qualitative results\ndemonstrate the superiority of our approach in terms of per-frame quality,\ntemporal correlation, and text-video alignment, with clear margins.\n","authors":["Wenjing Wang","Huan Yang","Zixi Tuo","Huiguo He","Junchen Zhu","Jianlong Fu","Jiaying Liu"],"pdf_url":"https://arxiv.org/pdf/2305.10874v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05426v1","updated":"2024-04-08T11:54:49Z","published":"2024-04-08T11:54:49Z","title":"Test-Time Zero-Shot Temporal Action Localization","summary":" Zero-Shot Temporal Action Localization (ZS-TAL) seeks to identify and locate\nactions in untrimmed videos unseen during training. Existing ZS-TAL methods\ninvolve fine-tuning a model on a large amount of annotated training data. While\neffective, training-based ZS-TAL approaches assume the availability of labeled\ndata for supervised learning, which can be impractical in some applications.\nFurthermore, the training process naturally induces a domain bias into the\nlearned model, which may adversely affect the model's generalization ability to\narbitrary videos. These considerations prompt us to approach the ZS-TAL problem\nfrom a radically novel perspective, relaxing the requirement for training data.\nTo this aim, we introduce a novel method that performs Test-Time adaptation for\nTemporal Action Localization (T3AL). In a nutshell, T3AL adapts a pre-trained\nVision and Language Model (VLM). T3AL operates in three steps. First, a\nvideo-level pseudo-label of the action category is computed by aggregating\ninformation from the entire video. Then, action localization is performed\nadopting a novel procedure inspired by self-supervised learning. Finally,\nframe-level textual descriptions extracted with a state-of-the-art captioning\nmodel are employed for refining the action region proposals. We validate the\neffectiveness of T3AL by conducting experiments on the THUMOS14 and the\nActivityNet-v1.3 datasets. Our results demonstrate that T3AL significantly\noutperforms zero-shot baselines based on state-of-the-art VLMs, confirming the\nbenefit of a test-time adaptation approach.\n","authors":["Benedetta Liberatori","Alessandro Conti","Paolo Rota","Yiming Wang","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2404.05426v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05937v3","updated":"2024-04-08T11:46:07Z","published":"2024-02-08T18:59:53Z","title":"InstaGen: Enhancing Object Detection by Training on Synthetic Dataset","summary":" In this paper, we present a novel paradigm to enhance the ability of object\ndetector, e.g., expanding categories or improving detection performance, by\ntraining on synthetic dataset generated from diffusion models. Specifically, we\nintegrate an instance-level grounding head into a pre-trained, generative\ndiffusion model, to augment it with the ability of localising instances in the\ngenerated images. 
The grounding head is trained to align the text embedding of\ncategory names with the regional visual feature of the diffusion model, using\nsupervision from an off-the-shelf object detector, and a novel self-training\nscheme on (novel) categories not covered by the detector. We conduct thorough\nexperiments to show that, this enhanced version of diffusion model, termed as\nInstaGen, can serve as a data synthesizer, to enhance object detectors by\ntraining on its generated samples, demonstrating superior performance over\nexisting state-of-the-art methods in open-vocabulary (+4.5 AP) and data-sparse\n(+1.2 to 5.2 AP) scenarios. Project page with code:\nhttps://fcjian.github.io/InstaGen.\n","authors":["Chengjian Feng","Yujie Zhong","Zequn Jie","Weidi Xie","Lin Ma"],"pdf_url":"https://arxiv.org/pdf/2402.05937v3.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2404.05414v1","updated":"2024-04-08T11:32:26Z","published":"2024-04-08T11:32:26Z","title":"Two Hands Are Better Than One: Resolving Hand to Hand Intersections via\n Occupancy Networks","summary":" 3D hand pose estimation from images has seen considerable interest from the\nliterature, with new methods improving overall 3D accuracy. One current\nchallenge is to address hand-to-hand interaction where self-occlusions and\nfinger articulation pose a significant problem to estimation. Little work has\napplied physical constraints that minimize the hand intersections that occur as\na result of noisy estimation. This work addresses the intersection of hands by\nexploiting an occupancy network that represents the hand's volume as a\ncontinuous manifold. This allows us to model the probability distribution of\npoints being inside a hand. We designed an intersection loss function to\nminimize the likelihood of hand-to-point intersections. Moreover, we propose a\nnew hand mesh parameterization that is superior to the commonly used MANO model\nin many respects including lower mesh complexity, underlying 3D skeleton\nextraction, watertightness, etc. On the benchmark InterHand2.6M dataset, the\nmodels trained using our intersection loss achieve better results than the\nstate-of-the-art by significantly decreasing the number of hand intersections\nwhile lowering the mean per-joint positional error. Additionally, we\ndemonstrate superior performance for 3D hand uplift on Re:InterHand and SMILE\ndatasets and show reduced hand-to-hand intersections for complex domains such\nas sign-language pose estimation.\n","authors":["Maksym Ivashechkin","Oscar Mendez","Richard Bowden"],"pdf_url":"https://arxiv.org/pdf/2404.05414v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06704v3","updated":"2024-04-08T11:24:30Z","published":"2023-12-10T11:45:45Z","title":"SIFU: Side-view Conditioned Implicit Function for Real-world Usable\n Clothed Human Reconstruction","summary":" Creating high-quality 3D models of clothed humans from single images for\nreal-world applications is crucial. Despite recent advancements, accurately\nreconstructing humans in complex poses or with loose clothing from in-the-wild\nimages, along with predicting textures for unseen areas, remains a significant\nchallenge. A key limitation of previous methods is their insufficient prior\nguidance in transitioning from 2D to 3D and in texture prediction. 
In response,\nwe introduce SIFU (Side-view Conditioned Implicit Function for Real-world\nUsable Clothed Human Reconstruction), a novel approach combining a Side-view\nDecoupling Transformer with a 3D Consistent Texture Refinement pipeline. SIFU\nemploys a cross-attention mechanism within the transformer, using SMPL-X\nnormals as queries to effectively decouple side-view features in the process of\nmapping 2D features to 3D. This method not only improves the precision of the\n3D models but also their robustness, especially when SMPL-X estimates are not\nperfect. Our texture refinement process leverages text-to-image diffusion-based\nprior to generate realistic and consistent textures for invisible views.\nThrough extensive experiments, SIFU surpasses SOTA methods in both geometry and\ntexture reconstruction, showcasing enhanced robustness in complex scenarios and\nachieving an unprecedented Chamfer and P2S measurement. Our approach extends to\npractical applications such as 3D printing and scene building, demonstrating\nits broad utility in real-world scenarios. Project page\nhttps://river-zhang.github.io/SIFU-projectpage/ .\n","authors":["Zechuan Zhang","Zongxin Yang","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2312.06704v3.pdf","comment":"Accepted by CVPR 2024; Project page\n https://river-zhang.github.io/SIFU-projectpage/"},{"id":"http://arxiv.org/abs/2303.13514v3","updated":"2024-04-08T11:22:05Z","published":"2023-03-23T17:59:35Z","title":"SAOR: Single-View Articulated Object Reconstruction","summary":" We introduce SAOR, a novel approach for estimating the 3D shape, texture, and\nviewpoint of an articulated object from a single image captured in the wild.\nUnlike prior approaches that rely on pre-defined category-specific 3D templates\nor tailored 3D skeletons, SAOR learns to articulate shapes from single-view\nimage collections with a skeleton-free part-based model without requiring any\n3D object shape priors. To prevent ill-posed solutions, we propose a\ncross-instance consistency loss that exploits disentangled object shape\ndeformation and articulation. This is helped by a new silhouette-based sampling\nmechanism to enhance viewpoint diversity during training. Our method only\nrequires estimated object silhouettes and relative depth maps from\noff-the-shelf pre-trained networks during training. At inference time, given a\nsingle-view image, it efficiently outputs an explicit mesh representation. We\nobtain improved qualitative and quantitative results on challenging quadruped\nanimals compared to relevant existing work.\n","authors":["Mehmet Aygün","Oisin Mac Aodha"],"pdf_url":"https://arxiv.org/pdf/2303.13514v3.pdf","comment":"Accepted to CVPR 2024, website: https://mehmetaygun.github.io/saor"},{"id":"http://arxiv.org/abs/2404.05409v1","updated":"2024-04-08T11:20:28Z","published":"2024-04-08T11:20:28Z","title":"Anatomical Conditioning for Contrastive Unpaired Image-to-Image\n Translation of Optical Coherence Tomography Images","summary":" For a unified analysis of medical images from different modalities, data\nharmonization using image-to-image (I2I) translation is desired. We study this\nproblem employing an optical coherence tomography (OCT) data set of\nSpectralis-OCT and Home-OCT images. I2I translation is challenging because the\nimages are unpaired, and a bijective mapping does not exist due to the\ninformation discrepancy between both domains. 
This problem has been addressed\nby the Contrastive Learning for Unpaired I2I Translation (CUT) approach, but it\nreduces semantic consistency. To restore the semantic consistency, we support\nthe style decoder using an additional segmentation decoder. Our approach\nincreases the similarity between the style-translated images and the target\ndistribution. Importantly, we improve the segmentation of biomarkers in\nHome-OCT images in an unsupervised domain adaptation scenario. Our data\nharmonization approach provides potential for the monitoring of diseases, e.g.,\nage related macular disease, using different OCT devices.\n","authors":["Marc S. Seibel","Hristina Uzunova","Timo Kepp","Heinz Handels"],"pdf_url":"https://arxiv.org/pdf/2404.05409v1.pdf","comment":"Accepted at ISBI 2024"},{"id":"http://arxiv.org/abs/2311.10605v2","updated":"2024-04-08T10:59:06Z","published":"2023-11-17T16:01:06Z","title":"CA-Jaccard: Camera-aware Jaccard Distance for Person Re-identification","summary":" Person re-identification (re-ID) is a challenging task that aims to learn\ndiscriminative features for person retrieval. In person re-ID, Jaccard distance\nis a widely used distance metric, especially in re-ranking and clustering\nscenarios. However, we discover that camera variation has a significant\nnegative impact on the reliability of Jaccard distance. In particular, Jaccard\ndistance calculates the distance based on the overlap of relevant neighbors.\nDue to camera variation, intra-camera samples dominate the relevant neighbors,\nwhich reduces the reliability of the neighbors by introducing intra-camera\nnegative samples and excluding inter-camera positive samples. To overcome this\nproblem, we propose a novel camera-aware Jaccard (CA-Jaccard) distance that\nleverages camera information to enhance the reliability of Jaccard distance.\nSpecifically, we design camera-aware k-reciprocal nearest neighbors (CKRNNs) to\nfind k-reciprocal nearest neighbors on the intra-camera and inter-camera\nranking lists, which improves the reliability of relevant neighbors and\nguarantees the contribution of inter-camera samples in the overlap. Moreover,\nwe propose a camera-aware local query expansion (CLQE) to mine reliable samples\nin relevant neighbors by exploiting camera variation as a strong constraint and\nassign these samples higher weights in overlap, further improving the\nreliability. Our CA-Jaccard distance is simple yet effective and can serve as a\ngeneral distance metric for person re-ID methods with high reliability and low\ncomputational cost. Extensive experiments demonstrate the effectiveness of our\nmethod.\n","authors":["Yiyu Chen","Zheyi Fan","Zhaoru Chen","Yixuan Zhu"],"pdf_url":"https://arxiv.org/pdf/2311.10605v2.pdf","comment":"This paper is accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2309.04190v4","updated":"2024-04-08T10:57:42Z","published":"2023-09-08T08:03:42Z","title":"SegmentAnything helps microscopy images based automatic and quantitative\n organoid detection and analysis","summary":" Organoids are self-organized 3D cell clusters that closely mimic the\narchitecture and function of in vivo tissues and organs. Quantification of\norganoid morphology helps in studying organ development, drug discovery, and\ntoxicity assessment. Recent microscopy techniques provide a potent tool to\nacquire organoid morphology features, but manual image analysis remains a labor\nand time-intensive process. 
Thus, this paper proposes a comprehensive pipeline\nfor microscopy analysis that leverages the SegmentAnything to precisely\ndemarcate individual organoids. Additionally, we introduce a set of\nmorphological properties, including perimeter, area, radius, non-smoothness,\nand non-circularity, allowing researchers to analyze the organoid structures\nquantitatively and automatically. To validate the effectiveness of our\napproach, we conducted tests on bright-field images of human induced\npluripotent stem cells (iPSCs) derived neural-epithelial (NE) organoids. The\nresults obtained from our automatic pipeline closely align with manual organoid\ndetection and measurement, showcasing the capability of our proposed method in\naccelerating organoids morphology analysis.\n","authors":["Xiaodan Xing","Chunling Tang","Yunzhe Guo","Nicholas Kurniawan","Guang Yang"],"pdf_url":"https://arxiv.org/pdf/2309.04190v4.pdf","comment":"Replace Figure 4 with the correct version. The original version is\n wrong due to a column name mismatch"},{"id":"http://arxiv.org/abs/2404.05393v1","updated":"2024-04-08T10:52:29Z","published":"2024-04-08T10:52:29Z","title":"PAT: Pixel-wise Adaptive Training for Long-tailed Segmentation","summary":" Beyond class frequency, we recognize the impact of class-wise relationships\namong various class-specific predictions and the imbalance in label masks on\nlong-tailed segmentation learning. To address these challenges, we propose an\ninnovative Pixel-wise Adaptive Training (PAT) technique tailored for\nlong-tailed segmentation. PAT has two key features: 1) class-wise gradient\nmagnitude homogenization, and 2) pixel-wise class-specific loss adaptation\n(PCLA). First, the class-wise gradient magnitude homogenization helps alleviate\nthe imbalance among label masks by ensuring equal consideration of the\nclass-wise impact on model updates. Second, PCLA tackles the detrimental impact\nof both rare classes within the long-tailed distribution and inaccurate\npredictions from previous training stages by encouraging learning classes with\nlow prediction confidence and guarding against forgetting classes with high\nconfidence. This combined approach fosters robust learning while preventing the\nmodel from forgetting previously learned knowledge. PAT exhibits significant\nperformance improvements, surpassing the current state-of-the-art by 2.2% in\nthe NyU dataset. Moreover, it enhances overall pixel-wise accuracy by 2.85% and\nintersection over union value by 2.07%, with a particularly notable declination\nof 0.39% in detecting rare classes compared to Balance Logits Variation, as\ndemonstrated on the three popular datasets, i.e., OxfordPetIII, CityScape, and\nNYU.\n","authors":["Khoi Do","Duong Nguyen","Nguyen H. Tran","Viet Dung Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.05393v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05392v1","updated":"2024-04-08T10:51:29Z","published":"2024-04-08T10:51:29Z","title":"T-DEED: Temporal-Discriminability Enhancer Encoder-Decoder for Precise\n Event Spotting in Sports Videos","summary":" In this paper, we introduce T-DEED, a Temporal-Discriminability Enhancer\nEncoder-Decoder for Precise Event Spotting in sports videos. T-DEED addresses\nmultiple challenges in the task, including the need for discriminability among\nframe representations, high output temporal resolution to maintain prediction\nprecision, and the necessity to capture information at different temporal\nscales to handle events with varying dynamics. 
It tackles these challenges\nthrough its specifically designed architecture, featuring an encoder-decoder\nfor leveraging multiple temporal scales and achieving high output temporal\nresolution, along with temporal modules designed to increase token\ndiscriminability. Leveraging these characteristics, T-DEED achieves SOTA\nperformance on the FigureSkating and FineDiving datasets.\n","authors":["Artur Xarles","Sergio Escalera","Thomas B. Moeslund","Albert Clapés"],"pdf_url":"https://arxiv.org/pdf/2404.05392v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15288v2","updated":"2024-04-08T10:48:22Z","published":"2023-12-23T16:05:47Z","title":"Understanding normalization in contrastive representation learning and\n out-of-distribution detection","summary":" Contrastive representation learning has emerged as an outstanding approach\nfor anomaly detection. In this work, we explore the $\\ell_2$-norm of\ncontrastive features and its applications in out-of-distribution detection. We\npropose a simple method based on contrastive learning, which incorporates\nout-of-distribution data by discriminating against normal samples in the\ncontrastive layer space. Our approach can be applied flexibly as an outlier\nexposure (OE) approach, where the out-of-distribution data is a huge collective\nof random images, or as a fully self-supervised learning approach, where the\nout-of-distribution data is self-generated by applying distribution-shifting\ntransformations. The ability to incorporate additional out-of-distribution\nsamples enables a feasible solution for datasets where AD methods based on\ncontrastive learning generally underperform, such as aerial images or\nmicroscopy images. Furthermore, the high-quality features learned through\ncontrastive learning consistently enhance performance in OE scenarios, even\nwhen the available out-of-distribution dataset is not diverse enough. Our\nextensive experiments demonstrate the superiority of our proposed method under\nvarious scenarios, including unimodal and multimodal settings, with various\nimage datasets.\n","authors":["Tai Le-Gia","Jaehyun Ahn"],"pdf_url":"https://arxiv.org/pdf/2312.15288v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05384v1","updated":"2024-04-08T10:45:29Z","published":"2024-04-08T10:45:29Z","title":"Rethinking the Spatial Inconsistency in Classifier-Free Diffusion\n Guidance","summary":" Classifier-Free Guidance (CFG) has been widely used in text-to-image\ndiffusion models, where the CFG scale is introduced to control the strength of\ntext guidance on the whole image space. However, we argue that a global CFG\nscale results in spatial inconsistency on varying semantic strengths and\nsuboptimal image quality. To address this problem, we present a novel approach,\nSemantic-aware Classifier-Free Guidance (S-CFG), to customize the guidance\ndegrees for different semantic units in text-to-image diffusion models.\nSpecifically, we first design a training-free semantic segmentation method to\npartition the latent image into relatively independent semantic regions at each\ndenoising step. In particular, the cross-attention map in the denoising U-net\nbackbone is renormalized for assigning each patch to the corresponding token,\nwhile the self-attention map is used to complete the semantic regions. Then, to\nbalance the amplification of diverse semantic units, we adaptively adjust the\nCFG scales across different semantic regions to rescale the text guidance\ndegrees into a uniform level. 
Finally, extensive experiments demonstrate the\nsuperiority of S-CFG over the original CFG strategy on various text-to-image\ndiffusion models, without requiring any extra training cost. Our code is\navailable at https://github.com/SmilesDZgk/S-CFG.\n","authors":["Dazhong Shen","Guanglu Song","Zeyue Xue","Fu-Yun Wang","Yu Liu"],"pdf_url":"https://arxiv.org/pdf/2404.05384v1.pdf","comment":"accepted by CVPR-2024"},{"id":"http://arxiv.org/abs/2305.15873v2","updated":"2024-04-08T10:28:38Z","published":"2023-05-25T09:09:32Z","title":"Confronting Ambiguity in 6D Object Pose Estimation via Score-Based\n Diffusion on SE(3)","summary":" Addressing pose ambiguity in 6D object pose estimation from single RGB images\npresents a significant challenge, particularly due to object symmetries or\nocclusions. In response, we introduce a novel score-based diffusion method\napplied to the $SE(3)$ group, marking the first application of diffusion models\nto $SE(3)$ within the image domain, specifically tailored for pose estimation\ntasks. Extensive evaluations demonstrate the method's efficacy in handling pose\nambiguity, mitigating perspective-induced ambiguity, and showcasing the\nrobustness of our surrogate Stein score formulation on $SE(3)$. This\nformulation not only improves the convergence of denoising process but also\nenhances computational efficiency. Thus, we pioneer a promising strategy for 6D\nobject pose estimation.\n","authors":["Tsu-Ching Hsiao","Hao-Wei Chen","Hsuan-Kung Yang","Chun-Yi Lee"],"pdf_url":"https://arxiv.org/pdf/2305.15873v2.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2404.05366v1","updated":"2024-04-08T10:05:24Z","published":"2024-04-08T10:05:24Z","title":"CDAD-Net: Bridging Domain Gaps in Generalized Category Discovery","summary":" In Generalized Category Discovery (GCD), we cluster unlabeled samples of\nknown and novel classes, leveraging a training dataset of known classes. A\nsalient challenge arises due to domain shifts between these datasets. To\naddress this, we present a novel setting: Across Domain Generalized Category\nDiscovery (AD-GCD) and bring forth CDAD-NET (Class Discoverer Across Domains)\nas a remedy. CDAD-NET is architected to synchronize potential known class\nsamples across both the labeled (source) and unlabeled (target) datasets, while\nemphasizing the distinct categorization of the target data. To facilitate this,\nwe propose an entropy-driven adversarial learning strategy that accounts for\nthe distance distributions of target samples relative to source-domain class\nprototypes. Parallelly, the discriminative nature of the shared space is upheld\nthrough a fusion of three metric learning objectives. In the source domain, our\nfocus is on refining the proximity between samples and their affiliated class\nprototypes, while in the target domain, we integrate a neighborhood-centric\ncontrastive learning mechanism, enriched with an adept neighbors mining\napproach. 
To further accentuate the nuanced feature interrelation among\nsemantically aligned images, we champion the concept of conditional image\ninpainting, underscoring the premise that semantically analogous images prove\nmore efficacious to the task than their disjointed counterparts.\nExperimentally, CDAD-NET eclipses existing literature with a performance\nincrement of 8-15% on three AD-GCD benchmarks we present.\n","authors":["Sai Bhargav Rongali","Sarthak Mehrotra","Ankit Jha","Mohamad Hassan N C","Shirsha Bose","Tanisha Gupta","Mainak Singha","Biplab Banerjee"],"pdf_url":"https://arxiv.org/pdf/2404.05366v1.pdf","comment":"Accepted in L3D-IVU, CVPR Workshop, 2024"},{"id":"http://arxiv.org/abs/2308.13888v3","updated":"2024-04-08T10:04:29Z","published":"2023-08-26T14:12:19Z","title":"Neural Implicit Morphing of Face Images","summary":" Face morphing is a problem in computer graphics with numerous artistic and\nforensic applications. It is challenging due to variations in pose, lighting,\ngender, and ethnicity. This task consists of a warping for feature alignment\nand a blending for a seamless transition between the warped images. We propose\nto leverage coord-based neural networks to represent such warpings and\nblendings of face images. During training, we exploit the smoothness and\nflexibility of such networks by combining energy functionals employed in\nclassical approaches without discretizations. Additionally, our method is\ntime-dependent, allowing a continuous warping/blending of the images. During\nmorphing inference, we need both direct and inverse transformations of the\ntime-dependent warping. The first (second) is responsible for warping the\ntarget (source) image into the source (target) image. Our neural warping stores\nthose maps in a single network dismissing the need for inverting them. The\nresults of our experiments indicate that our method is competitive with both\nclassical and generative models under the lens of image quality and\nface-morphing detectors. Aesthetically, the resulting images present a seamless\nblending of diverse faces not yet usual in the literature.\n","authors":["Guilherme Schardong","Tiago Novello","Hallison Paz","Iurii Medvedev","Vinícius da Silva","Luiz Velho","Nuno Gonçalves"],"pdf_url":"https://arxiv.org/pdf/2308.13888v3.pdf","comment":"14 pages, 20 figures, accepted for CVPR 2024"},{"id":"http://arxiv.org/abs/2404.05362v1","updated":"2024-04-08T09:54:28Z","published":"2024-04-08T09:54:28Z","title":"Multi-head Attention-based Deep Multiple Instance Learning","summary":" This paper introduces MAD-MIL, a Multi-head Attention-based Deep Multiple\nInstance Learning model, designed for weakly supervised Whole Slide Images\n(WSIs) classification in digital pathology. Inspired by the multi-head\nattention mechanism of the Transformer, MAD-MIL simplifies model complexity\nwhile achieving competitive results against advanced models like CLAM and\nDS-MIL. Evaluated on the MNIST-BAGS and public datasets, including TUPAC16,\nTCGA BRCA, TCGA LUNG, and TCGA KIDNEY, MAD-MIL consistently outperforms ABMIL.\nThis demonstrates enhanced information diversity, interpretability, and\nefficiency in slide representation. The model's effectiveness, coupled with\nfewer trainable parameters and lower computational complexity makes it a\npromising solution for automated pathology workflows. 
Our code is available at\nhttps://github.com/tueimage/MAD-MIL.\n","authors":["Hassan Keshvarikhojasteh","Josien Pluim","Mitko Veta"],"pdf_url":"https://arxiv.org/pdf/2404.05362v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.01585v3","updated":"2024-04-08T09:53:27Z","published":"2023-02-03T07:35:53Z","title":"SegForestNet: Spatial-Partitioning-Based Aerial Image Segmentation","summary":" Aerial image segmentation is the basis for applications such as automatically\ncreating maps or tracking deforestation. In true orthophotos, which are often\nused in these applications, many objects and regions can be approximated well\nby polygons. However, this fact is rarely exploited by state-of-the-art\nsemantic segmentation models. Instead, most models allow unnecessary degrees of\nfreedom in their predictions by allowing arbitrary region shapes. We therefore\npresent a refinement of our deep learning model which predicts binary space\npartitioning trees, an efficient polygon representation. The refinements\ninclude a new feature decoder architecture and a new differentiable BSP tree\nrenderer which both avoid vanishing gradients. Additionally, we designed a\nnovel loss function specifically designed to improve the spatial partitioning\ndefined by the predicted trees. Furthermore, our expanded model can predict\nmultiple trees at once and thus can predict class-specific segmentations. As an\nadditional contribution, we investigate the impact of a non-optimal training\nprocess in comparison to an optimized training process. While model\narchitectures optimized for aerial images, such as PFNet or our own model, show\nan advantage under non-optimal conditions, this advantage disappears under\noptimal training conditions. Despite this observation, our model still makes\nbetter predictions for small rectangular objects, e.g., cars.\n","authors":["Daniel Gritzner","Jörn Ostermann"],"pdf_url":"https://arxiv.org/pdf/2302.01585v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05357v1","updated":"2024-04-08T09:48:02Z","published":"2024-04-08T09:48:02Z","title":"CNN-based Game State Detection for a Foosball Table","summary":" The automation of games using Deep Reinforcement Learning Strategies (DRL) is\na well-known challenge in AI research. While for feature extraction in a video\ngame typically the whole image is used, this is hardly practical for many real\nworld games. Instead, using a smaller game state reducing the dimension of the\nparameter space to include essential parameters only seems to be a promising\napproach. In the game of Foosball, a compact and comprehensive game state\ndescription consists of the positional shifts and rotations of the figures and\nthe position of the ball over time. In particular, velocities and accelerations\ncan be derived from consecutive time samples of the game state. In this paper,\na figure detection system to determine the game state in Foosball is presented.\nWe capture a dataset containing the rotations of the rods which were measured\nusing accelerometers and the positional shifts were derived using traditional\nComputer Vision techniques (in a laboratory setting). This dataset is utilized\nto train Convolutional Neural Network (CNN) based end-to-end regression models\nto predict the rotations and shifts of each rod. We present an evaluation of\nour system using different state-of-the-art CNNs as base architectures for the\nregression model. We show that our system is able to predict the game state\nwith high accuracy. 
By providing data for both black and white teams, the\npresented system is intended to provide the required data for future\ndevelopments of Imitation Learning techniques w.r.t. to observing human\nplayers.\n","authors":["David Hagens","Jan Knaup","Elke Hergenröther","Andreas Weinmann"],"pdf_url":"https://arxiv.org/pdf/2404.05357v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05348v1","updated":"2024-04-08T09:33:40Z","published":"2024-04-08T09:33:40Z","title":"Iterative Refinement Strategy for Automated Data Labeling: Facial\n Landmark Diagnosis in Medical Imaging","summary":" Automated data labeling techniques are crucial for accelerating the\ndevelopment of deep learning models, particularly in complex medical imaging\napplications. However, ensuring accuracy and efficiency remains challenging.\nThis paper presents iterative refinement strategies for automated data labeling\nin facial landmark diagnosis to enhance accuracy and efficiency for deep\nlearning models in medical applications, including dermatology, plastic\nsurgery, and ophthalmology. Leveraging feedback mechanisms and advanced\nalgorithms, our approach iteratively refines initial labels, reducing reliance\non manual intervention while improving label quality. Through empirical\nevaluation and case studies, we demonstrate the effectiveness of our proposed\nstrategies in deep learning tasks across medical imaging domains. Our results\nhighlight the importance of iterative refinement in automated data labeling to\nenhance the capabilities of deep learning systems in medical imaging\napplications.\n","authors":["Yu-Hsi Chen"],"pdf_url":"https://arxiv.org/pdf/2404.05348v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13263v2","updated":"2024-04-08T09:31:33Z","published":"2023-06-23T02:19:52Z","title":"Synthetic data shuffling accelerates the convergence of federated\n learning under data heterogeneity","summary":" In federated learning, data heterogeneity is a critical challenge. A\nstraightforward solution is to shuffle the clients' data to homogenize the\ndistribution. However, this may violate data access rights, and how and when\nshuffling can accelerate the convergence of a federated optimization algorithm\nis not theoretically well understood. In this paper, we establish a precise and\nquantifiable correspondence between data heterogeneity and parameters in the\nconvergence rate when a fraction of data is shuffled across clients. We prove\nthat shuffling can quadratically reduce the gradient dissimilarity with respect\nto the shuffling percentage, accelerating convergence. Inspired by the theory,\nwe propose a practical approach that addresses the data access rights issue by\nshuffling locally generated synthetic data. The experimental results show that\nshuffling synthetic data improves the performance of multiple existing\nfederated learning algorithms by a large margin.\n","authors":["Bo Li","Yasin Esfandiari","Mikkel N. Schmidt","Tommy S. Alstrøm","Sebastian U. 
Stich"],"pdf_url":"https://arxiv.org/pdf/2306.13263v2.pdf","comment":"Accepted at TMLR"},{"id":"http://arxiv.org/abs/2404.05341v1","updated":"2024-04-08T09:27:42Z","published":"2024-04-08T09:27:42Z","title":"Comparative Analysis of Image Enhancement Techniques for Brain Tumor\n Segmentation: Contrast, Histogram, and Hybrid Approaches","summary":" This study systematically investigates the impact of image enhancement\ntechniques on Convolutional Neural Network (CNN)-based Brain Tumor\nSegmentation, focusing on Histogram Equalization (HE), Contrast Limited\nAdaptive Histogram Equalization (CLAHE), and their hybrid variations. Employing\nthe U-Net architecture on a dataset of 3064 Brain MRI images, the research\ndelves into preprocessing steps, including resizing and enhancement, to\noptimize segmentation accuracy. A detailed analysis of the CNN-based U-Net\narchitecture, training, and validation processes is provided. The comparative\nanalysis, utilizing metrics such as Accuracy, Loss, MSE, IoU, and DSC, reveals\nthat the hybrid approach CLAHE-HE consistently outperforms others. Results\nhighlight its superior accuracy (0.9982, 0.9939, 0.9936 for training, testing,\nand validation, respectively) and robust segmentation overlap, with Jaccard\nvalues of 0.9862, 0.9847, and 0.9864, and Dice values of 0.993, 0.9923, and\n0.9932 for the same phases, emphasizing its potential in neuro-oncological\napplications. The study concludes with a call for refinement in segmentation\nmethodologies to further enhance diagnostic precision and treatment planning in\nneuro-oncology.\n","authors":["Shoffan Saifullah","Andri Pranolo","Rafał Dreżewski"],"pdf_url":"https://arxiv.org/pdf/2404.05341v1.pdf","comment":"9 Pages, & Figures, 2 Tables, International Conference on Computer\n Science Electronics and Information (ICCSEI 2023)"},{"id":"http://arxiv.org/abs/2404.05331v1","updated":"2024-04-08T09:18:32Z","published":"2024-04-08T09:18:32Z","title":"Mask-ControlNet: Higher-Quality Image Generation with An Additional Mask\n Prompt","summary":" Text-to-image generation has witnessed great progress, especially with the\nrecent advancements in diffusion models. Since texts cannot provide detailed\nconditions like object appearance, reference images are usually leveraged for\nthe control of objects in the generated images. However, existing methods still\nsuffer limited accuracy when the relationship between the foreground and\nbackground is complicated. To address this issue, we develop a framework termed\nMask-ControlNet by introducing an additional mask prompt. Specifically, we\nfirst employ large vision models to obtain masks to segment the objects of\ninterest in the reference image. Then, the object images are employed as\nadditional prompts to facilitate the diffusion model to better understand the\nrelationship between foreground and background regions during image generation.\nExperiments show that the mask prompts enhance the controllability of the\ndiffusion model to maintain higher fidelity to the reference image while\nachieving better image quality. 
Comparison with previous text-to-image\ngeneration methods demonstrates our method's superior quantitative and\nqualitative performance on the benchmark datasets.\n","authors":["Zhiqi Huang","Huixin Xiong","Haoyu Wang","Longguang Wang","Zhiheng Li"],"pdf_url":"https://arxiv.org/pdf/2404.05331v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05317v1","updated":"2024-04-08T09:08:43Z","published":"2024-04-08T09:08:43Z","title":"WebXR, A-Frame and Networked-Aframe as a Basis for an Open Metaverse: A\n Conceptual Architecture","summary":" This work proposes a WebXR-based cross-platform conceptual architecture,\nleveraging the A-Frame and Networked-Aframe frameworks, in order to facilitate\nthe development of an open, accessible, and interoperable metaverse. By\nintroducing the concept of spatial web app, this research contributes to the\ndiscourse on the metaverse, offering an architecture that democratizes access\nto virtual environments and extended reality through the web, and aligns with\nTim Berners-Lee's original vision of the World Wide Web as an open platform in\nthe digital realm.\n","authors":["Giuseppe Macario"],"pdf_url":"https://arxiv.org/pdf/2404.05317v1.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2303.12017v2","updated":"2024-04-08T09:02:40Z","published":"2023-03-21T16:54:01Z","title":"Learning Optical Flow and Scene Flow with Bidirectional Camera-LiDAR\n Fusion","summary":" In this paper, we study the problem of jointly estimating the optical flow\nand scene flow from synchronized 2D and 3D data. Previous methods either employ\na complex pipeline that splits the joint task into independent stages, or fuse\n2D and 3D information in an ``early-fusion'' or ``late-fusion'' manner. Such\none-size-fits-all approaches suffer from a dilemma of failing to fully utilize\nthe characteristic of each modality or to maximize the inter-modality\ncomplementarity. To address the problem, we propose a novel end-to-end\nframework, which consists of 2D and 3D branches with multiple bidirectional\nfusion connections between them in specific layers. Different from previous\nwork, we apply a point-based 3D branch to extract the LiDAR features, as it\npreserves the geometric structure of point clouds. To fuse dense image features\nand sparse point features, we propose a learnable operator named bidirectional\ncamera-LiDAR fusion module (Bi-CLFM). We instantiate two types of the\nbidirectional fusion pipeline, one based on the pyramidal coarse-to-fine\narchitecture (dubbed CamLiPWC), and the other one based on the recurrent\nall-pairs field transforms (dubbed CamLiRAFT). On FlyingThings3D, both CamLiPWC\nand CamLiRAFT surpass all existing methods and achieve up to a 47.9\\% reduction\nin 3D end-point-error from the best published result. Our best-performing\nmodel, CamLiRAFT, achieves an error of 4.26\\% on the KITTI Scene Flow\nbenchmark, ranking 1st among all submissions with much fewer parameters.\nBesides, our methods have strong generalization performance and the ability to\nhandle non-rigid motion. 
Code is available at\nhttps://github.com/MCG-NJU/CamLiFlow.\n","authors":["Haisong Liu","Tao Lu","Yihui Xu","Jia Liu","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2303.12017v2.pdf","comment":"Accepted to TPAMI 2023"},{"id":"http://arxiv.org/abs/2404.05309v1","updated":"2024-04-08T08:57:32Z","published":"2024-04-08T08:57:32Z","title":"CLIPping the Limits: Finding the Sweet Spot for Relevant Images in\n Automated Driving Systems Perception Testing","summary":" Perception systems, especially cameras, are the eyes of automated driving\nsystems. Ensuring that they function reliably and robustly is therefore an\nimportant building block in the automation of vehicles. There are various\napproaches to test the perception of automated driving systems. Ultimately,\nhowever, it always comes down to the investigation of the behavior of\nperception systems under specific input data. Camera images are a crucial part\nof the input data. Image data sets are therefore collected for the testing of\nautomated driving systems, but it is non-trivial to find specific images in\nthese data sets. Thanks to recent developments in neural networks, there are\nnow methods for sorting the images in a data set according to their similarity\nto a prompt in natural language. In order to further automate the provision of\nsearch results, we make a contribution by automating the threshold definition\nin these sorted results and returning only the images relevant to the prompt as\na result. Our focus is on preventing false positives and false negatives\nequally. It is also important that our method is robust and in the case that\nour assumptions are not fulfilled, we provide a fallback solution.\n","authors":["Philipp Rigoll","Laurenz Adolph","Lennart Ries","Eric Sax"],"pdf_url":"https://arxiv.org/pdf/2404.05309v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05307v1","updated":"2024-04-08T08:53:54Z","published":"2024-04-08T08:53:54Z","title":"Human Detection from 4D Radar Data in Low-Visibility Field Conditions","summary":" Autonomous driving technology is increasingly being used on public roads and\nin industrial settings such as mines. While it is essential to detect\npedestrians, vehicles, or other obstacles, adverse field conditions negatively\naffect the performance of classical sensors such as cameras or lidars. Radar,\non the other hand, is a promising modality that is less affected by, e.g.,\ndust, smoke, water mist or fog. In particular, modern 4D imaging radars provide\ntarget responses across the range, vertical angle, horizontal angle and Doppler\nvelocity dimensions. We propose TMVA4D, a CNN architecture that leverages this\n4D radar modality for semantic segmentation. The CNN is trained to distinguish\nbetween the background and person classes based on a series of 2D projections\nof the 4D radar data that include the elevation, azimuth, range, and Doppler\nvelocity dimensions. We also outline the process of compiling a novel dataset\nconsisting of data collected in industrial settings with a car-mounted 4D radar\nand describe how the ground-truth labels were generated from reference thermal\nimages. 
Using TMVA4D on this dataset, we achieve an mIoU score of 78.2% and an\nmDice score of 86.1%, evaluated on the two classes background and person.\n","authors":["Mikael Skog","Oleksandr Kotlyar","Vladimír Kubelka","Martin Magnusson"],"pdf_url":"https://arxiv.org/pdf/2404.05307v1.pdf","comment":"Submitted to Radar in Robotics workshop at ICRA 2024"},{"id":"http://arxiv.org/abs/2404.05300v1","updated":"2024-04-08T08:42:47Z","published":"2024-04-08T08:42:47Z","title":"Texture Classification Network Integrating Adaptive Wavelet Transform","summary":" Graves' disease is a common condition that is diagnosed clinically by\ndetermining the smoothness of the thyroid texture and its morphology in\nultrasound images. Currently, the most widely used approach for the automated\ndiagnosis of Graves' disease utilizes Convolutional Neural Networks (CNNs) for\nboth feature extraction and classification. However, these methods demonstrate\nlimited efficacy in capturing texture features. Given the high capacity of\nwavelets in describing texture features, this research integrates learnable\nwavelet modules utilizing the Lifting Scheme into CNNs and incorporates a\nparallel wavelet branch into the ResNet18 model to enhance texture feature\nextraction. Our model can analyze texture features in spatial and frequency\ndomains simultaneously, leading to optimized classification accuracy. We\nconducted experiments on collected ultrasound datasets and publicly available\nnatural image texture datasets, our proposed network achieved 97.27% accuracy\nand 95.60% recall on ultrasound datasets, 60.765% accuracy on natural image\ntexture datasets, surpassing the accuracy of ResNet and confirming the\neffectiveness of our approach.\n","authors":["Su-Xi Yu","Jing-Yuan He","Yi Wang","Yu-Jiao Cai","Jun Yang","Bo Lin","Wei-Bin Yang","Jian Ruan"],"pdf_url":"https://arxiv.org/pdf/2404.05300v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05290v1","updated":"2024-04-08T08:28:19Z","published":"2024-04-08T08:28:19Z","title":"MindSet: Vision. A toolbox for testing DNNs on key psychological\n experiments","summary":" Multiple benchmarks have been developed to assess the alignment between deep\nneural networks (DNNs) and human vision. In almost all cases these benchmarks\nare observational in the sense they are composed of behavioural and brain\nresponses to naturalistic images that have not been manipulated to test\nhypotheses regarding how DNNs or humans perceive and identify objects. Here we\nintroduce the toolbox MindSet: Vision, consisting of a collection of image\ndatasets and related scripts designed to test DNNs on 30 psychological\nfindings. In all experimental conditions, the stimuli are systematically\nmanipulated to test specific hypotheses regarding human visual perception and\nobject recognition. In addition to providing pre-generated datasets of images,\nwe provide code to regenerate these datasets, offering many configurable\nparameters which greatly extend the dataset versatility for different research\ncontexts, and code to facilitate the testing of DNNs on these image datasets\nusing three different methods (similarity judgments, out-of-distribution\nclassification, and decoder method), accessible at\nhttps://github.com/MindSetVision/mindset-vision. We test ResNet-152 on each of\nthese methods as an example of how the toolbox can be used.\n","authors":["Valerio Biscione","Dong Yin","Gaurav Malhotra","Marin Dujmovic","Milton L. Montero","Guillermo Puebla","Federico Adolfi","Rachel F. Heaton","John E. 
Hummel","Benjamin D. Evans","Karim Habashy","Jeffrey S. Bowers"],"pdf_url":"https://arxiv.org/pdf/2404.05290v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05285v1","updated":"2024-04-08T08:20:53Z","published":"2024-04-08T08:20:53Z","title":"Detecting Every Object from Events","summary":" Object detection is critical in autonomous driving, and it is more practical\nyet challenging to localize objects of unknown categories: an endeavour known\nas Class-Agnostic Object Detection (CAOD). Existing studies on CAOD\npredominantly rely on ordinary cameras, but these frame-based sensors usually\nhave high latency and limited dynamic range, leading to safety risks in\nreal-world scenarios. In this study, we turn to a new modality enabled by the\nso-called event camera, featured by its sub-millisecond latency and high\ndynamic range, for robust CAOD. We propose Detecting Every Object in Events\n(DEOE), an approach tailored for achieving high-speed, class-agnostic\nopen-world object detection in event-based vision. Built upon the fast\nevent-based backbone: recurrent vision transformer, we jointly consider the\nspatial and temporal consistencies to identify potential objects. The\ndiscovered potential objects are assimilated as soft positive samples to avoid\nbeing suppressed as background. Moreover, we introduce a disentangled\nobjectness head to separate the foreground-background classification and novel\nobject discovery tasks, enhancing the model's generalization in localizing\nnovel objects while maintaining a strong ability to filter out the background.\nExtensive experiments confirm the superiority of our proposed DEOE in\ncomparison with three strong baseline methods that integrate the\nstate-of-the-art event-based object detector with advancements in RGB-based\nCAOD. Our code is available at https://github.com/Hatins/DEOE.\n","authors":["Haitian Zhang","Chang Xu","Xinya Wang","Bingde Liu","Guang Hua","Lei Yu","Wen Yang"],"pdf_url":"https://arxiv.org/pdf/2404.05285v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19428v3","updated":"2024-04-08T08:18:33Z","published":"2024-03-28T13:58:05Z","title":"Burst Super-Resolution with Diffusion Models for Improving Perceptual\n Quality","summary":" While burst LR images are useful for improving the SR image quality compared\nwith a single LR image, prior SR networks accepting the burst LR images are\ntrained in a deterministic manner, which is known to produce a blurry SR image.\nIn addition, it is difficult to perfectly align the burst LR images, making the\nSR image more blurry. Since such blurry images are perceptually degraded, we\naim to reconstruct the sharp high-fidelity boundaries. Such high-fidelity\nimages can be reconstructed by diffusion models. However, prior SR methods\nusing the diffusion model are not properly optimized for the burst SR task.\nSpecifically, the reverse process starting from a random sample is not\noptimized for image enhancement and restoration methods, including burst SR. In\nour proposed method, on the other hand, burst LR features are used to\nreconstruct the initial burst SR image that is fed into an intermediate step in\nthe diffusion model. This reverse process from the intermediate step 1) skips\ndiffusion steps for reconstructing the global structure of the image and 2)\nfocuses on steps for refining detailed textures. Our experimental results\ndemonstrate that our method can improve the scores of the perceptual quality\nmetrics. 
Code: https://github.com/placerkyo/BSRD\n","authors":["Kyotaro Tokoro","Kazutoshi Akita","Norimichi Ukita"],"pdf_url":"https://arxiv.org/pdf/2403.19428v3.pdf","comment":"Accepted to IJCNN 2024 (International Joint Conference on Neural\n Networks)"},{"id":"http://arxiv.org/abs/2404.05280v1","updated":"2024-04-08T08:11:56Z","published":"2024-04-08T08:11:56Z","title":"MOSE: Boosting Vision-based Roadside 3D Object Detection with Scene Cues","summary":" 3D object detection based on roadside cameras is an additional way for\nautonomous driving to alleviate the challenges of occlusion and short\nperception range from vehicle cameras. Previous methods for roadside 3D object\ndetection mainly focus on modeling the depth or height of objects, neglecting\nthe stationary of cameras and the characteristic of inter-frame consistency. In\nthis work, we propose a novel framework, namely MOSE, for MOnocular 3D object\ndetection with Scene cuEs. The scene cues are the frame-invariant\nscene-specific features, which are crucial for object localization and can be\nintuitively regarded as the height between the surface of the real road and the\nvirtual ground plane. In the proposed framework, a scene cue bank is designed\nto aggregate scene cues from multiple frames of the same scene with a carefully\ndesigned extrinsic augmentation strategy. Then, a transformer-based decoder\nlifts the aggregated scene cues as well as the 3D position embeddings for 3D\nobject location, which boosts generalization ability in heterologous scenes.\nThe extensive experiment results on two public benchmarks demonstrate the\nstate-of-the-art performance of the proposed method, which surpasses the\nexisting methods by a large margin.\n","authors":["Xiahan Chen","Mingjian Chen","Sanli Tang","Yi Niu","Jiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.05280v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00916v2","updated":"2024-04-08T08:08:43Z","published":"2024-04-01T04:43:45Z","title":"Gyro-based Neural Single Image Deblurring","summary":" In this paper, we present GyroDeblurNet, a novel single image deblurring\nmethod that utilizes a gyro sensor to effectively resolve the ill-posedness of\nimage deblurring. The gyro sensor provides valuable information about camera\nmotion during exposure time that can significantly improve deblurring quality.\nHowever, effectively exploiting real-world gyro data is challenging due to\nsignificant errors from various sources including sensor noise, the disparity\nbetween the positions of a camera module and a gyro sensor, the absence of\ntranslational motion information, and moving objects whose motions cannot be\ncaptured by a gyro sensor. To handle gyro error, GyroDeblurNet is equipped with\ntwo novel neural network blocks: a gyro refinement block and a gyro deblurring\nblock. The gyro refinement block refines the error-ridden gyro data using the\nblur information from the input image. On the other hand, the gyro deblurring\nblock removes blur from the input image using the refined gyro data and further\ncompensates for gyro error by leveraging the blur information from the input\nimage. For training a neural network with erroneous gyro data, we propose a\ntraining strategy based on the curriculum learning. We also introduce a novel\ngyro data embedding scheme to represent real-world intricate camera shakes.\nFinally, we present a synthetic dataset and a real dataset for the training and\nevaluation of gyro-based single image deblurring. 
Our experiments demonstrate\nthat our approach achieves state-of-the-art deblurring quality by effectively\nutilizing erroneous gyro data.\n","authors":["Heemin Yang","Jaesung Rim","Seungyong Lee","Seung-Hwan Baek","Sunghyun Cho"],"pdf_url":"https://arxiv.org/pdf/2404.00916v2.pdf","comment":"14 pages, 11 figures"},{"id":"http://arxiv.org/abs/2404.05274v1","updated":"2024-04-08T08:04:44Z","published":"2024-04-08T08:04:44Z","title":"Deep Optics for Video Snapshot Compressive Imaging","summary":" Video snapshot compressive imaging (SCI) aims to capture a sequence of video\nframes with only a single shot of a 2D detector, whose backbones rest in\noptical modulation patterns (also known as masks) and a computational\nreconstruction algorithm. Advanced deep learning algorithms and mature hardware\nare putting video SCI into practical applications. Yet, there are two clouds in\nthe sunshine of SCI: i) low dynamic range as a victim of high temporal\nmultiplexing, and ii) existing deep learning algorithms' degradation on real\nsystem. To address these challenges, this paper presents a deep optics\nframework to jointly optimize masks and a reconstruction network. Specifically,\nwe first propose a new type of structural mask to realize motion-aware and\nfull-dynamic-range measurement. Considering the motion awareness property in\nmeasurement domain, we develop an efficient network for video SCI\nreconstruction using Transformer to capture long-term temporal dependencies,\ndubbed Res2former. Moreover, sensor response is introduced into the forward\nmodel of video SCI to guarantee end-to-end model training close to real system.\nFinally, we implement the learned structural masks on a digital micro-mirror\ndevice. Experimental results on synthetic and real data validate the\neffectiveness of the proposed framework. We believe this is a milestone for\nreal-world video SCI. The source code and data are available at\nhttps://github.com/pwangcs/DeepOpticsSCI.\n","authors":["Ping Wang","Lishun Wang","Xin Yuan"],"pdf_url":"https://arxiv.org/pdf/2404.05274v1.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2404.05268v1","updated":"2024-04-08T07:59:04Z","published":"2024-04-08T07:59:04Z","title":"MC$^2$: Multi-concept Guidance for Customized Multi-concept Generation","summary":" Customized text-to-image generation aims to synthesize instantiations of\nuser-specified concepts and has achieved unprecedented progress in handling\nindividual concept. However, when extending to multiple customized concepts,\nexisting methods exhibit limitations in terms of flexibility and fidelity, only\naccommodating the combination of limited types of models and potentially\nresulting in a mix of characteristics from different concepts. In this paper,\nwe introduce the Multi-concept guidance for Multi-concept customization, termed\nMC$^2$, for improved flexibility and fidelity. MC$^2$ decouples the\nrequirements for model architecture via inference time optimization, allowing\nthe integration of various heterogeneous single-concept customized models. It\nadaptively refines the attention weights between visual and textual tokens,\ndirecting image regions to focus on their associated words while diminishing\nthe impact of irrelevant ones. Extensive experiments demonstrate that MC$^2$\neven surpasses previous methods that require additional training in terms of\nconsistency with input prompt and reference images. 
Moreover, MC$^2$ can be\nextended to elevate the compositional capabilities of text-to-image generation,\nyielding appealing results. Code will be publicly available at\nhttps://github.com/JIANGJiaXiu/MC-2.\n","authors":["Jiaxiu Jiang","Yabo Zhang","Kailai Feng","Xiaohe Wu","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.05268v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05264v1","updated":"2024-04-08T07:54:18Z","published":"2024-04-08T07:54:18Z","title":"Unbridled Icarus: A Survey of the Potential Perils of Image Inputs in\n Multimodal Large Language Model Security","summary":" Multimodal Large Language Models (MLLMs) demonstrate remarkable capabilities\nthat increasingly influence various aspects of our daily lives, constantly\ndefining the new boundary of Artificial General Intelligence (AGI). Image\nmodalities, enriched with profound semantic information and a more continuous\nmathematical nature compared to other modalities, greatly enhance the\nfunctionalities of MLLMs when integrated. However, this integration serves as a\ndouble-edged sword, providing attackers with expansive vulnerabilities to\nexploit for highly covert and harmful attacks. The pursuit of reliable AI\nsystems like powerful MLLMs has emerged as a pivotal area of contemporary\nresearch. In this paper, we endeavor to demonstrate the multifaceted risks\nassociated with the incorporation of image modalities into MLLMs. Initially, we\ndelineate the foundational components and training processes of MLLMs.\nSubsequently, we construct a threat model, outlining the security\nvulnerabilities intrinsic to MLLMs. Moreover, we analyze and summarize existing\nscholarly discourses on MLLMs' attack and defense mechanisms, culminating in\nsuggestions for the future research on MLLM security. Through this\ncomprehensive analysis, we aim to deepen the academic understanding of MLLM\nsecurity challenges and propel forward the development of trustworthy MLLM\nsystems.\n","authors":["Yihe Fan","Yuxin Cao","Ziyu Zhao","Ziyao Liu","Shaofeng Li"],"pdf_url":"https://arxiv.org/pdf/2404.05264v1.pdf","comment":"8 pages, 1 figure"},{"id":"http://arxiv.org/abs/2404.00936v3","updated":"2024-04-08T07:52:38Z","published":"2024-04-01T05:46:15Z","title":"A Comprehensive Review of Knowledge Distillation in Computer Vision","summary":" Deep learning techniques have been demonstrated to surpass preceding\ncutting-edge machine learning techniques in recent years, with computer vision\nbeing one of the most prominent examples. However, deep learning models suffer\nfrom significant drawbacks when deployed in resource-constrained environments\ndue to their large model size and high complexity. Knowledge Distillation is\none of the prominent solutions to overcome this challenge. This review paper\nexamines the current state of research on knowledge distillation, a technique\nfor compressing complex models into smaller and simpler ones. The paper\nprovides an overview of the major principles and techniques associated with\nknowledge distillation and reviews the applications of knowledge distillation\nin the domain of computer vision. 
The review focuses on the benefits of\nknowledge distillation, as well as the problems that must be overcome to\nimprove its effectiveness.\n","authors":["Sheikh Musa Kaleem","Tufail Rouf","Gousia Habib","Tausifa jan Saleem","Brejesh Lall"],"pdf_url":"https://arxiv.org/pdf/2404.00936v3.pdf","comment":"36 pages, 10 figures"},{"id":"http://arxiv.org/abs/2309.03467v2","updated":"2024-04-08T07:49:47Z","published":"2023-09-07T03:22:59Z","title":"Autoregressive Omni-Aware Outpainting for Open-Vocabulary 360-Degree\n Image Generation","summary":" A 360-degree (omni-directional) image provides an all-encompassing spherical\nview of a scene. Recently, there has been an increasing interest in\nsynthesising 360-degree images from conventional narrow field of view (NFoV)\nimages captured by digital cameras and smartphones, for providing immersive\nexperiences in various scenarios such as virtual reality. Yet, existing methods\ntypically fall short in synthesizing intricate visual details or ensuring that the\ngenerated images align consistently with user-provided prompts. In this study,\nautoregressive omni-aware generative network (AOG-Net) is proposed for\n360-degree image generation by out-painting an incomplete 360-degree image\nprogressively with NFoV and text guidances jointly or individually. This\nautoregressive scheme not only allows for deriving finer-grained and\ntext-consistent patterns by dynamically generating and adjusting the process\nbut also offers users greater flexibility to edit their conditions throughout\nthe generation process. A global-local conditioning mechanism is devised to\ncomprehensively formulate the outpainting guidance in each autoregressive step.\nText guidances, omni-visual cues, NFoV inputs and omni-geometry are encoded and\nfurther formulated with cross-attention based transformers into a global stream\nand a local stream into a conditioned generative backbone model. As AOG-Net is\ncompatible with leveraging large-scale models for the conditional encoder and the\ngenerative prior, it enables the generation to use extensive open-vocabulary\ntext guidances. Comprehensive experiments on two commonly used 360-degree image\ndatasets for both indoor and outdoor settings demonstrate the state-of-the-art\nperformance of our proposed method. Our code will be made publicly available.\n","authors":["Zhuqiang Lu","Kun Hu","Chaoyue Wang","Lei Bai","Zhiyong Wang"],"pdf_url":"https://arxiv.org/pdf/2309.03467v2.pdf","comment":"Accepted by AAAI 24"},{"id":"http://arxiv.org/abs/2404.05258v1","updated":"2024-04-08T07:47:28Z","published":"2024-04-08T07:47:28Z","title":"Unsupervised Band Selection Using Fused HSI and LiDAR Attention\n Integrating With Autoencoder","summary":" Band selection in hyperspectral imaging (HSI) is critical for optimising data\nprocessing and enhancing analytical accuracy. Traditional approaches have\npredominantly concentrated on analysing spectral and pixel characteristics\nwithin individual bands independently. These approaches overlook the potential\nbenefits of integrating multiple data sources, such as Light Detection and\nRanging (LiDAR), and are further challenged by the limited availability of\nlabeled data in HSI processing, which represents a significant obstacle. To\naddress these challenges, this paper introduces a novel unsupervised band\nselection framework that incorporates attention mechanisms and an Autoencoder\nfor reconstruction-based band selection. 
Our methodology distinctively\nintegrates HSI with LiDAR data through an attention score, using a\nconvolutional Autoencoder to process the combined feature mask. This fusion\neffectively captures essential spatial and spectral features and reduces\nredundancy in hyperspectral datasets. A comprehensive comparative analysis of\nour innovative fused band selection approach is performed against existing\nunsupervised band selection and fusion models. We used data sets such as\nHouston 2013, Trento, and MUUFLE for our experiments. The results demonstrate\nthat our method achieves superior classification accuracy and significantly\noutperforms existing models. This enhancement in HSI band selection,\nfacilitated by the incorporation of LiDAR features, underscores the\nconsiderable advantages of integrating features from different sources.\n","authors":["Judy X Yang","Jun Zhou","Jing Wang","Hui Tian","Alan Wee Chung Liew"],"pdf_url":"https://arxiv.org/pdf/2404.05258v1.pdf","comment":"13 pages, 13figures, 6 tables"},{"id":"http://arxiv.org/abs/2404.05256v1","updated":"2024-04-08T07:43:23Z","published":"2024-04-08T07:43:23Z","title":"Text-to-Image Synthesis for Any Artistic Styles: Advancements in\n Personalized Artistic Image Generation via Subdivision and Dual Binding","summary":" Recent advancements in text-to-image models, such as Stable Diffusion, have\ndemonstrated their ability to synthesize visual images through natural language\nprompts. One approach of personalizing text-to-image models, exemplified by\nDreamBooth, fine-tunes the pre-trained model by binding unique text identifiers\nwith a few images of a specific subject. Although existing fine-tuning methods\nhave demonstrated competence in rendering images according to the styles of\nfamous painters, it is still challenging to learn to produce images\nencapsulating distinct art styles due to abstract and broad visual perceptions\nof stylistic attributes such as lines, shapes, textures, and colors. In this\npaper, we introduce a new method, Single-StyleForge, for personalization. It\nfine-tunes pre-trained text-to-image diffusion models to generate diverse\nimages in specified styles from text prompts. By using around 15-20 images of\nthe target style, the approach establishes a foundational binding of a unique\ntoken identifier with a broad range of the target style. It also utilizes\nauxiliary images to strengthen this binding, resulting in offering specific\nguidance on representing elements such as persons in a target style-consistent\nmanner. In addition, we present ways to improve the quality of style and\ntext-image alignment through a method called Multi-StyleForge, which inherits\nthe strategy used in StyleForge and learns tokens in multiple. 
Experimental\nevaluation conducted on six distinct artistic styles demonstrates substantial\nimprovements in both the quality of generated images and the perceptual\nfidelity metrics, such as FID, KID, and CLIP scores.\n","authors":["Junseo Park","Beomseok Ko","Hyeryung Jang"],"pdf_url":"https://arxiv.org/pdf/2404.05256v1.pdf","comment":"20 pages, 12 figuers"},{"id":"http://arxiv.org/abs/2404.05253v1","updated":"2024-04-08T07:34:39Z","published":"2024-04-08T07:34:39Z","title":"CodeEnhance: A Codebook-Driven Approach for Low-Light Image Enhancement","summary":" Low-light image enhancement (LLIE) aims to improve low-illumination images.\nHowever, existing methods face two challenges: (1) uncertainty in restoration\nfrom diverse brightness degradations; (2) loss of texture and color information\ncaused by noise suppression and light enhancement. In this paper, we propose a\nnovel enhancement approach, CodeEnhance, by leveraging quantized priors and\nimage refinement to address these challenges. In particular, we reframe LLIE as\nlearning an image-to-code mapping from low-light images to discrete codebook,\nwhich has been learned from high-quality images. To enhance this process, a\nSemantic Embedding Module (SEM) is introduced to integrate semantic information\nwith low-level features, and a Codebook Shift (CS) mechanism, designed to adapt\nthe pre-learned codebook to better suit the distinct characteristics of our\nlow-light dataset. Additionally, we present an Interactive Feature\nTransformation (IFT) module to refine texture and color information during\nimage reconstruction, allowing for interactive enhancement based on user\npreferences. Extensive experiments on both real-world and synthetic benchmarks\ndemonstrate that the incorporation of prior knowledge and controllable\ninformation transfer significantly enhances LLIE performance in terms of\nquality and fidelity. The proposed CodeEnhance exhibits superior robustness to\nvarious degradations, including uneven illumination, noise, and color\ndistortion.\n","authors":["Xu Wu","XianXu Hou","Zhihui Lai","Jie Zhou","Ya-nan Zhang","Witold Pedrycz","Linlin Shen"],"pdf_url":"https://arxiv.org/pdf/2404.05253v1.pdf","comment":"10 pages, 13 figures"},{"id":"http://arxiv.org/abs/2312.03203v3","updated":"2024-04-08T07:19:52Z","published":"2023-12-06T00:46:30Z","title":"Feature 3DGS: Supercharging 3D Gaussian Splatting to Enable Distilled\n Feature Fields","summary":" 3D scene representations have gained immense popularity in recent years.\nMethods that use Neural Radiance fields are versatile for traditional tasks\nsuch as novel view synthesis. In recent times, some work has emerged that aims\nto extend the functionality of NeRF beyond view synthesis, for semantically\naware tasks such as editing and segmentation using 3D feature field\ndistillation from 2D foundation models. However, these methods have two major\nlimitations: (a) they are limited by the rendering speed of NeRF pipelines, and\n(b) implicitly represented feature fields suffer from continuity artifacts\nreducing feature quality. Recently, 3D Gaussian Splatting has shown\nstate-of-the-art performance on real-time radiance field rendering. In this\nwork, we go one step further: in addition to radiance field rendering, we\nenable 3D Gaussian splatting on arbitrary-dimension semantic features via 2D\nfoundation model distillation. 
This translation is not straightforward: naively\nincorporating feature fields in the 3DGS framework encounters significant\nchallenges, notably the disparities in spatial resolution and channel\nconsistency between RGB images and feature maps. We propose architectural and\ntraining changes to efficiently avert this problem. Our proposed method is\ngeneral, and our experiments showcase novel view semantic segmentation,\nlanguage-guided editing and segment anything through learning feature fields\nfrom state-of-the-art 2D foundation models such as SAM and CLIP-LSeg. Across\nexperiments, our distillation method is able to provide comparable or better\nresults, while being significantly faster to both train and render.\nAdditionally, to the best of our knowledge, we are the first method to enable\npoint and bounding-box prompting for radiance field manipulation, by leveraging\nthe SAM model. Project website at: https://feature-3dgs.github.io/\n","authors":["Shijie Zhou","Haoran Chang","Sicheng Jiang","Zhiwen Fan","Zehao Zhu","Dejia Xu","Pradyumna Chari","Suya You","Zhangyang Wang","Achuta Kadambi"],"pdf_url":"https://arxiv.org/pdf/2312.03203v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05238v1","updated":"2024-04-08T07:09:15Z","published":"2024-04-08T07:09:15Z","title":"Allowing humans to interactively guide machines where to look does not\n always improve a human-AI team's classification accuracy","summary":" Via thousands of papers in Explainable AI (XAI), attention maps\n\\cite{vaswani2017attention} and feature attribution maps \\cite{bansal2020sam}\nhave been established as a common means for explaining the input features that\nare important to AI's decisions. It is an interesting but unexplored question\nwhether allowing users to edit the importance scores of input features at test\ntime would improve the human-AI team's accuracy on downstream tasks. In this\npaper, we address this question by taking CHM-Corr, a state-of-the-art,\nante-hoc explanation method \\cite{taesiri2022visual} that first predicts\npatch-wise correspondences between the input and the training-set images, and\nthen uses them to make classification decisions. We build an interactive\ninterface on top of CHM-Corr, enabling users to directly edit the initial\nfeature attribution map provided by CHM-Corr. Via our CHM-Corr++ interface,\nusers gain insights into if, when, and how the model changes its outputs,\nenhancing understanding beyond static explanations. Our user study with 18\nmachine learning researchers who performed $\\sim$1,400 decisions shows that our\ninteractive approach does not improve user accuracy on CUB-200 bird image\nclassification over static explanations. This challenges the belief that\ninteractivity inherently boosts XAI\neffectiveness~\\cite{sokol2020one,sun2022exploring,shen2024towards,singh2024rethinking,mindlin2024beyond,lakkaraju2022rethinking,cheng2019explaining,liu2021understanding}\nand raises needs for future research. Our work contributes to the field by\nopen-sourcing an interactive tool for manipulating model attention, and it lays\nthe groundwork for future research to enable effective human-AI interaction in\ncomputer vision. We release code and data on\n\\href{https://anonymous.4open.science/r/CHMCorrPlusPlus/}{github}. Our\ninterface are available \\href{http://137.184.82.109:7080/}{here}.\n","authors":["Giang Nguyen","Mohammad Reza Taesiri","Sunnie S. Y. 
Kim","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.05238v1.pdf","comment":"Accepted for presentation at the XAI4CV Workshop, part of the CVPR\n 2024 proceedings"},{"id":"http://arxiv.org/abs/2312.07246v2","updated":"2024-04-08T07:07:02Z","published":"2023-12-12T13:22:44Z","title":"Unifying Correspondence, Pose and NeRF for Pose-Free Novel View\n Synthesis from Stereo Pairs","summary":" This work delves into the task of pose-free novel view synthesis from stereo\npairs, a challenging and pioneering task in 3D vision. Our innovative\nframework, unlike any before, seamlessly integrates 2D correspondence matching,\ncamera pose estimation, and NeRF rendering, fostering a synergistic enhancement\nof these tasks. We achieve this through designing an architecture that utilizes\na shared representation, which serves as a foundation for enhanced 3D geometry\nunderstanding. Capitalizing on the inherent interplay between the tasks, our\nunified framework is trained end-to-end with the proposed training strategy to\nimprove overall model accuracy. Through extensive evaluations across diverse\nindoor and outdoor scenes from two real-world datasets, we demonstrate that our\napproach achieves substantial improvement over previous methodologies,\nespecially in scenarios characterized by extreme viewpoint changes and the\nabsence of accurate camera poses.\n","authors":["Sunghwan Hong","Jaewoo Jung","Heeseong Shin","Jiaolong Yang","Seungryong Kim","Chong Luo"],"pdf_url":"https://arxiv.org/pdf/2312.07246v2.pdf","comment":"Project page: https://ku-cvlab.github.io/CoPoNeRF/ CVPR2024 camera\n ready version (Highlight)"},{"id":"http://arxiv.org/abs/2404.05236v1","updated":"2024-04-08T07:01:42Z","published":"2024-04-08T07:01:42Z","title":"Stylizing Sparse-View 3D Scenes with Hierarchical Neural Representation","summary":" Recently, a surge of 3D style transfer methods has been proposed that\nleverage the scene reconstruction power of a pre-trained neural radiance field\n(NeRF). To successfully stylize a scene this way, one must first reconstruct a\nphoto-realistic radiance field from collected images of the scene. However,\nwhen only sparse input views are available, pre-trained few-shot NeRFs often\nsuffer from high-frequency artifacts, which are generated as a by-product of\nhigh-frequency details for improving reconstruction quality. Is it possible to\ngenerate more faithful stylized scenes from sparse inputs by directly\noptimizing encoding-based scene representation with target style? In this\npaper, we consider the stylization of sparse-view scenes in terms of\ndisentangling content semantics and style textures. We propose a coarse-to-fine\nsparse-view scene stylization framework, where a novel hierarchical\nencoding-based neural representation is designed to generate high-quality\nstylized scenes directly from implicit scene representations. We also propose a\nnew optimization strategy with content strength annealing to achieve realistic\nstylization and better content preservation. Extensive experiments demonstrate\nthat our method can achieve high-quality stylization of sparse-view scenes and\noutperforms fine-tuning-based baselines in terms of stylization quality and\nefficiency.\n","authors":["Y. Wang","A. Gao","Y. Gong","Y. 
Zeng"],"pdf_url":"https://arxiv.org/pdf/2404.05236v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05231v1","updated":"2024-04-08T06:53:30Z","published":"2024-04-08T06:53:30Z","title":"PromptAD: Learning Prompts with only Normal Samples for Few-Shot Anomaly\n Detection","summary":" The vision-language model has brought great improvement to few-shot\nindustrial anomaly detection, which usually needs to design of hundreds of\nprompts through prompt engineering. For automated scenarios, we first use\nconventional prompt learning with many-class paradigm as the baseline to\nautomatically learn prompts but found that it can not work well in one-class\nanomaly detection. To address the above problem, this paper proposes a\none-class prompt learning method for few-shot anomaly detection, termed\nPromptAD. First, we propose semantic concatenation which can transpose normal\nprompts into anomaly prompts by concatenating normal prompts with anomaly\nsuffixes, thus constructing a large number of negative samples used to guide\nprompt learning in one-class setting. Furthermore, to mitigate the training\nchallenge caused by the absence of anomaly images, we introduce the concept of\nexplicit anomaly margin, which is used to explicitly control the margin between\nnormal prompt features and anomaly prompt features through a hyper-parameter.\nFor image-level/pixel-level anomaly detection, PromptAD achieves first place in\n11/12 few-shot settings on MVTec and VisA.\n","authors":["Xiaofan Li","Zhizhong Zhang","Xin Tan","Chengwei Chen","Yanyun Qu","Yuan Xie","Lizhuang Ma"],"pdf_url":"https://arxiv.org/pdf/2404.05231v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2404.05225v1","updated":"2024-04-08T06:40:28Z","published":"2024-04-08T06:40:28Z","title":"LayoutLLM: Layout Instruction Tuning with Large Language Models for\n Document Understanding","summary":" Recently, leveraging large language models (LLMs) or multimodal large\nlanguage models (MLLMs) for document understanding has been proven very\npromising. However, previous works that employ LLMs/MLLMs for document\nunderstanding have not fully explored and utilized the document layout\ninformation, which is vital for precise document understanding. In this paper,\nwe propose LayoutLLM, an LLM/MLLM based method for document understanding. The\ncore of LayoutLLM is a layout instruction tuning strategy, which is specially\ndesigned to enhance the comprehension and utilization of document layouts. The\nproposed layout instruction tuning strategy consists of two components:\nLayout-aware Pre-training and Layout-aware Supervised Fine-tuning. To capture\nthe characteristics of document layout in Layout-aware Pre-training, three\ngroups of pre-training tasks, corresponding to document-level, region-level and\nsegment-level information, are introduced. Furthermore, a novel module called\nlayout chain-of-thought (LayoutCoT) is devised to enable LayoutLLM to focus on\nregions relevant to the question and generate accurate answers. LayoutCoT is\neffective for boosting the performance of document understanding. Meanwhile, it\nbrings a certain degree of interpretability, which could facilitate manual\ninspection and correction. Experiments on standard benchmarks show that the\nproposed LayoutLLM significantly outperforms existing methods that adopt\nopen-source 7B LLMs/MLLMs for document understanding. 
The training data of the\nLayoutLLM is publicly available at\nhttps://github.com/AlibabaResearch/AdvancedLiterateMachinery/tree/main/DocumentUnderstanding/LayoutLLM\n","authors":["Chuwei Luo","Yufan Shen","Zhaoqing Zhu","Qi Zheng","Zhi Yu","Cong Yao"],"pdf_url":"https://arxiv.org/pdf/2404.05225v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.05220v1","updated":"2024-04-08T06:32:11Z","published":"2024-04-08T06:32:11Z","title":"StylizedGS: Controllable Stylization for 3D Gaussian Splatting","summary":" With the rapid development of XR, 3D generation and editing are becoming more\nand more important, among which, stylization is an important tool of 3D\nappearance editing. It can achieve consistent 3D artistic stylization given a\nsingle reference style image and thus is a user-friendly editing way. However,\nrecent NeRF-based 3D stylization methods face efficiency issues that affect the\nactual user experience and the implicit nature limits its ability to transfer\nthe geometric pattern styles. Additionally, the ability for artists to exert\nflexible control over stylized scenes is considered highly desirable, fostering\nan environment conducive to creative exploration. In this paper, we introduce\nStylizedGS, a 3D neural style transfer framework with adaptable control over\nperceptual factors based on 3D Gaussian Splatting (3DGS) representation. The\n3DGS brings the benefits of high efficiency. We propose a GS filter to\neliminate floaters in the reconstruction which affects the stylization effects\nbefore stylization. Then the nearest neighbor-based style loss is introduced to\nachieve stylization by fine-tuning the geometry and color parameters of 3DGS,\nwhile a depth preservation loss with other regularizations is proposed to\nprevent the tampering of geometry content. Moreover, facilitated by specially\ndesigned losses, StylizedGS enables users to control color, stylized scale and\nregions during the stylization to possess customized capabilities. Our method\ncan attain high-quality stylization results characterized by faithful\nbrushstrokes and geometric consistency with flexible controls. Extensive\nexperiments across various scenes and styles demonstrate the effectiveness and\nefficiency of our method concerning both stylization quality and inference FPS.\n","authors":["Dingxi Zhang","Zhuoxun Chen","Yu-Jie Yuan","Fang-Lue Zhang","Zhenliang He","Shiguang Shan","Lin Gao"],"pdf_url":"https://arxiv.org/pdf/2404.05220v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05773v2","updated":"2024-04-08T06:28:13Z","published":"2024-02-08T16:00:25Z","title":"UAV-Rain1k: A Benchmark for Raindrop Removal from UAV Aerial Imagery","summary":" Raindrops adhering to the lens of UAVs can obstruct visibility of the\nbackground scene and degrade image quality. Despite recent progress in image\nderaining methods and datasets, there is a lack of focus on raindrop removal\nfrom UAV aerial imagery due to the unique challenges posed by varying angles\nand rapid movement during drone flight. To fill the gap in this research, we\nfirst construct a new benchmark dataset for removing raindrops from UAV images,\ncalled UAV-Rain1k. In this letter, we provide a dataset generation pipeline,\nwhich includes modeling raindrop shapes using Blender, collecting background\nimages from various UAV angles, random sampling of rain masks and etc. 
Based on\nthe proposed benchmark, we further present a comprehensive evaluation of\nexisting representative image deraining algorithms, and reveal future research\nopportunities worth exploring. The proposed dataset is publicly available at\nhttps://github.com/cschenxiang/UAV-Rain1k.\n","authors":["Wenhui Chang","Hongming Chen","Xin He","Xiang Chen","Liangduo Shen"],"pdf_url":"https://arxiv.org/pdf/2402.05773v2.pdf","comment":"Accepted by IEEE/CVF Conference on Computer Vision and Pattern\n Recognition Workshops (CVPRW) 2024"},{"id":"http://arxiv.org/abs/2312.17118v3","updated":"2024-04-08T06:23:12Z","published":"2023-12-28T16:54:53Z","title":"Fully Sparse 3D Occupancy Prediction","summary":" Occupancy prediction plays a pivotal role in autonomous driving. Previous\nmethods typically construct dense 3D volumes, neglecting the inherent sparsity\nof the scene and suffering high computational costs. To bridge the gap, we\nintroduce a novel fully sparse occupancy network, termed SparseOcc. SparseOcc\ninitially reconstructs a sparse 3D representation from visual inputs and\nsubsequently predicts semantic/instance occupancy from the 3D sparse\nrepresentation by sparse queries. A mask-guided sparse sampling is designed to\nenable sparse queries to interact with 2D features in a fully sparse manner,\nthereby circumventing costly dense features or global attention. Additionally,\nwe design a thoughtful ray-based evaluation metric, namely RayIoU, to solve the\ninconsistency penalty along depths raised in traditional voxel-level mIoU\ncriteria. SparseOcc demonstrates its effectiveness by achieving a RayIoU of\n34.0, while maintaining a real-time inference speed of 17.3 FPS, with 7 history\nframes inputs. By incorporating more preceding frames to 15, SparseOcc\ncontinuously improves its performance to 35.1 RayIoU without whistles and\nbells. Code is available at https://github.com/MCG-NJU/SparseOcc.\n","authors":["Haisong Liu","Yang Chen","Haiguang Wang","Zetong Yang","Tianyu Li","Jia Zeng","Li Chen","Hongyang Li","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2312.17118v3.pdf","comment":"Add new metric: RayIoU"},{"id":"http://arxiv.org/abs/2404.05218v1","updated":"2024-04-08T06:15:13Z","published":"2024-04-08T06:15:13Z","title":"Multi-agent Long-term 3D Human Pose Forecasting via Interaction-aware\n Trajectory Conditioning","summary":" Human pose forecasting garners attention for its diverse applications.\nHowever, challenges in modeling the multi-modal nature of human motion and\nintricate interactions among agents persist, particularly with longer\ntimescales and more agents. In this paper, we propose an interaction-aware\ntrajectory-conditioned long-term multi-agent human pose forecasting model,\nutilizing a coarse-to-fine prediction approach: multi-modal global trajectories\nare initially forecasted, followed by respective local pose forecasts\nconditioned on each mode. In doing so, our Trajectory2Pose model introduces a\ngraph-based agent-wise interaction module for a reciprocal forecast of local\nmotion-conditioned global trajectory and trajectory-conditioned local pose. Our\nmodel effectively handles the multi-modality of human motion and the complexity\nof long-term multi-agent interactions, improving performance in complex\nenvironments. 
Furthermore, we address the lack of long-term (6s+) multi-agent\n(5+) datasets by constructing a new dataset from real-world images and 2D\nannotations, enabling a comprehensive evaluation of our proposed model.\nState-of-the-art prediction performance on both complex and simpler datasets\nconfirms the generalized effectiveness of our method. The code is available at\nhttps://github.com/Jaewoo97/T2P.\n","authors":["Jaewoo Jeong","Daehee Park","Kuk-Jin Yoon"],"pdf_url":"https://arxiv.org/pdf/2404.05218v1.pdf","comment":"2024 CVPR Highlight"},{"id":"http://arxiv.org/abs/2404.02135v3","updated":"2024-04-08T06:11:48Z","published":"2024-04-02T17:48:46Z","title":"Enhancing Ship Classification in Optical Satellite Imagery: Integrating\n Convolutional Block Attention Module with ResNet for Improved Performance","summary":" This study presents an advanced Convolutional Neural Network (CNN)\narchitecture for ship classification from optical satellite imagery,\nsignificantly enhancing performance through the integration of the\nConvolutional Block Attention Module (CBAM) and additional architectural\ninnovations. Building upon the foundational ResNet50 model, we first\nincorporated a standard CBAM to direct the model's focus towards more\ninformative features, achieving an accuracy of 87% compared to the baseline\nResNet50's 85%. Further augmentations involved multi-scale feature integration,\ndepthwise separable convolutions, and dilated convolutions, culminating in the\nEnhanced ResNet Model with Improved CBAM. This model demonstrated a remarkable\naccuracy of 95%, with precision, recall, and f1-scores all witnessing\nsubstantial improvements across various ship classes. The bulk carrier and oil\ntanker classes, in particular, showcased nearly perfect precision and recall\nrates, underscoring the model's enhanced capability in accurately identifying\nand classifying ships. Attention heatmap analyses further validated the\nimproved model's efficacy, revealing a more focused attention on relevant ship\nfeatures, regardless of background complexities. These findings underscore the\npotential of integrating attention mechanisms and architectural innovations in\nCNNs for high-resolution satellite imagery classification. The study navigates\nthrough the challenges of class imbalance and computational costs, proposing\nfuture directions towards scalability and adaptability in new or rare ship type\nrecognition. This research lays a groundwork for the application of advanced\ndeep learning techniques in the domain of remote sensing, offering insights\ninto scalable and efficient satellite image classification.\n","authors":["Ryan Donghan Kwon","Gangjoo Robin Nam","Jisoo Tak","Junseob Shin","Hyerin Cha","Yeom Hyeok","Seung Won Lee"],"pdf_url":"https://arxiv.org/pdf/2404.02135v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05215v1","updated":"2024-04-08T06:07:32Z","published":"2024-04-08T06:07:32Z","title":"Spatio-Temporal Attention and Gaussian Processes for Personalized Video\n Gaze Estimation","summary":" Gaze is an essential prompt for analyzing human behavior and attention.\nRecently, there has been an increasing interest in determining gaze direction\nfrom facial videos. However, video gaze estimation faces significant\nchallenges, such as understanding the dynamic evolution of gaze in video\nsequences, dealing with static backgrounds, and adapting to variations in\nillumination. 
To address these challenges, we propose a simple and novel deep\nlearning model designed to estimate gaze from videos, incorporating a\nspecialized attention module. Our method employs a spatial attention mechanism\nthat tracks spatial dynamics within videos. This technique enables accurate\ngaze direction prediction through a temporal sequence model, adeptly\ntransforming spatial observations into temporal insights, thereby significantly\nimproving gaze estimation accuracy. Additionally, our approach integrates\nGaussian processes to include individual-specific traits, facilitating the\npersonalization of our model with just a few labeled samples. Experimental\nresults confirm the efficacy of the proposed approach, demonstrating its\nsuccess in both within-dataset and cross-dataset settings. Specifically, our\nproposed approach achieves state-of-the-art performance on the Gaze360 dataset,\nimproving by $2.5^\\circ$ without personalization. Further, by personalizing the\nmodel with just three samples, we achieved an additional improvement of\n$0.8^\\circ$. The code and pre-trained models are available at\n\\url{https://github.com/jswati31/stage}.\n","authors":["Swati Jindal","Mohit Yadav","Roberto Manduchi"],"pdf_url":"https://arxiv.org/pdf/2404.05215v1.pdf","comment":"Accepted at CVPR 2024 Gaze workshop"},{"id":"http://arxiv.org/abs/2404.05212v1","updated":"2024-04-08T05:58:07Z","published":"2024-04-08T05:58:07Z","title":"DiffCJK: Conditional Diffusion Model for High-Quality and Wide-coverage\n CJK Character Generation","summary":" Chinese, Japanese, and Korean (CJK), with a vast number of native speakers,\nhas profound influence on society and culture. The typesetting of CJK languages\ncarries a wide range of requirements due to the complexity of their scripts and\nunique literary traditions. A critical aspect of this typesetting process is\nthat CJK fonts need to provide a set of consistent-looking glyphs for\napproximately one hundred thousand characters. However, creating such a font is\ninherently labor-intensive and expensive, which significantly hampers the\ndevelopment of new CJK fonts for typesetting, historical, aesthetic, or\nartistic purposes.\n To bridge this gap, we are motivated by recent advancements in\ndiffusion-based generative models and propose a novel diffusion method for\ngenerating glyphs in a targeted style from a \\emph{single} conditioned,\nstandard glyph form. Our experiments show that our method is capable of\ngenerating fonts of both printed and hand-written styles, the latter of which\npresents a greater challenge. Moreover, our approach shows remarkable zero-shot\ngeneralization capabilities for non-CJK but Chinese-inspired scripts. We also\nshow our method facilitates smooth style interpolation and generates bitmap\nimages suitable for vectorization, which is crucial in the font creation\nprocess. In summary, our proposed method opens the door to high-quality,\ngenerative model-assisted font creation for CJK characters, for both\ntypesetting and artistic endeavors.\n","authors":["Yingtao Tian"],"pdf_url":"https://arxiv.org/pdf/2404.05212v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05211v1","updated":"2024-04-08T05:50:46Z","published":"2024-04-08T05:50:46Z","title":"Multi-level Graph Subspace Contrastive Learning for Hyperspectral Image\n Clustering","summary":" Hyperspectral image (HSI) clustering is a challenging task due to its high\ncomplexity. 
Although subspace clustering shows impressive performance for HSI,\ntraditional methods tend to ignore the global-local interaction in HSI data. In\nthis study, we propose a multi-level graph subspace contrastive learning\n(MLGSC) framework for HSI clustering. The model is divided into the following main parts.\nGraph convolution subspace construction: utilizing spectral and texture\nfeatures to construct two graph convolution views. Local-global graph\nrepresentation: local graph representations were obtained by step-by-step\nconvolutions and a more representative global graph representation was obtained\nusing an attention-based pooling strategy. Multi-level graph subspace\ncontrastive learning: multi-level contrastive learning was conducted to obtain\nlocal-global joint graph representations, to improve the consistency of the\npositive samples between views, and to obtain more robust graph embeddings.\nSpecifically, graph-level contrastive learning is used to better learn global\nrepresentations of HSI data. Node-level intra-view and inter-view contrastive\nlearning is designed to learn joint representations of local regions of HSI.\nThe proposed model is evaluated on four popular HSI datasets: Indian Pines,\nPavia University, Houston, and Xu Zhou. The overall accuracies are 97.75%,\n99.96%, 92.28%, and 95.73%, significantly outperforming the current\nstate-of-the-art clustering methods.\n","authors":["Jingxin Wang","Renxiang Guan","Kainan Gao","Zihao Li","Hao Li","Xianju Li","Chang Tang"],"pdf_url":"https://arxiv.org/pdf/2404.05211v1.pdf","comment":"IJCNN 2024"},{"id":"http://arxiv.org/abs/2404.05210v1","updated":"2024-04-08T05:45:03Z","published":"2024-04-08T05:45:03Z","title":"Bidirectional Long-Range Parser for Sequential Data Understanding","summary":" The transformer is a powerful data modelling framework responsible for\nremarkable performance on a wide range of tasks. However, it is limited in\nterms of scalability as it is suboptimal and inefficient to process\nlong-sequence data. To this purpose we introduce BLRP (Bidirectional Long-Range\nParser), a novel and versatile attention mechanism designed to increase\nperformance and efficiency on long-sequence tasks. It leverages short and long\nrange heuristics in the form of a local sliding window approach combined with a\nglobal bidirectional latent space synthesis technique. We show the benefits and\nversatility of our approach on vision and language domains by demonstrating\ncompetitive results against state-of-the-art methods on the Long-Range-Arena\nand CIFAR benchmarks together with ablations demonstrating the computational\nefficiency.\n","authors":["George Leotescu","Daniel Voinea","Alin-Ionut Popa"],"pdf_url":"https://arxiv.org/pdf/2404.05210v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05207v1","updated":"2024-04-08T05:23:12Z","published":"2024-04-08T05:23:12Z","title":"iVPT: Improving Task-relevant Information Sharing in Visual Prompt\n Tuning by Cross-layer Dynamic Connection","summary":" Recent progress has shown great potential of visual prompt tuning (VPT) when\nadapting pre-trained vision transformers to various downstream tasks. However,\nmost existing solutions independently optimize prompts at each layer, thereby\nneglecting the usage of task-relevant information encoded in prompt tokens\nacross layers. Additionally, existing prompt structures are prone to\ninterference from task-irrelevant noise in input images, which can do harm to\nthe sharing of task-relevant information. 
In this paper, we propose a novel VPT\napproach, \\textbf{iVPT}. It innovatively incorporates a cross-layer dynamic\nconnection (CDC) for input prompt tokens from adjacent layers, enabling\neffective sharing of task-relevant information. Furthermore, we design a\ndynamic aggregation (DA) module that facilitates selective sharing of\ninformation between layers. The combination of CDC and DA enhances the\nflexibility of the attention process within the VPT framework. Building upon\nthese foundations, iVPT introduces an attentive reinforcement (AR) mechanism,\nby automatically identifying salient image tokens, which are further enhanced\nby prompt tokens in an additive manner. Extensive experiments on 24 image\nclassification and semantic segmentation benchmarks clearly demonstrate the\nadvantage of the proposed iVPT, compared to the state-of-the-art counterparts.\n","authors":["Nan Zhou","Jiaxin Chen","Di Huang"],"pdf_url":"https://arxiv.org/pdf/2404.05207v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05206v1","updated":"2024-04-08T05:19:28Z","published":"2024-04-08T05:19:28Z","title":"SoundingActions: Learning How Actions Sound from Narrated Egocentric\n Videos","summary":" We propose a novel self-supervised embedding to learn how actions sound from\nnarrated in-the-wild egocentric videos. Whereas existing methods rely on\ncurated data with known audio-visual correspondence, our multimodal\ncontrastive-consensus coding (MC3) embedding reinforces the associations\nbetween audio, language, and vision when all modality pairs agree, while\ndiminishing those associations when any one pair does not. We show our approach\ncan successfully discover how the long tail of human actions sound from\negocentric video, outperforming an array of recent multimodal embedding\ntechniques on two datasets (Ego4D and EPIC-Sounds) and multiple cross-modal\ntasks.\n","authors":["Changan Chen","Kumar Ashutosh","Rohit Girdhar","David Harwath","Kristen Grauman"],"pdf_url":"https://arxiv.org/pdf/2404.05206v1.pdf","comment":"Accepted at CVPR 2024. Project page:\n https://vision.cs.utexas.edu/projects/soundingactions"},{"id":"http://arxiv.org/abs/2404.05205v1","updated":"2024-04-08T05:18:39Z","published":"2024-04-08T05:18:39Z","title":"A secure and private ensemble matcher using multi-vault obfuscated\n templates","summary":" Given the irrevocability of biometric samples and mounting privacy concerns,\nbiometric template security and secure matching are among the essential\nfeatures of any well-designed modern biometric system. In this paper, we\npropose an obfuscation method that hides the biometric template information\nwith just enough chaff. The main idea is to reduce the number of chaff points\nto a practical level by creating n sub-templates from the original template and\nhiding each sub-template with m chaff points. During verification, s closest\nvectors to the biometric query are retrieved from each vault and then combined\nto generate hash values that are compared with the stored hash value. We\ndemonstrate the effectiveness of synthetic facial images, generated by a\nGenerative Adversarial Network (GAN), as ``random chaff points'' within a\nsecure-vault authorization system. This approach safeguards user identities\nduring training and deployment. We tested our protocol using the AT&T, GT, and\nLFW face datasets, with the ROC areas under the curve being 0.99, 0.99, and\n0.90, respectively. 
These numbers were close to those of the unprotected\ntemplates, showing that our method does not adversely affect accuracy.\n","authors":["Babak Poorebrahim Gilkalaye","Shubhabrata Mukherjee","Reza Derakhshani"],"pdf_url":"https://arxiv.org/pdf/2404.05205v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11825v2","updated":"2024-04-08T05:11:47Z","published":"2023-11-20T15:03:56Z","title":"Holistic Inverse Rendering of Complex Facade via Aerial 3D Scanning","summary":" In this work, we use multi-view aerial images to reconstruct the geometry,\nlighting, and material of facades using neural signed distance fields (SDFs).\nWithout the requirement of complex equipment, our method only takes simple RGB\nimages captured by a drone as inputs to enable physically based and\nphotorealistic novel-view rendering, relighting, and editing. However, a\nreal-world facade usually has complex appearances ranging from diffuse rocks\nwith subtle details to large-area glass windows with specular reflections,\nmaking it hard to attend to everything. As a result, previous methods can\npreserve the geometry details but fail to reconstruct smooth glass windows or\nvice versa. In order to address this challenge, we introduce three spatial- and\nsemantic-adaptive optimization strategies, including a semantic regularization\napproach based on zero-shot segmentation techniques to improve material\nconsistency, a frequency-aware geometry regularization to balance surface\nsmoothness and details in different surfaces, and a visibility probe-based\nscheme to enable efficient modeling of the local lighting in large-scale\noutdoor environments. In addition, we capture a real-world facade aerial 3D\nscanning image set and corresponding point clouds for training and\nbenchmarking. The experiment demonstrates the superior quality of our method on\nfacade holistic inverse rendering, novel view synthesis, and scene editing\ncompared to state-of-the-art baselines.\n","authors":["Zixuan Xie","Rengan Xie","Rong Li","Kai Huang","Pengju Qiao","Jingsen Zhu","Xu Yin","Qi Ye","Wei Hua","Yuchi Huo","Hujun Bao"],"pdf_url":"https://arxiv.org/pdf/2311.11825v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01518v3","updated":"2024-04-08T05:09:19Z","published":"2024-04-01T22:53:47Z","title":"Temporally Consistent Unbalanced Optimal Transport for Unsupervised\n Action Segmentation","summary":" We propose a novel approach to the action segmentation task for long,\nuntrimmed videos, based on solving an optimal transport problem. By encoding a\ntemporal consistency prior into a Gromov-Wasserstein problem, we are able to\ndecode a temporally consistent segmentation from a noisy affinity/matching cost\nmatrix between video frames and action classes. Unlike previous approaches, our\nmethod does not require knowing the action order for a video to attain temporal\nconsistency. Furthermore, our resulting (fused) Gromov-Wasserstein problem can\nbe efficiently solved on GPUs using a few iterations of projected mirror\ndescent. We demonstrate the effectiveness of our method in an unsupervised\nlearning setting, where our method is used to generate pseudo-labels for\nself-training. 
We evaluate our segmentation approach and unsupervised learning\npipeline on the Breakfast, 50-Salads, YouTube Instructions and Desktop Assembly\ndatasets, yielding state-of-the-art results for the unsupervised video action\nsegmentation task.\n","authors":["Ming Xu","Stephen Gould"],"pdf_url":"https://arxiv.org/pdf/2404.01518v3.pdf","comment":"Accepted to CVPR 2024 (Oral)"},{"id":"http://arxiv.org/abs/2404.05196v1","updated":"2024-04-08T04:53:29Z","published":"2024-04-08T04:53:29Z","title":"HSViT: Horizontally Scalable Vision Transformer","summary":" While the Vision Transformer (ViT) architecture gains prominence in computer\nvision and attracts significant attention from multimedia communities, its\ndeficiency in prior knowledge (inductive bias) regarding shift, scale, and\nrotational invariance necessitates pre-training on large-scale datasets.\nFurthermore, the growing layers and parameters in both ViT and convolutional\nneural networks (CNNs) impede their applicability to mobile multimedia\nservices, primarily owing to the constrained computational resources on edge\ndevices. To mitigate the aforementioned challenges, this paper introduces a\nnovel horizontally scalable vision transformer (HSViT). Specifically, a novel\nimage-level feature embedding allows ViT to better leverage the inductive bias\ninherent in the convolutional layers. Based on this, an innovative horizontally\nscalable architecture is designed, which reduces the number of layers and\nparameters of the models while facilitating collaborative training and\ninference of ViT models across multiple nodes. The experimental results depict\nthat, without pre-training on large-scale datasets, HSViT achieves up to 10%\nhigher top-1 accuracy than state-of-the-art schemes, ascertaining its superior\npreservation of inductive bias. The code is available at\nhttps://github.com/xuchenhao001/HSViT.\n","authors":["Chenhao Xu","Chang-Tsun Li","Chee Peng Lim","Douglas Creighton"],"pdf_url":"https://arxiv.org/pdf/2404.05196v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05187v1","updated":"2024-04-08T04:27:36Z","published":"2024-04-08T04:27:36Z","title":"LGSDF: Continual Global Learning of Signed Distance Fields Aided by\n Local Updating","summary":" Implicit reconstruction of ESDF (Euclidean Signed Distance Field) involves\ntraining a neural network to regress the signed distance from any point to the\nnearest obstacle, which has the advantages of lightweight storage and\ncontinuous querying. However, existing algorithms usually rely on conflicting\nraw observations as training data, resulting in poor map performance. In this\npaper, we propose LGSDF, an ESDF continual Global learning algorithm aided by\nLocal updating. At the front end, axis-aligned grids are dynamically updated by\npre-processed sensor observations, where incremental fusion alleviates\nestimation error caused by limited viewing directions. At the back end, a\nrandomly initialized implicit ESDF neural network performs continual\nself-supervised learning guided by these grids to generate smooth and\ncontinuous maps. The results on multiple scenes show that LGSDF can construct\nmore accurate ESDF maps and meshes compared with SOTA (State Of The Art)\nexplicit and implicit mapping algorithms. 
The source code of LGSDF is publicly\navailable at https://github.com/BIT-DYN/LGSDF.\n","authors":["Yufeng Yue","Yinan Deng","Jiahui Wang","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2404.05187v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05183v1","updated":"2024-04-08T04:17:27Z","published":"2024-04-08T04:17:27Z","title":"Progressive Alignment with VLM-LLM Feature to Augment Defect\n Classification for the ASE Dataset","summary":" Traditional defect classification approaches face two barriers.\n(1) Insufficient training data and unstable data quality. Collecting sufficient\ndefective samples is expensive and time-consuming, consequently leading to dataset\nvariance. This introduces difficulty in recognition and learning. (2)\nOver-dependence on visual modality. When the image pattern and texture is\nmonotonic for all defect classes in a given dataset, the performance of\nconventional AOI systems cannot be guaranteed. In scenarios where image quality\nis compromised due to mechanical failures or when defect information is\ninherently difficult to discern, the performance of deep models cannot be\nguaranteed. A main question is, \"how to solve those two problems when they\noccur at the same time?\" The feasible strategy is to explore another feature\nwithin the dataset and combine an eminent vision-language model (VLM) and\nlarge language model (LLM) with their astonishing zero-shot capability. In this\nwork, we propose the special ASE dataset, which includes rich data descriptions\nrecorded on images, for defect classification, although the defect features are difficult\nto learn directly. Secondly, we present the prompting for VLM-LLM against\ndefect classification with the proposed ASE dataset to activate extra-modality\nfeatures from images to enhance performance. Then, we design the novel\nprogressive feature alignment (PFA) block to refine image-text features to\nalleviate the difficulty of alignment under the few-shot scenario. Finally, the\nproposed Cross-modality attention fusion (CMAF) module can effectively fuse\ndifferent modality features. Experimental results have demonstrated our method's\neffectiveness over several defect classification methods for the ASE dataset.\n","authors":["Chih-Chung Hsu","Chia-Ming Lee","Chun-Hung Sun","Kuang-Ming Wu"],"pdf_url":"https://arxiv.org/pdf/2404.05183v1.pdf","comment":"MULA 2024"},{"id":"http://arxiv.org/abs/2404.05181v1","updated":"2024-04-08T04:13:35Z","published":"2024-04-08T04:13:35Z","title":"Adaptive Learning for Multi-view Stereo Reconstruction","summary":" Deep learning has recently demonstrated its excellent performance on the task\nof multi-view stereo (MVS). However, loss functions applied for deep MVS are\nrarely studied. In this paper, we first analyze existing loss functions'\nproperties for deep depth based MVS approaches. Regression based loss leads to\ninaccurate continuous results by computing mathematical expectation, while\nclassification based loss outputs discretized depth values. To this end, we\nthen propose a novel loss function, named adaptive Wasserstein loss, which is\nable to narrow down the difference between the true and predicted probability\ndistributions of depth. Besides, a simple but effective offset module is\nintroduced to better achieve sub-pixel prediction accuracy. 
Extensive\nexperiments on different benchmarks, including DTU, Tanks and Temples and\nBlendedMVS, show that the proposed method with the adaptive Wasserstein loss\nand the offset module achieves state-of-the-art performance.\n","authors":["Qinglu Min","Jie Zhao","Zhihao Zhang","Chen Min"],"pdf_url":"https://arxiv.org/pdf/2404.05181v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05180v1","updated":"2024-04-08T04:10:50Z","published":"2024-04-08T04:10:50Z","title":"GloSoFarID: Global multispectral dataset for Solar Farm IDentification\n in satellite imagery","summary":" Solar Photovoltaic (PV) technology is increasingly recognized as a pivotal\nsolution in the global pursuit of clean and renewable energy. This technology\naddresses the urgent need for sustainable energy alternatives by converting\nsolar power into electricity without greenhouse gas emissions. It not only\ncurtails global carbon emissions but also reduces reliance on finite,\nnon-renewable energy sources. In this context, monitoring solar panel farms\nbecomes essential for understanding and facilitating the worldwide shift toward\nclean energy. This study contributes to this effort by developing the first\ncomprehensive global dataset of multispectral satellite imagery of solar panel\nfarms. This dataset is intended to form the basis for training robust machine\nlearning models, which can accurately map and analyze the expansion and\ndistribution of solar panel farms globally. The insights gained from this\nendeavor will be instrumental in guiding informed decision-making for a\nsustainable energy future. https://github.com/yzyly1992/GloSoFarID\n","authors":["Zhiyuan Yang","Ryan Rad"],"pdf_url":"https://arxiv.org/pdf/2404.05180v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05169v1","updated":"2024-04-08T03:33:01Z","published":"2024-04-08T03:33:01Z","title":"QMix: Quality-aware Learning with Mixed Noise for Robust Retinal Disease\n Diagnosis","summary":" Due to the complexity of medical image acquisition and the difficulty of\nannotation, medical image datasets inevitably contain noise. Noisy data with\nwrong labels affects the robustness and generalization ability of deep neural\nnetworks. Previous noise learning methods mainly considered noise arising from\nimages being mislabeled, i.e. label noise, assuming that all mislabeled images\nare of high image quality. However, medical images are prone to suffering\nextreme quality issues, i.e. data noise, where discriminative visual features\nare missing for disease diagnosis. In this paper, we propose a noise learning\nframework, termed as QMix, that learns a robust disease diagnosis model under\nmixed noise. QMix alternates between sample separation and quality-aware\nsemisupervised training in each training epoch. In the sample separation phase,\nwe design a joint uncertainty-loss criterion to effectively separate (1)\ncorrectly labeled images; (2) mislabeled images with high quality and (3)\nmislabeled images with low quality. In the semi-supervised training phase, we\ntrain a disease diagnosis model to learn robust feature representation from the\nseparated samples. Specifically, we devise a sample-reweighing loss to mitigate\nthe effect of mislabeled images with low quality during training. Meanwhile, a\ncontrastive enhancement loss is proposed to further distinguish mislabeled\nimages with low quality from correctly labeled images. 
QMix achieved\nstate-of-the-art disease diagnosis performance on five public retinal image\ndatasets and exhibited substantial improvement on robustness against mixed\nnoise.\n","authors":["Junlin Hou","Jilan Xu","Rui Feng","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2404.05169v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05163v1","updated":"2024-04-08T03:06:19Z","published":"2024-04-08T03:06:19Z","title":"Semantic Flow: Learning Semantic Field of Dynamic Scenes from Monocular\n Videos","summary":" In this work, we pioneer Semantic Flow, a neural semantic representation of\ndynamic scenes from monocular videos. In contrast to previous NeRF methods that\nreconstruct dynamic scenes from the colors and volume densities of individual\npoints, Semantic Flow learns semantics from continuous flows that contain rich\n3D motion information. As there is 2D-to-3D ambiguity problem in the viewing\ndirection when extracting 3D flow features from 2D video frames, we consider\nthe volume densities as opacity priors that describe the contributions of flow\nfeatures to the semantics on the frames. More specifically, we first learn a\nflow network to predict flows in the dynamic scene, and propose a flow feature\naggregation module to extract flow features from video frames. Then, we propose\na flow attention module to extract motion information from flow features, which\nis followed by a semantic network to output semantic logits of flows. We\nintegrate the logits with volume densities in the viewing direction to\nsupervise the flow features with semantic labels on video frames. Experimental\nresults show that our model is able to learn from multiple dynamic scenes and\nsupports a series of new tasks such as instance-level scene editing, semantic\ncompletions, dynamic scene tracking and semantic adaption on novel scenes.\nCodes are available at https://github.com/tianfr/Semantic-Flow/.\n","authors":["Fengrui Tian","Yueqi Duan","Angtian Wang","Jianfei Guo","Shaoyi Du"],"pdf_url":"https://arxiv.org/pdf/2404.05163v1.pdf","comment":"Accepted by ICLR 2024, Codes are available at\n https://github.com/tianfr/Semantic-Flow/"},{"id":"http://arxiv.org/abs/2311.08393v3","updated":"2024-04-08T02:57:55Z","published":"2023-11-14T18:53:28Z","title":"MVSA-Net: Multi-View State-Action Recognition for Robust and Deployable\n Trajectory Generation","summary":" The learn-from-observation (LfO) paradigm is a human-inspired mode for a\nrobot to learn to perform a task simply by watching it being performed. LfO can\nfacilitate robot integration on factory floors by minimizing disruption and\nreducing tedious programming. A key component of the LfO pipeline is a\ntransformation of the depth camera frames to the corresponding task state and\naction pairs, which are then relayed to learning techniques such as imitation\nor inverse reinforcement learning for understanding the task parameters. While\nseveral existing computer vision models analyze videos for activity\nrecognition, SA-Net specifically targets robotic LfO from RGB-D data. However,\nSA-Net and many other models analyze frame data captured from a single\nviewpoint. Their analysis is therefore highly sensitive to occlusions of the\nobserved task, which are frequent in deployments. An obvious way of reducing\nocclusions is to simultaneously observe the task from multiple viewpoints and\nsynchronously fuse the multiple streams in the model. 
Toward this, we present\nmulti-view SA-Net, which generalizes the SA-Net model to allow the perception\nof multiple viewpoints of the task activity, integrate them, and better\nrecognize the state and action in each frame. Performance evaluations on two\ndistinct domains establish that MVSA-Net recognizes the state-action pairs\nunder occlusion more accurately compared to single-view MVSA-Net and other\nbaselines. Our ablation studies further evaluate its performance under\ndifferent ambient conditions and establish the contribution of the architecture\ncomponents. As such, MVSA-Net offers a significantly more robust and deployable\nstate-action trajectory generation compared to previous methods.\n","authors":["Ehsan Asali","Prashant Doshi","Jin Sun"],"pdf_url":"https://arxiv.org/pdf/2311.08393v3.pdf","comment":"Presented at Deployable AI Workshop at AAAI-2024 and 'Towards\n Reliable and Deployable Learning-Based Robotic Systems' Workshop at CoRL2023"},{"id":"http://arxiv.org/abs/2403.05805v2","updated":"2024-04-08T02:47:54Z","published":"2024-03-09T05:50:32Z","title":"And Then the Hammer Broke: Reflections on Machine Ethics from Feminist\n Philosophy of Science","summary":" Vision is an important metaphor in ethical and political questions of\nknowledge. The feminist philosopher Donna Haraway points out the ``perverse''\nnature of an intrusive, alienating, all-seeing vision (to which we might cry\nout ``stop looking at me!''), but also encourages us to embrace the embodied\nnature of sight and its promises for genuinely situated knowledge. Current\ntechnologies of machine vision -- surveillance cameras, drones (for war or\nrecreation), iPhone cameras -- are usually construed as instances of the former\nrather than the latter, and for good reasons. However, although in no way\nattempting to diminish the real suffering these technologies have brought about\nin the world, I make the case for understanding technologies of computer vision\nas material instances of embodied seeing and situated knowing. Furthermore,\nborrowing from Iris Murdoch's concept of moral vision, I suggest that these\ntechnologies direct our labor towards self-reflection in ethically significant\nways. My approach draws upon paradigms in computer vision research,\nphenomenology, and feminist epistemology. Ultimately, this essay is an argument\nfor directing more philosophical attention from merely criticizing technologies\nof vision as ethically deficient towards embracing them as complex,\nmethodologically and epistemologically important objects.\n","authors":["Andre Ye"],"pdf_url":"https://arxiv.org/pdf/2403.05805v2.pdf","comment":"Pacific University Philosophy Conference"},{"id":"http://arxiv.org/abs/2403.03954v3","updated":"2024-04-08T02:46:38Z","published":"2024-03-06T18:58:49Z","title":"3D Diffusion Policy: Generalizable Visuomotor Policy Learning via Simple\n 3D Representations","summary":" Imitation learning provides an efficient way to teach robots dexterous\nskills; however, learning complex skills robustly and generalizablely usually\nconsumes large amounts of human demonstrations. To tackle this challenging\nproblem, we present 3D Diffusion Policy (DP3), a novel visual imitation\nlearning approach that incorporates the power of 3D visual representations into\ndiffusion policies, a class of conditional action generative models. The core\ndesign of DP3 is the utilization of a compact 3D visual representation,\nextracted from sparse point clouds with an efficient point encoder. 
In our\nexperiments involving 72 simulation tasks, DP3 successfully handles most tasks\nwith just 10 demonstrations and surpasses baselines with a 24.2% relative\nimprovement. In 4 real robot tasks, DP3 demonstrates precise control with a\nhigh success rate of 85%, given only 40 demonstrations of each task, and shows\nexcellent generalization abilities in diverse aspects, including space,\nviewpoint, appearance, and instance. Interestingly, in real robot experiments,\nDP3 rarely violates safety requirements, in contrast to baseline methods which\nfrequently do, necessitating human intervention. Our extensive evaluation\nhighlights the critical importance of 3D representations in real-world robot\nlearning. Videos, code, and data are available on\nhttps://3d-diffusion-policy.github.io .\n","authors":["Yanjie Ze","Gu Zhang","Kangning Zhang","Chenyuan Hu","Muhan Wang","Huazhe Xu"],"pdf_url":"https://arxiv.org/pdf/2403.03954v3.pdf","comment":"Videos, code, and data: https://3d-diffusion-policy.github.io"},{"id":"http://arxiv.org/abs/2404.00989v2","updated":"2024-04-08T02:37:25Z","published":"2024-04-01T08:34:42Z","title":"360+x: A Panoptic Multi-modal Scene Understanding Dataset","summary":" Human perception of the world is shaped by a multitude of viewpoints and\nmodalities. While many existing datasets focus on scene understanding from a\ncertain perspective (e.g. egocentric or third-person views), our dataset offers\na panoptic perspective (i.e. multiple viewpoints with multiple data\nmodalities). Specifically, we encapsulate third-person panoramic and front\nviews, as well as egocentric monocular/binocular views with rich modalities\nincluding video, multi-channel audio, directional binaural delay, location data\nand textual scene descriptions within each scene captured, presenting\ncomprehensive observation of the world. Figure 1 offers a glimpse of all 28\nscene categories of our 360+x dataset. To the best of our knowledge, this is\nthe first database that covers multiple viewpoints with multiple data\nmodalities to mimic how daily information is accessed in the real world.\nThrough our benchmark analysis, we presented 5 different scene understanding\ntasks on the proposed 360+x dataset to evaluate the impact and benefit of each\ndata modality and perspective in panoptic scene understanding. We hope this\nunique dataset could broaden the scope of comprehensive scene understanding and\nencourage the community to approach these problems from more diverse\nperspectives.\n","authors":["Hao Chen","Yuqi Hou","Chenyuan Qu","Irene Testini","Xiaohan Hong","Jianbo Jiao"],"pdf_url":"https://arxiv.org/pdf/2404.00989v2.pdf","comment":"CVPR 2024 (Oral Presentation), Project page:\n https://x360dataset.github.io/"},{"id":"http://arxiv.org/abs/2402.07819v2","updated":"2024-04-08T02:36:23Z","published":"2024-02-12T17:24:35Z","title":"A Benchmark Grocery Dataset of Realworld Point Clouds From Single View","summary":" Fine-grained grocery object recognition is an important computer vision\nproblem with broad applications in automatic checkout, in-store robotic\nnavigation, and assistive technologies for the visually impaired. Existing\ndatasets on groceries are mainly 2D images. Models trained on these datasets\nare limited to learning features from the regular 2D grids. 
While portable 3D\nsensors such as Kinect were commonly available for mobile phones, sensors such\nas LiDAR and TrueDepth, have recently been integrated into mobile phones.\nDespite the availability of mobile 3D sensors, there are currently no dedicated\nreal-world large-scale benchmark 3D datasets for grocery. In addition, existing\n3D datasets lack fine-grained grocery categories and have limited training\nsamples. Furthermore, collecting data by going around the object versus the\ntraditional photo capture makes data collection cumbersome. Thus, we introduce\na large-scale grocery dataset called 3DGrocery100. It constitutes 100 classes,\nwith a total of 87,898 3D point clouds created from 10,755 RGB-D single-view\nimages. We benchmark our dataset on six recent state-of-the-art 3D point cloud\nclassification models. Additionally, we also benchmark the dataset on few-shot\nand continual learning point cloud classification tasks. Project Page:\nhttps://bigdatavision.org/3DGrocery100/.\n","authors":["Shivanand Venkanna Sheshappanavar","Tejas Anvekar","Shivanand Kundargi","Yufan Wang","Chandra Kambhamettu"],"pdf_url":"https://arxiv.org/pdf/2402.07819v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02241v2","updated":"2024-04-08T02:06:37Z","published":"2024-04-02T18:59:39Z","title":"Linear Combination of Saved Checkpoints Makes Consistency and Diffusion\n Models Better","summary":" Diffusion Models (DM) and Consistency Models (CM) are two types of popular\ngenerative models with good generation quality on various tasks. When training\nDM and CM, intermediate weight checkpoints are not fully utilized and only the\nlast converged checkpoint is used. In this work, we find that high-quality\nmodel weights often lie in a basin which cannot be reached by SGD but can be\nobtained by proper checkpoint averaging. Based on these observations, we\npropose LCSC, a simple but effective and efficient method to enhance the\nperformance of DM and CM, by combining checkpoints along the training\ntrajectory with coefficients deduced from evolutionary search. We demonstrate\nthe value of LCSC through two use cases: $\\textbf{(a) Reducing training cost.}$\nWith LCSC, we only need to train DM/CM with fewer number of iterations and/or\nlower batch sizes to obtain comparable sample quality with the fully trained\nmodel. For example, LCSC achieves considerable training speedups for CM\n(23$\\times$ on CIFAR-10 and 15$\\times$ on ImageNet-64). $\\textbf{(b) Enhancing\npre-trained models.}$ Assuming full training is already done, LCSC can further\nimprove the generation quality or speed of the final converged models. For\nexample, LCSC achieves better performance using 1 number of function evaluation\n(NFE) than the base model with 2 NFE on consistency distillation, and decreases\nthe NFE of DM from 15 to 9 while maintaining the generation quality on\nCIFAR-10. Our code is available at\nhttps://github.com/imagination-research/LCSC.\n","authors":["Enshu Liu","Junyi Zhu","Zinan Lin","Xuefei Ning","Matthew B. Blaschko","Sergey Yekhanin","Shengen Yan","Guohao Dai","Huazhong Yang","Yu Wang"],"pdf_url":"https://arxiv.org/pdf/2404.02241v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05145v1","updated":"2024-04-08T02:02:15Z","published":"2024-04-08T02:02:15Z","title":"UniMix: Towards Domain Adaptive and Generalizable LiDAR Semantic\n Segmentation in Adverse Weather","summary":" LiDAR semantic segmentation (LSS) is a critical task in autonomous driving\nand has achieved promising progress. 
However, prior LSS methods are\nconventionally investigated and evaluated on datasets within the same domain in\nclear weather. The robustness of LSS models in unseen scenes and all weather\nconditions is crucial for ensuring safety and reliability in real applications.\nTo this end, we propose UniMix, a universal method that enhances the\nadaptability and generalizability of LSS models. UniMix first leverages\nphysically valid adverse weather simulation to construct a Bridge Domain, which\nserves to bridge the domain gap between the clear weather scenes and the\nadverse weather scenes. Then, a Universal Mixing operator is defined regarding\nspatial, intensity, and semantic distributions to create the intermediate\ndomain with mixed samples from given domains. Integrating the proposed two\ntechniques into a teacher-student framework, UniMix efficiently mitigates the\ndomain gap and enables LSS models to learn weather-robust and domain-invariant\nrepresentations. We devote UniMix to two main setups: 1) unsupervised domain\nadaption, adapting the model from the clear weather source domain to the\nadverse weather target domain; 2) domain generalization, learning a model that\ngeneralizes well to unseen scenes in adverse weather. Extensive experiments\nvalidate the effectiveness of UniMix across different tasks and datasets, all\nachieving superior performance over state-of-the-art methods. The code will be\nreleased.\n","authors":["Haimei Zhao","Jing Zhang","Zhuo Chen","Shanshan Zhao","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2404.05145v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.05144v1","updated":"2024-04-08T01:55:28Z","published":"2024-04-08T01:55:28Z","title":"Enhancing Clinical Efficiency through LLM: Discharge Note Generation for\n Cardiac Patients","summary":" Medical documentation, including discharge notes, is crucial for ensuring\npatient care quality, continuity, and effective medical communication. However,\nthe manual creation of these documents is not only time-consuming but also\nprone to inconsistencies and potential errors. The automation of this\ndocumentation process using artificial intelligence (AI) represents a promising\narea of innovation in healthcare. This study directly addresses the\ninefficiencies and inaccuracies in creating discharge notes manually,\nparticularly for cardiac patients, by employing AI techniques, specifically\nlarge language model (LLM). Utilizing a substantial dataset from a cardiology\ncenter, encompassing wide-ranging medical records and physician assessments,\nour research evaluates the capability of LLM to enhance the documentation\nprocess. Among the various models assessed, Mistral-7B distinguished itself by\naccurately generating discharge notes that significantly improve both\ndocumentation efficiency and the continuity of care for patients. These notes\nunderwent rigorous qualitative evaluation by medical expert, receiving high\nmarks for their clinical relevance, completeness, readability, and contribution\nto informed decision-making and care planning. Coupled with quantitative\nanalyses, these results confirm Mistral-7B's efficacy in distilling complex\nmedical information into concise, coherent summaries. Overall, our findings\nilluminate the considerable promise of specialized LLM, such as Mistral-7B, in\nrefining healthcare documentation workflows and advancing patient care. 
This\nstudy lays the groundwork for further integrating advanced AI technologies in\nhealthcare, demonstrating their potential to revolutionize patient\ndocumentation and support better care outcomes.\n","authors":["HyoJe Jung","Yunha Kim","Heejung Choi","Hyeram Seo","Minkyoung Kim","JiYe Han","Gaeun Kee","Seohyun Park","Soyoung Ko","Byeolhee Kim","Suyeon Kim","Tae Joon Jun","Young-Hak Kim"],"pdf_url":"https://arxiv.org/pdf/2404.05144v1.pdf","comment":"10 pages, 1 figure, 3 tables, conference"},{"id":"http://arxiv.org/abs/2404.05139v1","updated":"2024-04-08T01:38:43Z","published":"2024-04-08T01:38:43Z","title":"Better Monocular 3D Detectors with LiDAR from the Past","summary":" Accurate 3D object detection is crucial to autonomous driving. Though\nLiDAR-based detectors have achieved impressive performance, the high cost of\nLiDAR sensors precludes their widespread adoption in affordable vehicles.\nCamera-based detectors are cheaper alternatives but often suffer inferior\nperformance compared to their LiDAR-based counterparts due to inherent depth\nambiguities in images. In this work, we seek to improve monocular 3D detectors\nby leveraging unlabeled historical LiDAR data. Specifically, at inference time,\nwe assume that the camera-based detectors have access to multiple unlabeled\nLiDAR scans from past traversals at locations of interest (potentially from\nother high-end vehicles equipped with LiDAR sensors). Under this setup, we\nproposed a novel, simple, and end-to-end trainable framework, termed\nAsyncDepth, to effectively extract relevant features from asynchronous LiDAR\ntraversals of the same location for monocular 3D detectors. We show consistent\nand significant performance gain (up to 9 AP) across multiple state-of-the-art\nmodels and datasets with a negligible additional latency of 9.66 ms and a small\nstorage cost.\n","authors":["Yurong You","Cheng Perng Phoo","Carlos Andres Diaz-Ruiz","Katie Z Luo","Wei-Lun Chao","Mark Campbell","Bharath Hariharan","Kilian Q Weinberger"],"pdf_url":"https://arxiv.org/pdf/2404.05139v1.pdf","comment":"Accepted by ICRA 2022. The code can be found at\n https://github.com/YurongYou/AsyncDepth"},{"id":"http://arxiv.org/abs/2404.05136v1","updated":"2024-04-08T01:29:10Z","published":"2024-04-08T01:29:10Z","title":"Self-Supervised Multi-Object Tracking with Path Consistency","summary":" In this paper, we propose a novel concept of path consistency to learn robust\nobject matching without using manual object identity supervision. Our key idea\nis that, to track a object through frames, we can obtain multiple different\nassociation results from a model by varying the frames it can observe, i.e.,\nskipping frames in observation. As the differences in observations do not alter\nthe identities of objects, the obtained association results should be\nconsistent. Based on this rationale, we generate multiple observation paths,\neach specifying a different set of frames to be skipped, and formulate the Path\nConsistency Loss that enforces the association results are consistent across\ndifferent observation paths. We use the proposed loss to train our object\nmatching model with only self-supervision. 
By extensive experiments on three\ntracking datasets (MOT17, PersonPath22, KITTI), we demonstrate that our method\noutperforms existing unsupervised methods with consistent margins on various\nevaluation metrics, and even achieves performance close to supervised methods.\n","authors":["Zijia Lu","Bing Shuai","Yanbei Chen","Zhenlin Xu","Davide Modolo"],"pdf_url":"https://arxiv.org/pdf/2404.05136v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.05129v1","updated":"2024-04-08T01:14:09Z","published":"2024-04-08T01:14:09Z","title":"Image-based Agarwood Resinous Area Segmentation using Deep Learning","summary":" The manual extraction method of Agarwood resinous compound is laborious work,\nrequires skilled workers, and is subject to human errors. Commercial Agarwood\nindustries have been actively exploring using Computer Numerical Control (CNC)\nmachines to replace human effort for this particular task. The CNC machine\naccepts a G-code script produced from a binary image in which the wood region\nthat needs to be chiselled off is marked with (0, 0, 0) as its RGB value.\nRather than requiring a human expert to perform the region marking, we propose\nusing a Deep learning image segmentation method instead. Our setup involves a\ncamera that captures the cross-section image and then passes the image file to\na computer. The computer performs the automated image segmentation and feeds\nthe CNC machine with a G-code script. In this article, we report the initial\nsegmentation results achieved using a state-of-the-art Deep learning\nsegmentation method and discuss potential improvements to refine the\nsegmentation accuracy.\n","authors":["Irwandi Hipiny","Johari Abdullah","Noor Alamshah Bolhassan"],"pdf_url":"https://arxiv.org/pdf/2404.05129v1.pdf","comment":"15 pages, 6 figures, 3 tables"},{"id":"http://arxiv.org/abs/2207.01200v4","updated":"2024-04-08T01:11:22Z","published":"2022-07-04T05:03:10Z","title":"S$^{5}$Mars: Semi-Supervised Learning for Mars Semantic Segmentation","summary":" Deep learning has become a powerful tool for Mars exploration. Mars terrain\nsemantic segmentation is an important Martian vision task, which is the base of\nrover autonomous planning and safe driving. However, there is a lack of\nsufficient detailed and high-confidence data annotations, which are exactly\nrequired by most deep learning methods to obtain a good model. To address this\nproblem, we propose our solution from the perspective of joint data and method\ndesign. We first present a new dataset S5Mars for Semi-SuperviSed learning on\nMars Semantic Segmentation, which contains 6K high-resolution images and is\nsparsely annotated based on confidence, ensuring the high quality of labels.\nThen to learn from this sparse data, we propose a semi-supervised learning\n(SSL) framework for Mars image semantic segmentation, to learn representations\nfrom limited labeled data. Different from the existing SSL methods which are\nmostly targeted at the Earth image data, our method takes into account Mars\ndata characteristics. Specifically, we first investigate the impact of current\nwidely used natural image augmentations on Mars images. Based on the analysis,\nwe then propose two novel and effective augmentations for SSL of Mars\nsegmentation, AugIN and SAM-Mix, which serve as strong augmentations to boost\nthe model performance. 
Meanwhile, to fully leverage the unlabeled data, we\nintroduce a soft-to-hard consistency learning strategy, learning from different\ntargets based on prediction confidence. Experimental results show that our\nmethod can outperform state-of-the-art SSL approaches remarkably. Our proposed\ndataset is available at https://jhang2020.github.io/S5Mars.github.io/.\n","authors":["Jiahang Zhang","Lilang Lin","Zejia Fan","Wenjing Wang","Jiaying Liu"],"pdf_url":"https://arxiv.org/pdf/2207.01200v4.pdf","comment":"IEEE TGRS 2024"},{"id":"http://arxiv.org/abs/2404.05128v1","updated":"2024-04-08T01:08:41Z","published":"2024-04-08T01:08:41Z","title":"Improving Deep Learning Predictions with Simulated Images, and Vice\n Versa","summary":" Artificial neural networks are often used to identify features of crop\nplants. However, training their models requires many annotated images, which\ncan be expensive and time-consuming to acquire. Procedural models of plants,\nsuch as those developed with Lindenmayer-systems (L-systems) can be created to\nproduce visually realistic simulations, and hence images of plant simulations,\nwhere annotations are implicitly known. These synthetic images can either\naugment or completely replace real images in training neural networks for\nphenotyping tasks. In this paper, we systematically vary amounts of real and\nsynthetic images used for training in both maize and canola to better\nunderstand situations where synthetic images generated from L-systems can help\nprediction on real images. This work also explores the degree to which realism\nin the synthetic images improves prediction. Furthermore, we see how neural\nnetwork predictions can be used to help calibrate L-systems themselves,\ncreating a feedback loop.\n","authors":["Nazifa Azam Khan","Mikolaj Cieslak","Ian McQuillan"],"pdf_url":"https://arxiv.org/pdf/2404.05128v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03202v2","updated":"2024-04-08T01:05:57Z","published":"2024-04-04T05:10:26Z","title":"OmniGS: Omnidirectional Gaussian Splatting for Fast Radiance Field\n Reconstruction using Omnidirectional Images","summary":" Photorealistic reconstruction relying on 3D Gaussian Splatting has shown\npromising potential in robotics. However, the current 3D Gaussian Splatting\nsystem only supports radiance field reconstruction using undistorted\nperspective images. In this paper, we present OmniGS, a novel omnidirectional\nGaussian splatting system, to take advantage of omnidirectional images for fast\nradiance field reconstruction. Specifically, we conduct a theoretical analysis\nof spherical camera model derivatives in 3D Gaussian Splatting. According to\nthe derivatives, we then implement a new GPU-accelerated omnidirectional\nrasterizer that directly splats 3D Gaussians onto the equirectangular screen\nspace for omnidirectional image rendering. As a result, we realize\ndifferentiable optimization of the radiance field without the requirement of\ncube-map rectification or tangent-plane approximation. Extensive experiments\nconducted in egocentric and roaming scenarios demonstrate that our method\nachieves state-of-the-art reconstruction quality and high rendering speed using\nomnidirectional images. 
To benefit the research community, the code will be\nmade publicly available once the paper is published.\n","authors":["Longwei Li","Huajian Huang","Sai-Kit Yeung","Hui Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.03202v2.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.05111v1","updated":"2024-04-08T00:13:05Z","published":"2024-04-08T00:13:05Z","title":"Class Similarity Transition: Decoupling Class Similarities and Imbalance\n from Generalized Few-shot Segmentation","summary":" In Generalized Few-shot Segmentation (GFSS), a model is trained with a large\ncorpus of base class samples and then adapted on limited samples of novel\nclasses. This paper focuses on the relevance between base and novel classes,\nand improves GFSS in two aspects: 1) mining the similarity between base and\nnovel classes to promote the learning of novel classes, and 2) mitigating the\nclass imbalance issue caused by the volume difference between the support set\nand the training set. Specifically, we first propose a similarity transition\nmatrix to guide the learning of novel classes with base class knowledge. Then,\nwe leverage the Label-Distribution-Aware Margin (LDAM) loss and Transductive\nInference to the GFSS task to address the problem of class imbalance as well as\noverfitting the support set. In addition, by extending the probability\ntransition matrix, the proposed method can mitigate the catastrophic forgetting\nof base classes when learning novel classes. With a simple training phase, our\nproposed method can be applied to any segmentation network trained on base\nclasses. We validated our methods on the adapted version of OpenEarthMap.\nCompared to existing GFSS baselines, our method outperforms them all by 3% to 7%\nand ranks second in the OpenEarthMap Land Cover Mapping Few-Shot Challenge at\nthe completion of this paper. Code:\nhttps://github.com/earth-insights/ClassTrans\n","authors":["Shihong Wang","Ruixun Liu","Kaiyu Li","Jiawei Jiang","Xiangyong Cao"],"pdf_url":"https://arxiv.org/pdf/2404.05111v1.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.09250v2","updated":"2024-04-08T22:40:01Z","published":"2023-12-14T18:59:36Z","title":"Single Mesh Diffusion Models with Field Latents for Texture Generation","summary":" We introduce a framework for intrinsic latent diffusion models operating\ndirectly on the surfaces of 3D shapes, with the goal of synthesizing\nhigh-quality textures. Our approach is underpinned by two contributions: field\nlatents, a latent representation encoding textures as discrete vector fields on\nthe mesh vertices, and field latent diffusion models, which learn to denoise a\ndiffusion process in the learned latent space on the surface. We consider a\nsingle-textured-mesh paradigm, where our models are trained to generate\nvariations of a given texture on a mesh. We show the synthesized textures are\nof superior fidelity compared to those from existing single-textured-mesh\ngenerative models. Our models can also be adapted for user-controlled editing\ntasks such as inpainting and label-guided generation. The efficacy of our\napproach is due in part to the equivariance of our proposed framework under\nisometries, allowing our models to seamlessly reproduce details across locally\nsimilar regions and opening the door to a notion of generative texture\ntransfer.\n","authors":["Thomas W. Mitchel","Carlos Esteves","Ameesh Makadia"],"pdf_url":"https://arxiv.org/pdf/2312.09250v2.pdf","comment":"CVPR 2024. 
Code and additional visualizations available:\n https://single-mesh-diffusion.github.io/"},{"id":"http://arxiv.org/abs/2311.12539v2","updated":"2024-04-08T22:19:23Z","published":"2023-11-21T11:33:15Z","title":"GMISeg: General Medical Image Segmentation without Re-Training","summary":" Although deep learning models have become the main method for medical image\nsegmentation, they often cannot be extended to unknown segmentation tasks\ninvolving new anatomical structures, image shapes, or labels. For new\nsegmentation tasks, researchers often have to retrain or fine-tune the model,\nwhich is time-consuming and poses a significant obstacle to clinical\nresearchers, who often lack the resources and professional knowledge to train\nneural networks. Therefore, we proposed a general method that can solve unknown\nmedical image segmentation tasks without requiring additional training. Given\nan example set of images and prompts for defining new segmentation tasks,\nGMISeg applies a novel low-rank fine-tuning strategy based on the proposed\napproach to the SAM (Segment Anything Model) image encoder, and works with the\nprompt encoder and mask decoder to fine-tune the labeled dataset without the\nneed for additional training. To achieve generalization of new tasks, we used\nmedical image datasets with different imaging modes for different parts. We\ntrained and generalized GMISeg on a different set of anatomical and imaging\nmodes using cardiac images on other site datasets. We have demonstrated that\nGMISeg outperforms the latest methods on unknown tasks and have conducted a\ncomprehensive analysis and summary of the important performance of the proposed\nmethod.\n","authors":["Jing Xu"],"pdf_url":"https://arxiv.org/pdf/2311.12539v2.pdf","comment":null}]},"2024-04-07T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2404.05100v1","updated":"2024-04-07T22:52:02Z","published":"2024-04-07T22:52:02Z","title":"Legibot: Generating Legible Motions for Service Robots Using Cost-Based\n Local Planners","summary":" With the increasing presence of social robots in various environments and\napplications, there is an increasing need for these robots to exhibit\nsocially-compliant behaviors. Legible motion, characterized by the ability of a\nrobot to clearly and quickly convey intentions and goals to the individuals in\nits vicinity, through its motion, holds significant importance in this context.\nThis will improve the overall user experience and acceptance of robots in human\nenvironments. In this paper, we introduce a novel approach to incorporate\nlegibility into local motion planning for mobile robots. This can enable robots\nto generate legible motions in real-time and dynamic environments. To\ndemonstrate the effectiveness of our proposed methodology, we also provide a\nrobotic stack designed for deploying legibility-aware motion planning in a\nsocial robot, by integrating perception and localization components.\n","authors":["Javad Amirian","Mouad Abrini","Mohamed Chetouani"],"pdf_url":"https://arxiv.org/pdf/2404.05100v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05087v1","updated":"2024-04-07T22:02:53Z","published":"2024-04-07T22:02:53Z","title":"PCBot: a Minimalist Robot Designed for Swarm Applications","summary":" Complexity, cost, and power requirements for the actuation of individual\nrobots can play a large factor in limiting the size of robotic swarms. 
Here we\npresent PCBot, a minimalist robot that can precisely move on an orbital shake\ntable using a bi-stable solenoid actuator built directly into its PCB. This\nallows the actuator to be built as part of the automated PCB manufacturing\nprocess, greatly reducing the impact it has on manual assembly. Thanks to this\nnovel actuator design, PCBot has merely five major components and can be\nassembled in under 20 seconds, potentially enabling them to be easily\nmass-manufactured. Here we present the electro-magnetic and mechanical design\nof PCBot. Additionally, a prototype robot is used to demonstrate its ability to\nmove in a straight line as well as follow given paths.\n","authors":["Jingxian Wang","Michael Rubenstein"],"pdf_url":"https://arxiv.org/pdf/2404.05087v1.pdf","comment":"Accepted by IROS 2022, best paper and best mechanism design paper\n finalist"},{"id":"http://arxiv.org/abs/2403.13132v2","updated":"2024-04-07T21:13:35Z","published":"2024-03-19T20:04:35Z","title":"Wearable Roller Rings to Enable Robot Dexterous In-Hand Manipulation\n through Active Surfaces","summary":" In-hand manipulation is a crucial ability for reorienting and repositioning\nobjects within grasps. The main challenges are not only the complexity in the\ncomputational models, but also the risks of grasp instability caused by active\nfinger motions, such as rolling, sliding, breaking, and remaking contacts.\nBased on the idea of manipulation without lifting a finger, this paper presents\nthe development of Roller Rings (RR), a modular robotic attachment with active\nsurfaces that is wearable by both robot and human hands. By installing and\nangling the RRs on grasping systems, such that their spatial motions are not\nco-linear, we derive a general differential motion model for the object\nactuated by the active surfaces. Our motion model shows that complete in-hand\nmanipulation skill sets can be provided by as few as only 2 RRs through\nnon-holonomic object motions, while more RRs can enable enhanced manipulation\ndexterity with fewer motion constraints. Through extensive experiments, we wear\nRRs on both a robot hand and a human hand to evaluate their manipulation\ncapabilities, and show that the RRs can be employed to manipulate arbitrary\nobject shapes to provide dexterous in-hand manipulation.\n","authors":["Hayden Webb","Podshara Chanrungmaneekul","Shenli Yuan","Kaiyu Hang"],"pdf_url":"https://arxiv.org/pdf/2403.13132v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05074v1","updated":"2024-04-07T21:06:52Z","published":"2024-04-07T21:06:52Z","title":"On the Uniqueness of Solution for the Bellman Equation of LTL Objectives","summary":" Surrogate rewards for linear temporal logic (LTL) objectives are commonly\nutilized in planning problems for LTL objectives. In a widely-adopted surrogate\nreward approach, two discount factors are used to ensure that the expected\nreturn approximates the satisfaction probability of the LTL objective. The\nexpected return then can be estimated by methods using the Bellman updates such\nas reinforcement learning. However, the uniqueness of the solution to the\nBellman equation with two discount factors has not been explicitly discussed.\nWe demonstrate with an example that when one of the discount factors is set to\none, as allowed in many previous works, the Bellman equation may have multiple\nsolutions, leading to inaccurate evaluation of the expected return. 
We then\npropose a condition for the Bellman equation to have the expected return as the\nunique solution, requiring the solutions for states inside a rejecting bottom\nstrongly connected component (BSCC) to be 0. We prove this condition is\nsufficient by showing that the solutions for the states with discounting can be\nseparated from those for the states without discounting under this condition\n","authors":["Zetong Xuan","Alper Kamil Bozkurt","Miroslav Pajic","Yu Wang"],"pdf_url":"https://arxiv.org/pdf/2404.05074v1.pdf","comment":"Accepted for the 2024 Learning for Dynamics and Control Conference\n (L4DC)"},{"id":"http://arxiv.org/abs/2404.05067v1","updated":"2024-04-07T20:37:08Z","published":"2024-04-07T20:37:08Z","title":"Adaptive Anchor Pairs Selection in a TDOA-based System Through Robot\n Localization Error Minimization","summary":" The following paper presents an adaptive anchor pairs selection method for\nultra-wideband (UWB) Time Difference of Arrival (TDOA) based positioning\nsystems. The method divides the area covered by the system into several zones\nand assigns them anchor pair sets. The pair sets are determined during\ncalibration based on localization root mean square error (RMSE). The\ncalibration assumes driving a mobile platform equipped with a LiDAR sensor and\na UWB tag through the specified zones. The robot is localized separately based\non a large set of different TDOA pairs and using a LiDAR, which acts as the\nreference. For each zone, the TDOA pairs set for which the registered RMSE is\nlowest is selected and used for localization in the routine system work. The\nproposed method has been tested with simulations and experiments. The results\nfor both simulated static and experimental dynamic scenarios have proven that\nthe adaptive selection of the anchor nodes leads to an increase in localization\naccuracy. In the experiment, the median trajectory error for a moving person\nlocalization was at a level of 25 cm.\n","authors":["Marcin Kolakowski"],"pdf_url":"https://arxiv.org/pdf/2404.05067v1.pdf","comment":"Originally presented at: 2021 Signal Processing Symposium (SPSympo),\n LODZ, Poland, 2021"},{"id":"http://arxiv.org/abs/2403.13245v2","updated":"2024-04-07T19:25:47Z","published":"2024-03-20T02:16:54Z","title":"Federated reinforcement learning for robot motion planning with\n zero-shot generalization","summary":" This paper considers the problem of learning a control policy for robot\nmotion planning with zero-shot generalization, i.e., no data collection and\npolicy adaptation is needed when the learned policy is deployed in new\nenvironments. We develop a federated reinforcement learning framework that\nenables collaborative learning of multiple learners and a central server, i.e.,\nthe Cloud, without sharing their raw data. In each iteration, each learner\nuploads its local control policy and the corresponding estimated normalized\narrival time to the Cloud, which then computes the global optimum among the\nlearners and broadcasts the optimal policy to the learners. Each learner then\nselects between its local control policy and that from the Cloud for next\niteration. The proposed framework leverages on the derived zero-shot\ngeneralization guarantees on arrival time and safety. Theoretical guarantees on\nalmost-sure convergence, almost consensus, Pareto improvement and optimality\ngap are also provided. 
Monte Carlo simulation is conducted to evaluate the\nproposed framework.\n","authors":["Zhenyuan Yuan","Siyuan Xu","Minghui Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.13245v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05039v1","updated":"2024-04-07T18:47:52Z","published":"2024-04-07T18:47:52Z","title":"StaccaToe: A Single-Leg Robot that Mimics the Human Leg and Toe","summary":" We introduce StaccaToe, a human-scale, electric motor-powered single-leg\nrobot designed to rival the agility of human locomotion through two distinctive\nattributes: an actuated toe and a co-actuation configuration inspired by the\nhuman leg. Leveraging the foundational design of HyperLeg's lower leg\nmechanism, we develop a stand-alone robot by incorporating new link designs,\ncustom-designed power electronics, and a refined control system. Unlike\nprevious jumping robots that rely on either special mechanisms (e.g., springs\nand clutches) or hydraulic/pneumatic actuators, StaccaToe employs electric\nmotors without energy storage mechanisms. This choice underscores our ultimate\ngoal of developing a practical, high-performance humanoid robot capable of\nhuman-like, stable walking as well as explosive dynamic movements. In this\npaper, we aim to empirically evaluate the balance capability and the exertion\nof explosive ground reaction forces of our toe and co-actuation mechanisms.\nThroughout extensive hardware and controller development, StaccaToe showcases\nits control fidelity by demonstrating a balanced tip-toe stance and dynamic\njump. This study is significant for three key reasons: 1) StaccaToe represents\nthe first human-scale, electric motor-driven single-leg robot to execute\ndynamic maneuvers without relying on specialized mechanisms; 2) our research\nprovides empirical evidence of the benefits of replicating critical human leg\nattributes in robotic design; and 3) we explain the design process for creating\nagile legged robots, the details that have been scantily covered in academic\nliterature.\n","authors":["Nisal Perera","Shangqun Yu","Daniel Marew","Mack Tang","Ken Suzuki","Aidan McCormack","Shifan Zhu","Yong-Jae Kim","Donghyun Kim"],"pdf_url":"https://arxiv.org/pdf/2404.05039v1.pdf","comment":"Submitted to 2024 IEEE/RSJ International Conference on Intelligent\n Robots and Systems (IROS 2024)"},{"id":"http://arxiv.org/abs/2404.05024v1","updated":"2024-04-07T17:31:53Z","published":"2024-04-07T17:31:53Z","title":"PathFinder: Attention-Driven Dynamic Non-Line-of-Sight Tracking with a\n Mobile Robot","summary":" The study of non-line-of-sight (NLOS) imaging is growing due to its many\npotential applications, including rescue operations and pedestrian detection by\nself-driving cars. However, implementing NLOS imaging on a moving camera\nremains an open area of research. Existing NLOS imaging methods rely on\ntime-resolved detectors and laser configurations that require precise optical\nalignment, making it difficult to deploy them in dynamic environments. This\nwork proposes a data-driven approach to NLOS imaging, PathFinder, that can be\nused with a standard RGB camera mounted on a small, power-constrained mobile\nrobot, such as an aerial drone. Our experimental pipeline is designed to\naccurately estimate the 2D trajectory of a person who moves in a\nManhattan-world environment while remaining hidden from the camera's\nfield-of-view. 
We introduce a novel approach to process a sequence of dynamic\nsuccessive frames in a line-of-sight (LOS) video using an attention-based\nneural network that performs inference in real-time. The method also includes a\npreprocessing selection metric that analyzes images from a moving camera which\ncontain multiple vertical planar surfaces, such as walls and building facades,\nand extracts planes that return maximum NLOS information. We validate the\napproach on in-the-wild scenes using a drone for video capture, thus\ndemonstrating low-cost NLOS imaging in dynamic capture environments.\n","authors":["Shenbagaraj Kannapiran","Sreenithy Chandran","Suren Jayasuriya","Spring Berman"],"pdf_url":"https://arxiv.org/pdf/2404.05024v1.pdf","comment":"First two authors have equal contribution"},{"id":"http://arxiv.org/abs/2404.05023v1","updated":"2024-04-07T17:30:57Z","published":"2024-04-07T17:30:57Z","title":"Scalable and Efficient Hierarchical Visual Topological Mapping","summary":" Hierarchical topological representations can significantly reduce search\ntimes within mapping and localization algorithms. Although recent research has\nshown the potential for such approaches, limited consideration has been given\nto the suitability and comparative performance of different global feature\nrepresentations within this context. In this work, we evaluate state-of-the-art\nhand-crafted and learned global descriptors using a hierarchical topological\nmapping technique on benchmark datasets and present results of a comprehensive\nevaluation of the impact of the global descriptor used. Although learned\ndescriptors have been incorporated into place recognition methods to improve\nretrieval accuracy and enhance overall recall, the problem of scalability and\nefficiency when applied to longer trajectories has not been adequately\naddressed in a majority of research studies. Based on our empirical analysis of\nmultiple runs, we identify that continuity and distinctiveness are crucial\ncharacteristics for an optimal global descriptor that enable efficient and\nscalable hierarchical mapping, and present a methodology for quantifying and\ncontrasting these characteristics across different global descriptors. Our\nstudy demonstrates that the use of global descriptors based on an unsupervised\nlearned Variational Autoencoder (VAE) excels in these characteristics and\nachieves significantly lower runtime. It runs on a consumer grade desktop, up\nto 2.3x faster than the second best global descriptor, NetVLAD, and up to 9.5x\nfaster than the hand-crafted descriptor, PHOG, on the longest track evaluated\n(St Lucia, 17.6 km), without sacrificing overall recall performance.\n","authors":["Saravanabalagi Ramachandran","Jonathan Horgan","Ganesh Sistu","John McDonald"],"pdf_url":"https://arxiv.org/pdf/2404.05023v1.pdf","comment":"Published in the 21st International Conference on Advanced Robotics\n (ICAR 2023)"},{"id":"http://arxiv.org/abs/2404.04929v1","updated":"2024-04-07T12:05:47Z","published":"2024-04-07T12:05:47Z","title":"RoboMP$^2$: A Robotic Multimodal Perception-Planning Framework with\n Multimodal Large Language Models","summary":" Multimodal Large Language Models (MLLMs) have shown impressive reasoning\nabilities and general intelligence in various domains. It inspires researchers\nto train end-to-end MLLMs or utilize large models to generate policies with\nhuman-selected prompts for embodied agents. 
However, these methods exhibit\nlimited generalization capabilities on unseen tasks or scenarios, and overlook\nthe multimodal environment information which is critical for robots to make\ndecisions. In this paper, we introduce a novel Robotic Multimodal\nPerception-Planning (RoboMP$^2$) framework for robotic manipulation which\nconsists of a Goal-Conditioned Multimodal Preceptor (GCMP) and a\nRetrieval-Augmented Multimodal Planner (RAMP). Specifically, GCMP captures\nenvironment states by employing a tailored MLLM for embodied agents with the\nabilities of semantic reasoning and localization. RAMP utilizes a coarse-to-fine\nretrieval method to find the $k$ most-relevant policies as in-context\ndemonstrations to enhance the planner. Extensive experiments demonstrate the\nsuperiority of RoboMP$^2$ on both the VIMA benchmark and real-world tasks, with\naround 10% improvement over the baselines.\n","authors":["Qi Lv","Hao Li","Xiang Deng","Rui Shao","Michael Yu Wang","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2404.04929v1.pdf","comment":"Project page: https://aopolin-lv.github.io/RoboMP2.github.io/"},{"id":"http://arxiv.org/abs/2310.17923v3","updated":"2024-04-07T11:28:46Z","published":"2023-10-27T06:37:33Z","title":"Multi-fingered Dynamic Grasping for Unknown Objects","summary":" Dexterous grasping of unseen objects in dynamic environments is an essential\nprerequisite for the advanced manipulation of autonomous robots. Prior advances\nrely on several assumptions that simplify the setup, including environment\nstationarity, pre-defined objects, and low-dimensional end-effectors. Though\neasing the problem and enabling progress, it undermined the complexity of the\nreal world. Aiming to relax these assumptions, we present a dynamic grasping\nframework for unknown objects in this work, which uses a five-fingered hand\nwith visual servo control and can compensate for external disturbances. To\nestablish such a system on real hardware, we leverage the recent advances in\nreal-time dexterous generative grasp synthesis and introduce several techniques\nto secure the robustness and performance of the overall system. Our experiments\non real hardware verify the ability of the proposed system to reliably grasp\nunknown dynamic objects in two realistic scenarios: objects on a conveyor belt\nand human-robot handover. Note that there has been no prior work that can\nachieve dynamic multi-fingered grasping for unknown objects like ours up to the\ntime of writing this paper. We hope our pioneering work in this direction can\nprovide inspiration to the community and pave the way for further algorithmic\nand engineering advances on this challenging task. A video of the experiments\nis available at https://youtu.be/b87zGNoKELg.\n","authors":["Yannick Burkhardt","Qian Feng","Jianxiang Feng","Karan Sharma","Zhaopeng Chen","Alois Knoll"],"pdf_url":"https://arxiv.org/pdf/2310.17923v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07164v2","updated":"2024-04-07T10:12:35Z","published":"2024-01-13T21:53:17Z","title":"3QFP: Efficient neural implicit surface reconstruction using\n Tri-Quadtrees and Fourier feature Positional encoding","summary":" Neural implicit surface representations are currently receiving a lot of\ninterest as a means to achieve high-fidelity surface reconstruction at a low\nmemory cost, compared to traditional explicit representations. However,\nstate-of-the-art methods still struggle with excessive memory usage and\nnon-smooth surfaces. 
This is particularly problematic in large-scale\napplications with sparse inputs, as is common in robotics use cases. To address\nthese issues, we first introduce a sparse structure, \\emph{tri-quadtrees},\nwhich represents the environment using learnable features stored in three\nplanar quadtree projections. Secondly, we concatenate the learnable features\nwith a Fourier feature positional encoding. The combined features are then\ndecoded into signed distance values through a small multi-layer perceptron. We\ndemonstrate that this approach facilitates smoother reconstruction with a\nhigher completion ratio with fewer holes. Compared to two recent baselines, one\nimplicit and one explicit, our approach requires only 10\\%--50\\% as much\nmemory, while achieving competitive quality.\n","authors":["Shuo Sun","Malcolm Mielle","Achim J. Lilienthal","Martin Magnusson"],"pdf_url":"https://arxiv.org/pdf/2401.07164v2.pdf","comment":"ICRA2024"},{"id":"http://arxiv.org/abs/2308.16874v2","updated":"2024-04-07T08:53:20Z","published":"2023-08-31T17:21:18Z","title":"D-VAT: End-to-End Visual Active Tracking for Micro Aerial Vehicles","summary":" Visual active tracking is a growing research topic in robotics due to its key\nrole in applications such as human assistance, disaster recovery, and\nsurveillance. In contrast to passive tracking, active tracking approaches\ncombine vision and control capabilities to detect and actively track the\ntarget. Most of the work in this area focuses on ground robots, while the very\nfew contributions on aerial platforms still pose important design constraints\nthat limit their applicability. To overcome these limitations, in this paper we\npropose D-VAT, a novel end-to-end visual active tracking methodology based on\ndeep reinforcement learning that is tailored to micro aerial vehicle platforms.\nThe D-VAT agent computes the vehicle thrust and angular velocity commands\nneeded to track the target by directly processing monocular camera\nmeasurements. We show that the proposed approach allows for precise and\ncollision-free tracking operations, outperforming different state-of-the-art\nbaselines on simulated environments which differ significantly from those\nencountered during training. Moreover, we demonstrate a smooth real-world\ntransition to a quadrotor platform with mixed-reality.\n","authors":["Alberto Dionigi","Simone Felicioni","Mirko Leomanni","Gabriele Costante"],"pdf_url":"https://arxiv.org/pdf/2308.16874v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04879v1","updated":"2024-04-07T08:49:09Z","published":"2024-04-07T08:49:09Z","title":"Multi-Type Map Construction via Semantics-Aware Autonomous Exploration\n in Unknown Indoor Environments","summary":" This paper proposes a novel semantics-aware autonomous exploration model to\nhandle the long-standing issue: the mainstream RRT (Rapid-exploration Random\nTree) based exploration models usually make the mobile robot switch frequently\nbetween different regions, leading to the excessively-repeated explorations for\nthe same region. Our proposed semantics-aware model encourages a mobile robot\nto fully explore the current region before moving to the next region, which is\nable to avoid excessively-repeated explorations and make the exploration\nfaster. The core idea of semantics-aware autonomous exploration model is\noptimizing the sampling point selection mechanism and frontier point evaluation\nfunction by considering the semantic information of regions. 
In addition,\ncompared with existing autonomous exploration methods that usually construct\nthe single-type or 2-3 types of maps, our model allows to construct four kinds\nof maps including point cloud map, occupancy grid map, topological map, and\nsemantic map. To test the performance of our model, we conducted experiments in\nthree simulated environments. The experiment results demonstrate that compared\nto Improved RRT, our model achieved 33.0% exploration time reduction and 39.3%\nexploration trajectory length reduction when maintaining >98% exploration rate.\n","authors":["Jianfang Mao","Yuheng Xie","Si Chen","Zhixiong Nan","Xiao Wang"],"pdf_url":"https://arxiv.org/pdf/2404.04879v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04869v1","updated":"2024-04-07T08:31:12Z","published":"2024-04-07T08:31:12Z","title":"Prompting Multi-Modal Tokens to Enhance End-to-End Autonomous Driving\n Imitation Learning with LLMs","summary":" The utilization of Large Language Models (LLMs) within the realm of\nreinforcement learning, particularly as planners, has garnered a significant\ndegree of attention in recent scholarly literature. However, a substantial\nproportion of existing research predominantly focuses on planning models for\nrobotics that transmute the outputs derived from perception models into\nlinguistic forms, thus adopting a `pure-language' strategy. In this research,\nwe propose a hybrid End-to-End learning framework for autonomous driving by\ncombining basic driving imitation learning with LLMs based on multi-modality\nprompt tokens. Instead of simply converting perception results from the\nseparated train model into pure language input, our novelty lies in two\naspects. 1) The end-to-end integration of visual and LiDAR sensory input into\nlearnable multi-modality tokens, thereby intrinsically alleviating description\nbias by separated pre-trained perception models. 2) Instead of directly letting\nLLMs drive, this paper explores a hybrid setting of letting LLMs help the\ndriving model correct mistakes and complicated scenarios. The results of our\nexperiments suggest that the proposed methodology can attain driving scores of\n49.21%, coupled with an impressive route completion rate of 91.34% in the\noffline evaluation conducted via CARLA. These performance metrics are\ncomparable to the most advanced driving models.\n","authors":["Yiqun Duan","Qiang Zhang","Renjing Xu"],"pdf_url":"https://arxiv.org/pdf/2404.04869v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04857v1","updated":"2024-04-07T08:04:33Z","published":"2024-04-07T08:04:33Z","title":"Learning Adaptive Multi-Objective Robot Navigation with Demonstrations","summary":" Preference-aligned robot navigation in human environments is typically\nachieved through learning-based approaches, utilizing demonstrations and user\nfeedback for personalization. However, personal preferences are subject to\nchange and might even be context-dependent. Yet traditional reinforcement\nlearning (RL) approaches with a static reward function often fall short in\nadapting to these varying user preferences. This paper introduces a framework\nthat combines multi-objective reinforcement learning (MORL) with\ndemonstration-based learning. Our approach allows for dynamic adaptation to\nchanging user preferences without retraining. 
Through rigorous evaluations,\nincluding sim-to-real and robot-to-robot transfers, we demonstrate our\nframework's capability to reflect user preferences accurately while achieving\nhigh navigational performance in terms of collision avoidance and goal\npursuance.\n","authors":["Jorge de Heuvel","Tharun Sethuraman","Maren Bennewitz"],"pdf_url":"https://arxiv.org/pdf/2404.04857v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04852v1","updated":"2024-04-07T07:50:26Z","published":"2024-04-07T07:50:26Z","title":"EnQuery: Ensemble Policies for Diverse Query-Generation in Preference\n Alignment of Robot Navigation","summary":" To align mobile robot navigation policies with user preferences through\nreinforcement learning from human feedback (RLHF), reliable and\nbehavior-diverse user queries are required. However, deterministic policies\nfail to generate a variety of navigation trajectory suggestions for a given\nnavigation task configuration. We introduce EnQuery, a query generation\napproach using an ensemble of policies that achieve behavioral diversity\nthrough a regularization term. For a given navigation task, EnQuery produces\nmultiple navigation trajectory suggestions, thereby optimizing the efficiency\nof preference data collection with fewer queries. Our methodology demonstrates\nsuperior performance in aligning navigation policies with user preferences in\nlow-query regimes, offering enhanced policy convergence from sparse preference\nqueries. The evaluation is complemented with a novel explainability\nrepresentation, capturing full scene navigation behavior of the mobile robot in\na single plot.\n","authors":["Jorge de Heuvel","Florian Seiler","Maren Bennewitz"],"pdf_url":"https://arxiv.org/pdf/2404.04852v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.04030v2","updated":"2024-04-07T03:57:37Z","published":"2023-07-08T18:46:19Z","title":"Adaptive Force-Based Control of Dynamic Legged Locomotion over Uneven\n Terrain","summary":" Agile-legged robots have proven to be highly effective in navigating and\nperforming tasks in complex and challenging environments, including disaster\nzones and industrial settings. However, these applications normally require the\ncapability of carrying heavy loads while maintaining dynamic motion. Therefore,\nthis paper presents a novel methodology for incorporating adaptive control into\na force-based control system. Recent advancements in the control of quadruped\nrobots show that force control can effectively realize dynamic locomotion over\nrough terrain. By integrating adaptive control into the force-based controller,\nour proposed approach can maintain the advantages of the baseline framework\nwhile adapting to significant model uncertainties and unknown terrain impact\nmodels. Experimental validation was successfully conducted on the Unitree A1\nrobot. 
With our approach, the robot can carry heavy loads (up to 50% of its\nweight) while performing dynamic gaits such as fast trotting and bounding\nacross uneven terrains.\n","authors":["Mohsen Sombolestan","Quan Nguyen"],"pdf_url":"https://arxiv.org/pdf/2307.04030v2.pdf","comment":"This work has been published in IEEE Transaction on Robotics (T-RO)"},{"id":"http://arxiv.org/abs/2404.00797v2","updated":"2024-04-07T02:05:49Z","published":"2024-03-31T20:59:58Z","title":"Metarobotics for Industry and Society: Vision, Technologies, and\n Opportunities","summary":" Metarobotics aims to combine next generation wireless communication,\nmulti-sense immersion, and collective intelligence to provide a pervasive,\nitinerant, and non-invasive access and interaction with distant robotized\napplications. Industry and society are expected to benefit from these\nfunctionalities. For instance, robot programmers will no longer travel\nworldwide to plan and test robot motions, even collaboratively. Instead, they\nwill have a personalized access to robots and their environments from anywhere,\nthus spending more time with family and friends. Students enrolled in robotics\ncourses will be taught under authentic industrial conditions in real-time. This\npaper describes objectives of Metarobotics in society, industry, and\nin-between. It identifies and surveys technologies likely to enable their\ncompletion and provides an architecture to put forward the interplay of key\ncomponents of Metarobotics. Potentials for self-determination, self-efficacy,\nand work-life-flexibility in robotics-related applications in Society 5.0,\nIndustry 4.0, and Industry 5.0 are outlined.\n","authors":["Eric Guiffo Kaigom"],"pdf_url":"https://arxiv.org/pdf/2404.00797v2.pdf","comment":"Published on IEEE Transactions on Industrial Informatics, Volume 20,\n Issue 4, April 2024"},{"id":"http://arxiv.org/abs/2404.04772v1","updated":"2024-04-07T01:13:07Z","published":"2024-04-07T01:13:07Z","title":"Efficient Reinforcement Learning of Task Planners for Robotic\n Palletization through Iterative Action Masking Learning","summary":" The development of robotic systems for palletization in logistics scenarios\nis of paramount importance, addressing critical efficiency and precision\ndemands in supply chain management. This paper investigates the application of\nReinforcement Learning (RL) in enhancing task planning for such robotic\nsystems. Confronted with the substantial challenge of a vast action space,\nwhich is a significant impediment to efficiently apply out-of-the-shelf RL\nmethods, our study introduces a novel method of utilizing supervised learning\nto iteratively prune and manage the action space effectively. By reducing the\ncomplexity of the action space, our approach not only accelerates the learning\nphase but also ensures the effectiveness and reliability of the task planning\nin robotic palletization. 
The experimental results underscore the efficacy of\nthis method, highlighting its potential in improving the performance of RL\napplications in complex and high-dimensional environments like logistics\npalletization.\n","authors":["Zheng Wu","Yichuan Li","Wei Zhan","Changliu Liu","Yun-Hui Liu","Masayoshi Tomizuka"],"pdf_url":"https://arxiv.org/pdf/2404.04772v1.pdf","comment":"8 pages, 8 figures"},{"id":"http://arxiv.org/abs/2404.05051v1","updated":"2024-04-07T19:22:51Z","published":"2024-04-07T19:22:51Z","title":"Skill Transfer and Discovery for Sim-to-Real Learning: A\n Representation-Based Viewpoint","summary":" We study sim-to-real skill transfer and discovery in the context of robotics\ncontrol using representation learning. We draw inspiration from spectral\ndecomposition of Markov decision processes. The spectral decomposition brings\nabout representation that can linearly represent the state-action value\nfunction induced by any policies, thus can be regarded as skills. The skill\nrepresentations are transferable across arbitrary tasks with the same\ntransition dynamics. Moreover, to handle the sim-to-real gap in the dynamics,\nwe propose a skill discovery algorithm that learns new skills caused by the\nsim-to-real gap from real-world data. We promote the discovery of new skills by\nenforcing orthogonal constraints between the skills to learn and the skills\nfrom simulators, and then synthesize the policy using the enlarged skill sets.\nWe demonstrate our methodology by transferring quadrotor controllers from\nsimulators to Crazyflie 2.1 quadrotors. We show that we can learn the skill\nrepresentations from a single simulator task and transfer these to multiple\ndifferent real-world tasks including hovering, taking off, landing and\ntrajectory tracking. Our skill discovery approach helps narrow the sim-to-real\ngap and improve the real-world controller performance by up to 30.2%.\n","authors":["Haitong Ma","Zhaolin Ren","Bo Dai","Na Li"],"pdf_url":"https://arxiv.org/pdf/2404.05051v1.pdf","comment":"9 pages, 6 figures. Project page:\n https://congharvard.github.io/steady-sim-to-real/"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.05107v1","updated":"2024-04-07T23:31:37Z","published":"2024-04-07T23:31:37Z","title":"Reconstructing Retinal Visual Images from 3T fMRI Data Enhanced by\n Unsupervised Learning","summary":" The reconstruction of human visual inputs from brain activity, particularly\nthrough functional Magnetic Resonance Imaging (fMRI), holds promising avenues\nfor unraveling the mechanisms of the human visual system. Despite the\nsignificant strides made by deep learning methods in improving the quality and\ninterpretability of visual reconstruction, there remains a substantial demand\nfor high-quality, long-duration, subject-specific 7-Tesla fMRI experiments. The\nchallenge arises in integrating diverse smaller 3-Tesla datasets or\naccommodating new subjects with brief and low-quality fMRI scans. In response\nto these constraints, we propose a novel framework that generates enhanced 3T\nfMRI data through an unsupervised Generative Adversarial Network (GAN),\nleveraging unpaired training across two distinct fMRI datasets in 7T and 3T,\nrespectively. This approach aims to overcome the limitations of the scarcity of\nhigh-quality 7-Tesla data and the challenges associated with brief and\nlow-quality scans in 3-Tesla experiments. 
In this paper, we demonstrate the\nreconstruction capabilities of the enhanced 3T fMRI data, highlighting its\nproficiency in generating superior input visual images compared to\ndata-intensive methods trained and tested on a single subject.\n","authors":["Yujian Xiong","Wenhui Zhu","Zhong-Lin Lu","Yalin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.05107v1.pdf","comment":"Accepted by ISBI 2024"},{"id":"http://arxiv.org/abs/2307.05845v5","updated":"2024-04-07T23:27:06Z","published":"2023-07-11T23:36:49Z","title":"PIGEON: Predicting Image Geolocations","summary":" Planet-scale image geolocalization remains a challenging problem due to the\ndiversity of images originating from anywhere in the world. Although approaches\nbased on vision transformers have made significant progress in geolocalization\naccuracy, success in prior literature is constrained to narrow distributions of\nimages of landmarks, and performance has not generalized to unseen places. We\npresent a new geolocalization system that combines semantic geocell creation,\nmulti-task contrastive pretraining, and a novel loss function. Additionally,\nour work is the first to perform retrieval over location clusters for guess\nrefinements. We train two models for evaluations on street-level data and\ngeneral-purpose image geolocalization; the first model, PIGEON, is trained on\ndata from the game of Geoguessr and is capable of placing over 40% of its\nguesses within 25 kilometers of the target location globally. We also develop a\nbot and deploy PIGEON in a blind experiment against humans, ranking in the top\n0.01% of players. We further challenge one of the world's foremost professional\nGeoguessr players to a series of six matches with millions of viewers, winning\nall six games. Our second model, PIGEOTTO, differs in that it is trained on a\ndataset of images from Flickr and Wikipedia, achieving state-of-the-art results\non a wide range of image geolocalization benchmarks, outperforming the previous\nSOTA by up to 7.7 percentage points on the city accuracy level and up to 38.8\npercentage points on the country level. Our findings suggest that PIGEOTTO is\nthe first image geolocalization model that effectively generalizes to unseen\nplaces and that our approach can pave the way for highly accurate, planet-scale\nimage geolocalization systems. Our code is available on GitHub.\n","authors":["Lukas Haas","Michal Skreta","Silas Alberti","Chelsea Finn"],"pdf_url":"https://arxiv.org/pdf/2307.05845v5.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2404.05105v1","updated":"2024-04-07T23:10:26Z","published":"2024-04-07T23:10:26Z","title":"VMambaMorph: a Visual Mamba-based Framework with Cross-Scan Module for\n Deformable 3D Image Registration","summary":" Image registration, a critical process in medical imaging, involves aligning\ndifferent sets of medical imaging data into a single unified coordinate system.\nDeep learning networks, such as the Convolutional Neural Network (CNN)-based\nVoxelMorph, Vision Transformer (ViT)-based TransMorph, and State Space Model\n(SSM)-based MambaMorph, have demonstrated effective performance in this domain.\nThe recent Visual State Space Model (VMamba), which incorporates a cross-scan\nmodule with SSM, has exhibited promising improvements in modeling global-range\ndependencies with efficient computational cost in computer vision tasks. This\npaper hereby introduces an exploration of VMamba with image registration, named\nVMambaMorph. 
This novel hybrid VMamba-CNN network is designed specifically for\n3D image registration. Utilizing a U-shaped network architecture, VMambaMorph\ncomputes the deformation field based on target and source volumes. The\nVMamba-based block with 2D cross-scan module is redesigned for 3D volumetric\nfeature processing, and a fine-grained feature extraction module is proposed\nfor high-dimensional feature learning. We validate VMambaMorph using a public\nbenchmark brain MR-CT registration dataset, comparing its performance against\ncurrent state-of-the-art methods. The results indicate that VMambaMorph\nachieves competitive registration quality. The code for VMambaMorph is\navailable on GitHub.\n","authors":["Ziyang Wang","Jian-Qing Zheng","Chao Ma","Tao Guo"],"pdf_url":"https://arxiv.org/pdf/2404.05105v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05102v1","updated":"2024-04-07T22:58:18Z","published":"2024-04-07T22:58:18Z","title":"LHU-Net: A Light Hybrid U-Net for Cost-Efficient, High-Performance\n Volumetric Medical Image Segmentation","summary":" As a result of the rise of Transformer architectures in medical image\nanalysis, specifically in the domain of medical image segmentation, a multitude\nof hybrid models have been created that merge the advantages of Convolutional\nNeural Networks (CNNs) and Transformers. These hybrid models have achieved\nnotable success by significantly improving segmentation accuracy. Yet, this\nprogress often comes at the cost of increased model complexity, both in terms\nof parameters and computational demand. Moreover, many of these models fail to\nconsider the crucial interplay between spatial and channel features, which\ncould further refine and improve segmentation outcomes. To address this, we\nintroduce LHU-Net, a Light Hybrid U-Net architecture optimized for volumetric\nmedical image segmentation. LHU-Net is meticulously designed to prioritize\nspatial feature analysis in its initial layers before shifting focus to\nchannel-based features in its deeper layers, ensuring a comprehensive feature\nextraction process. Rigorous evaluation across five benchmark datasets -\nSynapse, LA, Pancreas, ACDC, and BRaTS 2018 - underscores LHU-Net's superior\nperformance, showcasing its dual capacity for efficiency and accuracy. Notably,\nLHU-Net sets new performance benchmarks, such as attaining a Dice score of\n92.66 on the ACDC dataset, while simultaneously reducing parameters by 85% and\nquartering the computational load compared to existing state-of-the-art models.\nAchieved without any reliance on pre-training, additional data, or model\nensemble, LHU-Net's effectiveness is further evidenced by its state-of-the-art\nperformance across all evaluated datasets, utilizing fewer than 11 million\nparameters. This achievement highlights that balancing computational efficiency\nwith high accuracy in medical image segmentation is feasible. Our\nimplementation of LHU-Net is freely accessible to the research community on\nGitHub.\n","authors":["Yousef Sadegheih","Afshin Bozorgpour","Pratibha Kumari","Reza Azad","Dorit Merhof"],"pdf_url":"https://arxiv.org/pdf/2404.05102v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04001v4","updated":"2024-04-07T22:46:13Z","published":"2023-09-07T20:07:57Z","title":"MMSFormer: Multimodal Transformer for Material and Semantic Segmentation","summary":" Leveraging information across diverse modalities is known to enhance\nperformance on multimodal segmentation tasks. 
However, effectively fusing\ninformation from different modalities remains challenging due to the unique\ncharacteristics of each modality. In this paper, we propose a novel fusion\nstrategy that can effectively fuse information from different modality\ncombinations. We also propose a new model named Multi-Modal Segmentation\nTransFormer (MMSFormer) that incorporates the proposed fusion strategy to\nperform multimodal material and semantic segmentation tasks. MMSFormer\noutperforms current state-of-the-art models on three different datasets. As we\nbegin with only one input modality, performance improves progressively as\nadditional modalities are incorporated, showcasing the effectiveness of the\nfusion block in combining useful information from diverse input modalities.\nAblation studies show that different modules in the fusion block are crucial\nfor overall model performance. Furthermore, our ablation studies also highlight\nthe capacity of different input modalities to improve performance in the\nidentification of different types of materials. The code and pretrained models\nwill be made available at https://github.com/csiplab/MMSFormer.\n","authors":["Md Kaykobad Reza","Ashley Prater-Bennette","M. Salman Asif"],"pdf_url":"https://arxiv.org/pdf/2309.04001v4.pdf","comment":"Accepted by IEEE Open Journal of Signal Processing. 15 pages, 3\n figures, 9 tables"}]},"2024-04-06T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2311.06694v3","updated":"2024-04-06T22:14:25Z","published":"2023-11-12T00:21:58Z","title":"Which One? Leveraging Context Between Objects and Multiple Views for\n Language Grounding","summary":" When connecting objects and their language referents in an embodied 3D\nenvironment, it is important to note that: (1) an object can be better\ncharacterized by leveraging comparative information between itself and other\nobjects, and (2) an object's appearance can vary with camera position. As such,\nwe present the Multi-view Approach to Grounding in Context (MAGiC), which\nselects an object referent based on language that distinguishes between two\nsimilar objects. By pragmatically reasoning over both objects and across\nmultiple views of those objects, MAGiC improves over the state-of-the-art model\non the SNARE object reference task with a relative error reduction of 12.9\\%\n(representing an absolute improvement of 2.7\\%). Ablation studies show that\nreasoning jointly over object referent candidates and multiple views of each\nobject both contribute to improved accuracy. Code:\nhttps://github.com/rcorona/magic_snare/\n","authors":["Chancharik Mitra","Abrar Anwar","Rodolfo Corona","Dan Klein","Trevor Darrell","Jesse Thomason"],"pdf_url":"https://arxiv.org/pdf/2311.06694v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04070v2","updated":"2024-04-06T21:48:17Z","published":"2024-02-06T15:17:09Z","title":"Spatial Assisted Human-Drone Collaborative Navigation and Interaction\n through Immersive Mixed Reality","summary":" Aerial robots have the potential to play a crucial role in assisting humans\nwith complex and dangerous tasks. Nevertheless, the future industry demands\ninnovative solutions to streamline the interaction process between humans and\ndrones to enable seamless collaboration and efficient co-working. 
In this\npaper, we present a novel tele-immersive framework that promotes cognitive and\nphysical collaboration between humans and robots through Mixed Reality (MR).\nThis framework incorporates a novel bi-directional spatial awareness and a\nmulti-modal virtual-physical interaction approaches. The former seamlessly\nintegrates the physical and virtual worlds, offering bidirectional egocentric\nand exocentric environmental representations. The latter, leveraging the\nproposed spatial representation, further enhances the collaboration combining a\nrobot planning algorithm for obstacle avoidance with a variable admittance\ncontrol. This allows users to issue commands based on virtual forces while\nmaintaining compatibility with the environment map. We validate the proposed\napproach by performing several collaborative planning and exploration tasks\ninvolving a drone and an user equipped with a MR headset.\n","authors":["Luca Morando","Giuseppe Loianno"],"pdf_url":"https://arxiv.org/pdf/2402.04070v2.pdf","comment":"Currently Accepted at International Conference on Robotics and\n Automation (ICRA) 2024, Nominated as Finalist for IEEE ICRA 2024 Best Paper\n Award on Unmanned Aerial Vehicles"},{"id":"http://arxiv.org/abs/2312.01183v2","updated":"2024-04-06T17:48:15Z","published":"2023-12-02T17:16:56Z","title":"Comprehensive Robotic Cholecystectomy Dataset (CRCD): Integrating\n Kinematics, Pedal Signals, and Endoscopic Videos","summary":" In recent years, the potential applications of machine learning to Minimally\nInvasive Surgery (MIS) have spurred interest in data sets that can be used to\ndevelop data-driven tools. This paper introduces a novel dataset recorded\nduring ex vivo pseudo-cholecystectomy procedures on pig livers, utilizing the\nda Vinci Research Kit (dVRK). Unlike current datasets, ours bridges a critical\ngap by offering not only full kinematic data but also capturing all pedal\ninputs used during the procedure and providing a time-stamped record of the\nendoscope's movements. Contributed by seven surgeons, this data set introduces\na new dimension to surgical robotics research, allowing the creation of\nadvanced models for automating console functionalities. Our work addresses the\nexisting limitation of incomplete recordings and imprecise kinematic data,\ncommon in other datasets. By introducing two models, dedicated to predicting\nclutch usage and camera activation, we highlight the dataset's potential for\nadvancing automation in surgical robotics. The comparison of methodologies and\ntime windows provides insights into the models' boundaries and limitations.\n","authors":["Ki-Hwan Oh","Leonardo Borgioli","Alberto Mangano","Valentina Valle","Marco Di Pangrazio","Francesco Toti","Gioia Pozza","Luciano Ambrosini","Alvaro Ducas","Milos Zefran","Liaohai Chen","Pier Cristoforo Giulianotti"],"pdf_url":"https://arxiv.org/pdf/2312.01183v2.pdf","comment":"6 pages, 8 figures, 5 tables. Accepted for presentation at the 2024\n International Symposium on Medical Robotics"},{"id":"http://arxiv.org/abs/2404.04698v1","updated":"2024-04-06T17:47:42Z","published":"2024-04-06T17:47:42Z","title":"EAGLE: The First Event Camera Dataset Gathered by an Agile Quadruped\n Robot","summary":" When legged robots perform agile movements, traditional RGB cameras often\nproduce blurred images, posing a challenge for accurate state estimation. 
Event\ncameras, inspired by biological vision mechanisms, have emerged as a promising\nsolution for capturing high-speed movements and coping with challenging\nlighting conditions, owing to their significant advantages, such as low\nlatency, high temporal resolution, and a high dynamic range. However, the\nintegration of event cameras into agile-legged robots is still largely\nunexplored. Notably, no event camera-based dataset has yet been specifically\ndeveloped for dynamic legged robots. To bridge this gap, we introduce EAGLE\n(Event dataset of an AGile LEgged robot), a new dataset comprising data from an\nevent camera, an RGB-D camera, an IMU, a LiDAR, and joint angle encoders, all\nmounted on a quadruped robotic platform. This dataset features more than 100\nsequences from real-world environments, encompassing various indoor and outdoor\nenvironments, different lighting conditions, a range of robot gaits (e.g.,\ntrotting, bounding, pronking), as well as acrobatic movements such as\nbackflipping. To our knowledge, this is the first event camera dataset to\ninclude multi-sensory data collected by an agile quadruped robot.\n","authors":["Shifan Zhu","Zixun Xiong","Donghyun Kim"],"pdf_url":"https://arxiv.org/pdf/2404.04698v1.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.04693v1","updated":"2024-04-06T17:41:36Z","published":"2024-04-06T17:41:36Z","title":"OmniColor: A Global Camera Pose Optimization Approach of LiDAR-360Camera\n Fusion for Colorizing Point Clouds","summary":" A Colored point cloud, as a simple and efficient 3D representation, has many\nadvantages in various fields, including robotic navigation and scene\nreconstruction. This representation is now commonly used in 3D reconstruction\ntasks relying on cameras and LiDARs. However, fusing data from these two types\nof sensors is poorly performed in many existing frameworks, leading to\nunsatisfactory mapping results, mainly due to inaccurate camera poses. This\npaper presents OmniColor, a novel and efficient algorithm to colorize point\nclouds using an independent 360-degree camera. Given a LiDAR-based point cloud\nand a sequence of panorama images with initial coarse camera poses, our\nobjective is to jointly optimize the poses of all frames for mapping images\nonto geometric reconstructions. Our pipeline works in an off-the-shelf manner\nthat does not require any feature extraction or matching process. Instead, we\nfind optimal poses by directly maximizing the photometric consistency of LiDAR\nmaps. In experiments, we show that our method can overcome the severe visual\ndistortion of omnidirectional images and greatly benefit from the wide field of\nview (FOV) of 360-degree cameras to reconstruct various scenarios with accuracy\nand stability. The code will be released at\nhttps://github.com/liubonan123/OmniColor/.\n","authors":["Bonan Liu","Guoyang Zhao","Jianhao Jiao","Guang Cai","Chengyang Li","Handi Yin","Yuyang Wang","Ming Liu","Pan Hui"],"pdf_url":"https://arxiv.org/pdf/2404.04693v1.pdf","comment":"2024 IEEE International Conference on Robotics and Automation"},{"id":"http://arxiv.org/abs/2404.04682v1","updated":"2024-04-06T17:02:18Z","published":"2024-04-06T17:02:18Z","title":"Compositional Conservatism: A Transductive Approach in Offline\n Reinforcement Learning","summary":" Offline reinforcement learning (RL) is a compelling framework for learning\noptimal policies from past experiences without additional interaction with the\nenvironment. 
Nevertheless, offline RL inevitably faces the problem of\ndistributional shifts, where the states and actions encountered during policy\nexecution may not be in the training dataset distribution. A common solution\ninvolves incorporating conservatism into the policy or the value function to\nsafeguard against uncertainties and unknowns. In this work, we focus on\nachieving the same objectives of conservatism but from a different perspective.\nWe propose COmpositional COnservatism with Anchor-seeking (COCOA) for offline\nRL, an approach that pursues conservatism in a compositional manner on top of\nthe transductive reparameterization (Netanyahu et al., 2023), which decomposes\nthe input variable (the state in our case) into an anchor and its difference\nfrom the original input. Our COCOA seeks both in-distribution anchors and\ndifferences by utilizing the learned reverse dynamics model, encouraging\nconservatism in the compositional input space for the policy or value function.\nSuch compositional conservatism is independent of and agnostic to the prevalent\nbehavioral conservatism in offline RL. We apply COCOA to four state-of-the-art\noffline RL algorithms and evaluate them on the D4RL benchmark, where COCOA\ngenerally improves the performance of each algorithm. The code is available at\nhttps://github.com/runamu/compositional-conservatism.\n","authors":["Yeda Song","Dongwook Lee","Gunhee Kim"],"pdf_url":"https://arxiv.org/pdf/2404.04682v1.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2404.04677v1","updated":"2024-04-06T16:48:08Z","published":"2024-04-06T16:48:08Z","title":"Salient Sparse Visual Odometry With Pose-Only Supervision","summary":" Visual Odometry (VO) is vital for the navigation of autonomous systems,\nproviding accurate position and orientation estimates at reasonable costs.\nWhile traditional VO methods excel in some conditions, they struggle with\nchallenges like variable lighting and motion blur. Deep learning-based VO,\nthough more adaptable, can face generalization problems in new environments.\nAddressing these drawbacks, this paper presents a novel hybrid visual odometry\n(VO) framework that leverages pose-only supervision, offering a balanced\nsolution between robustness and the need for extensive labeling. We propose two\ncost-effective and innovative designs: a self-supervised homographic\npre-training for enhancing optical flow learning from pose-only labels and a\nrandom patch-based salient point detection strategy for more accurate optical\nflow patch extraction. These designs eliminate the need for dense optical flow\nlabels for training and significantly improve the generalization capability of\nthe system in diverse and challenging environments. 
Our pose-only supervised\nmethod achieves competitive performance on standard datasets and greater\nrobustness and generalization ability in extreme and unseen scenarios, even\ncompared to dense optical flow-supervised state-of-the-art methods.\n","authors":["Siyu Chen","Kangcheng Liu","Chen Wang","Shenghai Yuan","Jianfei Yang","Lihua Xie"],"pdf_url":"https://arxiv.org/pdf/2404.04677v1.pdf","comment":"Accepted by IEEE Robotics and Automation Letters"},{"id":"http://arxiv.org/abs/2404.04653v1","updated":"2024-04-06T15:10:29Z","published":"2024-04-06T15:10:29Z","title":"HawkDrive: A Transformer-driven Visual Perception System for Autonomous\n Driving in Night Scene","summary":" Many established vision perception systems for autonomous driving scenarios\nignore the influence of light conditions, one of the key elements for driving\nsafety. To address this problem, we present HawkDrive, a novel perception\nsystem with hardware and software solutions. Hardware that utilizes stereo\nvision perception, which has been demonstrated to be a more reliable way of\nestimating depth information than monocular vision, is partnered with the edge\ncomputing device Nvidia Jetson Xavier AGX. Our software for low light\nenhancement, depth estimation, and semantic segmentation tasks, is a\ntransformer-based neural network. Our software stack, which enables fast\ninference and noise reduction, is packaged into system modules in Robot\nOperating System 2 (ROS2). Our experimental results have shown that the\nproposed end-to-end system is effective in improving the depth estimation and\nsemantic segmentation performance. Our dataset and codes will be released at\nhttps://github.com/ZionGo6/HawkDrive.\n","authors":["Ziang Guo","Stepan Perminov","Mikhail Konenkov","Dzmitry Tsetserukou"],"pdf_url":"https://arxiv.org/pdf/2404.04653v1.pdf","comment":"Accepted by IEEE IV 2024"},{"id":"http://arxiv.org/abs/2404.04643v1","updated":"2024-04-06T14:28:01Z","published":"2024-04-06T14:28:01Z","title":"Constrained 6-DoF Grasp Generation on Complex Shapes for Improved\n Dual-Arm Manipulation","summary":" Efficiently generating grasp poses tailored to specific regions of an object\nis vital for various robotic manipulation tasks, especially in a dual-arm\nsetup. This scenario presents a significant challenge due to the complex\ngeometries involved, requiring a deep understanding of the local geometry to\ngenerate grasps efficiently on the specified constrained regions. Existing\nmethods only explore settings involving table-top/small objects and require\naugmented datasets to train, limiting their performance on complex objects. We\npropose CGDF: Constrained Grasp Diffusion Fields, a diffusion-based grasp\ngenerative model that generalizes to objects with arbitrary geometries, as well\nas generates dense grasps on the target regions. 
CGDF uses a part-guided\ndiffusion approach that enables it to get high sample efficiency in constrained\ngrasping without explicitly training on massive constraint-augmented datasets.\nWe provide qualitative and quantitative comparisons using analytical metrics\nand in simulation, in both unconstrained and constrained settings to show that\nour method can generalize to generate stable grasps on complex objects,\nespecially useful for dual-arm manipulation settings, while existing methods\nstruggle to do so.\n","authors":["Gaurav Singh","Sanket Kalwar","Md Faizal Karim","Bipasha Sen","Nagamanikandan Govindan","Srinath Sridhar","K Madhava Krishna"],"pdf_url":"https://arxiv.org/pdf/2404.04643v1.pdf","comment":"Project Page: https://constrained-grasp-diffusion.github.io/"},{"id":"http://arxiv.org/abs/2305.20044v2","updated":"2024-04-06T14:17:11Z","published":"2023-05-31T17:14:25Z","title":"Probabilistic Uncertainty Quantification of Prediction Models with\n Application to Visual Localization","summary":" The uncertainty quantification of prediction models (e.g., neural networks)\nis crucial for their adoption in many robotics applications. This is arguably\nas important as making accurate predictions, especially for safety-critical\napplications such as self-driving cars. This paper proposes our approach to\nuncertainty quantification in the context of visual localization for autonomous\ndriving, where we predict locations from images. Our proposed framework\nestimates probabilistic uncertainty by creating a sensor error model that maps\nan internal output of the prediction model to the uncertainty. The sensor error\nmodel is created using multiple image databases of visual localization, each\nwith ground-truth location. We demonstrate the accuracy of our uncertainty\nprediction framework using the Ithaca365 dataset, which includes variations in\nlighting, weather (sunny, snowy, night), and alignment errors between\ndatabases. We analyze both the predicted uncertainty and its incorporation into\na Kalman-based localization filter. Our results show that prediction error\nvariations increase with poor weather and lighting condition, leading to\ngreater uncertainty and outliers, which can be predicted by our proposed\nuncertainty model. Additionally, our probabilistic error model enables the\nfilter to remove ad hoc sensor gating, as the uncertainty automatically adjusts\nthe model to the input data\n","authors":["Junan Chen","Josephine Monica","Wei-Lun Chao","Mark Campbell"],"pdf_url":"https://arxiv.org/pdf/2305.20044v2.pdf","comment":"Extended version of our ICRA2023 paper"},{"id":"http://arxiv.org/abs/2303.09452v2","updated":"2024-04-06T13:43:43Z","published":"2023-03-16T16:29:17Z","title":"Learning-Based Modeling of Human-Autonomous Vehicle Interaction for\n Improved Safety in Mixed-Vehicle Platooning Control","summary":" The rising presence of autonomous vehicles (AVs) on public roads necessitates\nthe development of advanced control strategies that account for the\nunpredictable nature of human-driven vehicles (HVs). This study introduces a\nlearning-based method for modeling HV behavior, combining a traditional\nfirst-principles approach with a Gaussian process (GP) learning component. This\nhybrid model enhances the accuracy of velocity predictions and provides\nmeasurable uncertainty estimates. 
We leverage this model to develop a GP-based\nmodel predictive control (GP-MPC) strategy to improve safety in mixed vehicle\nplatoons by integrating uncertainty assessments into distance constraints.\nComparative simulations between our GP-MPC approach and a conventional model\npredictive control (MPC) strategy reveal that the GP-MPC ensures safer\ndistancing and more efficient travel within the mixed platoon. By incorporating\nsparse GP modeling for HVs and a dynamic GP prediction in MPC, we significantly\nreduce the computation time of GP-MPC, making it only marginally longer than\nstandard MPC and approximately 100 times faster than previous models not\nemploying these techniques. Our findings underscore the effectiveness of\nlearning-based HV modeling in enhancing safety and efficiency in mixed-traffic\nenvironments involving AV and HV interactions.\n","authors":["Jie Wang","Yash Vardhan Pant","Zhihao Jiang"],"pdf_url":"https://arxiv.org/pdf/2303.09452v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04589v1","updated":"2024-04-06T10:57:57Z","published":"2024-04-06T10:57:57Z","title":"ars548_ros. An ARS 548 RDI radar driver for ROS2","summary":" The ARS 548 RDI Radar is a premium model of the fifth generation of 77 GHz\nlong range radar sensors with new RF antenna arrays, which offer digital beam\nforming. This radar measures independently the distance, speed and angle of\nobjects without any reflectors in one measurement cycle based on Pulse\nCompression with New Frequency Modulation [1]. Unfortunately, there were not\nany drivers available for Linux systems to make the user able to analyze the\ndata acquired from this sensor to the best of our knowledge. In this paper, we\npresent a driver that is able to interpret the data from the ARS 548 RDI sensor\nand produce data in Robot Operation System version 2 (ROS2). Thus, this data\ncan be stored, represented and analyzed by using the powerful tools offered by\nROS2. Besides, our driver offers advanced object features provided by the\nsensor, such as relative estimated velocity and acceleration of each object,\nits orientation and angular velocity. We focus on the configuration of the\nsensor and the use of our driver and advanced filtering and representation\ntools, offering a video tutorial for these purposes. Finally, a dataset\nacquired with this sensor and an Ouster OS1-32 LiDAR sensor for baseline\npurposes is available, so that the user can check the correctness of our\ndriver.\n","authors":["Fernando Fernández-Calatayud","Lucía Coto-Elena","David Alejo","José J. Carpio-Jiménez","Fernando Caballero","Luis Merino"],"pdf_url":"https://arxiv.org/pdf/2404.04589v1.pdf","comment":"7 pages, 6 figures and 17 references"},{"id":"http://arxiv.org/abs/2309.13586v3","updated":"2024-04-06T06:20:46Z","published":"2023-09-24T09:01:19Z","title":"Task-Oriented Dexterous Hand Pose Synthesis Using Differentiable Grasp\n Wrench Boundary Estimator","summary":" This work tackles the problem of task-oriented dexterous hand pose synthesis,\nwhich involves generating a static hand pose capable of applying a\ntask-specific set of wrenches to manipulate objects. Unlike previous approaches\nthat focus solely on force-closure grasps, which are unsuitable for\nnon-prehensile manipulation tasks (\\textit{e.g.}, turning a knob or pressing a\nbutton), we introduce a unified framework covering force-closure grasps,\nnon-force-closure grasps, and a variety of non-prehensile poses. 
Our key idea\nis a novel optimization objective quantifying the disparity between the Task\nWrench Space (TWS, the desired wrenches predefined as a task prior) and the\nGrasp Wrench Space (GWS, the achievable wrenches computed from the current hand\npose). By minimizing this objective, gradient-based optimization algorithms can\nsynthesize task-oriented hand poses without additional human demonstrations.\nOur specific contributions include 1) a fast, accurate, and differentiable\ntechnique for estimating the GWS boundary; 2) a task-oriented objective\nfunction based on the disparity between the estimated GWS boundary and the\nprovided TWS boundary; and 3) an efficient implementation of the synthesis\npipeline that leverages CUDA accelerations and supports large-scale\nparalleling. Experimental results on 10 diverse tasks demonstrate a 72.6\\%\nsuccess rate in simulation. Furthermore, real-world validation for 4 tasks\nconfirms the effectiveness of synthesized poses for manipulation. Notably,\ndespite being primarily tailored for task-oriented hand pose synthesis, our\npipeline can generate force-closure grasps 50 times faster than DexGraspNet\nwhile maintaining comparable grasp quality. Project page:\nhttps://pku-epic.github.io/TaskDexGrasp/.\n","authors":["Jiayi Chen","Yuxing Chen","Jialiang Zhang","He Wang"],"pdf_url":"https://arxiv.org/pdf/2309.13586v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04497v1","updated":"2024-04-06T04:15:05Z","published":"2024-04-06T04:15:05Z","title":"Self-organizing Multiagent Target Enclosing under Limited Information\n and Safety Guarantees","summary":" This paper introduces an approach to address the target enclosing problem\nusing non-holonomic multiagent systems, where agents autonomously self-organize\nthemselves in the desired formation around a fixed target. Our approach\ncombines global enclosing behavior and local collision avoidance mechanisms by\ndevising a novel potential function and sliding manifold. In our approach,\nagents independently move toward the desired enclosing geometry when apart and\nactivate the collision avoidance mechanism when a collision is imminent,\nthereby guaranteeing inter-agent safety. We rigorously show that an agent does\nnot need to ensure safety with every other agent and put forth a concept of the\nnearest colliding agent (for any arbitrary agent) with whom ensuring safety is\nsufficient to avoid collisions in the entire swarm. The proposed control\neliminates the need for a fixed or pre-established agent arrangement around the\ntarget and requires only relative information between an agent and the target.\nThis makes our design particularly appealing for scenarios with limited global\ninformation, hence significantly reducing communication requirements. We\nfinally present simulation results to vindicate the efficacy of the proposed\nmethod.\n","authors":["Praveen Kumar Ranjan","Abhinav Sinha","Yongcan Cao"],"pdf_url":"https://arxiv.org/pdf/2404.04497v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13801v2","updated":"2024-04-06T04:12:47Z","published":"2024-03-20T17:58:12Z","title":"Natural Language as Policies: Reasoning for Coordinate-Level Embodied\n Control with LLMs","summary":" We demonstrate experimental results with LLMs that address robotics task\nplanning problems. Recently, LLMs have been applied in robotics task planning,\nparticularly using a code generation approach that converts complex high-level\ninstructions into mid-level policy codes. 
In contrast, our approach acquires\ntext descriptions of the task and scene objects, then formulates task planning\nthrough natural language reasoning, and outputs coordinate level control\ncommands, thus reducing the necessity for intermediate representation code as\npolicies with pre-defined APIs. Our approach is evaluated on a multi-modal\nprompt simulation benchmark, demonstrating that our prompt engineering\nexperiments with natural language reasoning significantly enhance success rates\ncompared to its absence. Furthermore, our approach illustrates the potential\nfor natural language descriptions to transfer robotics skills from known tasks\nto previously unseen tasks. The project website:\nhttps://natural-language-as-policies.github.io/\n","authors":["Yusuke Mikami","Andrew Melnik","Jun Miura","Ville Hautamäki"],"pdf_url":"https://arxiv.org/pdf/2403.13801v2.pdf","comment":"8 pages, 2 figures"},{"id":"http://arxiv.org/abs/2404.04492v1","updated":"2024-04-06T03:48:29Z","published":"2024-04-06T03:48:29Z","title":"Automated Lane Change Behavior Prediction and Environmental Perception\n Based on SLAM Technology","summary":" In addition to environmental perception sensors such as cameras, radars, etc.\nin the automatic driving system, the external environment of the vehicle is\nperceived, in fact, there is also a perception sensor that has been silently\ndedicated in the system, that is, the positioning module. This paper explores\nthe application of SLAM (Simultaneous Localization and Mapping) technology in\nthe context of automatic lane change behavior prediction and environment\nperception for autonomous vehicles. It discusses the limitations of traditional\npositioning methods, introduces SLAM technology, and compares LIDAR SLAM with\nvisual SLAM. Real-world examples from companies like Tesla, Waymo, and Mobileye\nshowcase the integration of AI-driven technologies, sensor fusion, and SLAM in\nautonomous driving systems. The paper then delves into the specifics of SLAM\nalgorithms, sensor technologies, and the importance of automatic lane changes\nin driving safety and efficiency. It highlights Tesla's recent update to its\nAutopilot system, which incorporates automatic lane change functionality using\nSLAM technology. The paper concludes by emphasizing the crucial role of SLAM in\nenabling accurate environment perception, positioning, and decision-making for\nautonomous vehicles, ultimately enhancing safety and driving experience.\n","authors":["Han Lei","Baoming Wang","Zuwei Shui","Peiyuan Yang","Penghao Liang"],"pdf_url":"https://arxiv.org/pdf/2404.04492v1.pdf","comment":null}]},"2024-04-09T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2404.06452v1","updated":"2024-04-09T16:53:52Z","published":"2024-04-09T16:53:52Z","title":"PAAM: A Framework for Coordinated and Priority-Driven Accelerator\n Management in ROS 2","summary":" This paper proposes a Priority-driven Accelerator Access Management (PAAM)\nframework for multi-process robotic applications built on top of the Robot\nOperating System (ROS) 2 middleware platform. The framework addresses the issue\nof predictable execution of time- and safety-critical callback chains that\nrequire hardware accelerators such as GPUs and TPUs. PAAM provides a standalone\nROS executor that acts as an accelerator resource server, arbitrating\naccelerator access requests from all other callbacks at the application layer.\nThis approach enables coordinated and priority-driven accelerator access\nmanagement in multi-process robotic systems. 
The framework design is directly\napplicable to all types of accelerators and enables granular control over how\nspecific chains access accelerators, making it possible to achieve predictable\nreal-time support for accelerators used by safety-critical callback chains\nwithout making changes to underlying accelerator device drivers. The paper\nshows that PAAM also offers a theoretical analysis that can upper bound the\nworst-case response time of safety-critical callback chains that necessitate\naccelerator access. This paper also demonstrates that complex robotic systems\nwith extensive accelerator usage that are integrated with PAAM may achieve up\nto a 91\\% reduction in end-to-end response time of their critical callback\nchains.\n","authors":["Daniel Enright","Yecheng Xiang","Hyunjong Choi","Hyoseung Kim"],"pdf_url":"https://arxiv.org/pdf/2404.06452v1.pdf","comment":"14 Pages, 14 Figures"},{"id":"http://arxiv.org/abs/2404.06442v1","updated":"2024-04-09T16:42:54Z","published":"2024-04-09T16:42:54Z","title":"QueSTMaps: Queryable Semantic Topological Maps for 3D Scene\n Understanding","summary":" Understanding the structural organisation of 3D indoor scenes in terms of\nrooms is often accomplished via floorplan extraction. Robotic tasks such as\nplanning and navigation require a semantic understanding of the scene as well.\nThis is typically achieved via object-level semantic segmentation. However,\nsuch methods struggle to segment out topological regions like \"kitchen\" in the\nscene. In this work, we introduce a two-step pipeline. First, we extract a\ntopological map, i.e., floorplan of the indoor scene using a novel\nmulti-channel occupancy representation. Then, we generate CLIP-aligned features\nand semantic labels for every room instance based on the objects it contains\nusing a self-attention transformer. Our language-topology alignment supports\nnatural language querying, e.g., a \"place to cook\" locates the \"kitchen\". We\noutperform the current state-of-the-art on room segmentation by ~20% and room\nclassification by ~12%. Our detailed qualitative analysis and ablation studies\nprovide insights into the problem of joint structural and semantic 3D scene\nunderstanding.\n","authors":["Yash Mehan","Kumaraditya Gupta","Rohit Jayanti","Anirudh Govil","Sourav Garg","Madhava Krishna"],"pdf_url":"https://arxiv.org/pdf/2404.06442v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06423v1","updated":"2024-04-09T16:14:03Z","published":"2024-04-09T16:14:03Z","title":"Deep Reinforcement Learning-Based Approach for a Single Vehicle\n Persistent Surveillance Problem with Fuel Constraints","summary":" This article presents a deep reinforcement learning-based approach to tackle\na persistent surveillance mission requiring a single unmanned aerial vehicle\ninitially stationed at a depot with fuel or time-of-flight constraints to\nrepeatedly visit a set of targets with equal priority. Owing to the vehicle's\nfuel or time-of-flight constraints, the vehicle must be regularly refueled, or\nits battery must be recharged at the depot. The objective of the problem is to\ndetermine an optimal sequence of visits to the targets that minimizes the\nmaximum time elapsed between successive visits to any target while ensuring\nthat the vehicle never runs out of fuel or charge. 
We present a deep\nreinforcement learning algorithm to solve this problem and present the results\nof numerical experiments that corroborate the effectiveness of this approach in\ncomparison with common-sense greedy heuristics.\n","authors":["Hritik Bana","Manav Mishra","Saswata Sarkar","Sujeevraja Sanjeevi","Sujit PB","Kaarthik Sundar"],"pdf_url":"https://arxiv.org/pdf/2404.06423v1.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2404.06413v1","updated":"2024-04-09T16:03:26Z","published":"2024-04-09T16:03:26Z","title":"Large Language Models to the Rescue: Deadlock Resolution in Multi-Robot\n Systems","summary":" Multi-agent robotic systems are prone to deadlocks in an obstacle environment\nwhere the system can get stuck away from its desired location under a smooth\nlow-level control policy. Without an external intervention, often in terms of a\nhigh-level command, it is not possible to guarantee that just a low-level\ncontrol policy can resolve such deadlocks. Utilizing the generalizability and\nlow data requirements of large language models (LLMs), this paper explores the\npossibility of using LLMs for deadlock resolution. We propose a hierarchical\ncontrol framework where an LLM resolves deadlocks by assigning a leader and\ndirection for the leader to move along. A graph neural network (GNN) based\nlow-level distributed control policy executes the assigned plan. We\nsystematically study various prompting techniques to improve LLM's performance\nin resolving deadlocks. In particular, as part of prompt engineering, we\nprovide in-context examples for LLMs. We conducted extensive experiments on\nvarious multi-robot environments with up to 15 agents and 40 obstacles. Our\nresults demonstrate that LLM-based high-level planners are effective in\nresolving deadlocks in MRS.\n","authors":["Kunal Garg","Jacob Arkin","Songyuan Zhang","Nicholas Roy","Chuchu Fan"],"pdf_url":"https://arxiv.org/pdf/2404.06413v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10472v3","updated":"2024-04-09T15:02:30Z","published":"2023-06-18T04:26:12Z","title":"Towards Large-Scale Incremental Dense Mapping using Robot-centric\n Implicit Neural Representation","summary":" Large-scale dense mapping is vital in robotics, digital twins, and virtual\nreality. Recently, implicit neural mapping has shown remarkable reconstruction\nquality. However, incremental large-scale mapping with implicit neural\nrepresentations remains problematic due to low efficiency, limited video\nmemory, and the catastrophic forgetting phenomenon. To counter these\nchallenges, we introduce the Robot-centric Implicit Mapping (RIM) technique for\nlarge-scale incremental dense mapping. This method employs a hybrid\nrepresentation, encoding shapes with implicit features via a multi-resolution\nvoxel map and decoding signed distance fields through a shallow MLP. We\nadvocate for a robot-centric local map to boost model training efficiency and\ncurb the catastrophic forgetting issue. A decoupled scalable global map is\nfurther developed to archive learned features for reuse and maintain constant\nvideo memory consumption. Validation experiments demonstrate our method's\nexceptional quality, efficiency, and adaptability across diverse scales and\nscenes over advanced dense mapping methods using range sensors. 
Our system's\ncode will be accessible at https://github.com/HITSZ-NRSL/RIM.git.\n","authors":["Jianheng Liu","Haoyao Chen"],"pdf_url":"https://arxiv.org/pdf/2306.10472v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06356v1","updated":"2024-04-09T14:46:48Z","published":"2024-04-09T14:46:48Z","title":"Policy-Guided Diffusion","summary":" In many real-world settings, agents must learn from an offline dataset\ngathered by some prior behavior policy. Such a setting naturally leads to\ndistribution shift between the behavior policy and the target policy being\ntrained - requiring policy conservatism to avoid instability and overestimation\nbias. Autoregressive world models offer a different solution to this by\ngenerating synthetic, on-policy experience. However, in practice, model\nrollouts must be severely truncated to avoid compounding error. As an\nalternative, we propose policy-guided diffusion. Our method uses diffusion\nmodels to generate entire trajectories under the behavior distribution,\napplying guidance from the target policy to move synthetic experience further\non-policy. We show that policy-guided diffusion models a regularized form of\nthe target distribution that balances action likelihood under both the target\nand behavior policies, leading to plausible trajectories with high target\npolicy probability, while retaining a lower dynamics error than an offline\nworld model baseline. Using synthetic experience from policy-guided diffusion\nas a drop-in substitute for real data, we demonstrate significant improvements\nin performance across a range of standard offline reinforcement learning\nalgorithms and environments. Our approach provides an effective alternative to\nautoregressive offline world models, opening the door to the controllable\ngeneration of synthetic training data.\n","authors":["Matthew Thomas Jackson","Michael Tryfan Matthews","Cong Lu","Benjamin Ellis","Shimon Whiteson","Jakob Foerster"],"pdf_url":"https://arxiv.org/pdf/2404.06356v1.pdf","comment":"Previously at the NeurIPS 2023 Workshop on Robot Learning"},{"id":"http://arxiv.org/abs/2404.06352v1","updated":"2024-04-09T14:43:19Z","published":"2024-04-09T14:43:19Z","title":"DaF-BEVSeg: Distortion-aware Fisheye Camera based Bird's Eye View\n Segmentation with Occlusion Reasoning","summary":" Semantic segmentation is an effective way to perform scene understanding.\nRecently, segmentation in 3D Bird's Eye View (BEV) space has become popular as\nits directly used by drive policy. However, there is limited work on BEV\nsegmentation for surround-view fisheye cameras, commonly used in commercial\nvehicles. As this task has no real-world public dataset and existing synthetic\ndatasets do not handle amodal regions due to occlusion, we create a synthetic\ndataset using the Cognata simulator comprising diverse road types, weather, and\nlighting conditions. We generalize the BEV segmentation to work with any camera\nmodel; this is useful for mixing diverse cameras. We implement a baseline by\napplying cylindrical rectification on the fisheye images and using a standard\nLSS-based BEV segmentation model. We demonstrate that we can achieve better\nperformance without undistortion, which has the adverse effects of increased\nruntime due to pre-processing, reduced field-of-view, and resampling artifacts.\nFurther, we introduce a distortion-aware learnable BEV pooling strategy that is\nmore effective for the fisheye cameras. 
We extend the model with an occlusion\nreasoning module, which is critical for estimating in BEV space. Qualitative\nperformance of DaF-BEVSeg is showcased in the video at\nhttps://streamable.com/ge4v51.\n","authors":["Senthil Yogamani","David Unger","Venkatraman Narayanan","Varun Ravi Kumar"],"pdf_url":"https://arxiv.org/pdf/2404.06352v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06345v1","updated":"2024-04-09T14:33:16Z","published":"2024-04-09T14:33:16Z","title":"AgentsCoDriver: Large Language Model Empowered Collaborative Driving\n with Lifelong Learning","summary":" Connected and autonomous driving is developing rapidly in recent years.\nHowever, current autonomous driving systems, which are primarily based on\ndata-driven approaches, exhibit deficiencies in interpretability,\ngeneralization, and continuing learning capabilities. In addition, the\nsingle-vehicle autonomous driving systems lack of the ability of collaboration\nand negotiation with other vehicles, which is crucial for the safety and\nefficiency of autonomous driving systems. In order to address these issues, we\nleverage large language models (LLMs) to develop a novel framework,\nAgentsCoDriver, to enable multiple vehicles to conduct collaborative driving.\nAgentsCoDriver consists of five modules: observation module, reasoning engine,\ncognitive memory module, reinforcement reflection module, and communication\nmodule. It can accumulate knowledge, lessons, and experiences over time by\ncontinuously interacting with the environment, thereby making itself capable of\nlifelong learning. In addition, by leveraging the communication module,\ndifferent agents can exchange information and realize negotiation and\ncollaboration in complex traffic environments. Extensive experiments are\nconducted and show the superiority of AgentsCoDriver.\n","authors":["Senkang Hu","Zhengru Fang","Zihan Fang","Xianhao Chen","Yuguang Fang"],"pdf_url":"https://arxiv.org/pdf/2404.06345v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06340v1","updated":"2024-04-09T14:25:31Z","published":"2024-04-09T14:25:31Z","title":"Experimental System Design of an Active Fault-Tolerant Quadrotor","summary":" Quadrotors have gained popularity over the last decade, aiding humans in\ncomplex tasks such as search and rescue, mapping and exploration. Despite their\nmechanical simplicity and versatility compared to other types of aerial\nvehicles, they remain vulnerable to rotor failures. In this paper, we propose\nan algorithmic and mechanical approach to addressing the quadrotor\nfault-tolerant problem in case of rotor failures. First, we present a\nfault-tolerant detection and control scheme that includes various attitude\nerror metrics. The scheme transitions to a fault-tolerant control mode by\nsurrendering the yaw control. Subsequently, to ensure compatibility with\nplatform sensing constraints, we investigate the relationship between\nvariations in robot rotational drag, achieved through a modular mechanical\ndesign appendage, resulting in yaw rates within sensor limits. This analysis\noffers a platform-agnostic framework for designing more reliable and robust\nquadrotors in the event of rotor failures. Extensive experimental results\nvalidate the proposed approach providing insights into successfully designing a\ncost-effective quadrotor capable of fault-tolerant control. 
The overall design\nenhances safety in scenarios of faulty rotors, without the need for additional\nsensors or computational resources.\n","authors":["Jennifer Yeom","Roshan Balu T M B","Guanrui Li","Giuseppe Loianno"],"pdf_url":"https://arxiv.org/pdf/2404.06340v1.pdf","comment":"Accepted to ICUAS 2024"},{"id":"http://arxiv.org/abs/2308.11373v2","updated":"2024-04-09T14:01:16Z","published":"2023-08-22T11:56:15Z","title":"Fast and Adaptive Multi-agent Planning under Collaborative Temporal\n Logic Tasks via Poset Products","summary":" Efficient coordination and planning is essential for large-scale multi-agent\nsystems that collaborate in a shared dynamic environment. Heuristic search\nmethods or learning-based approaches often lack the guarantee on correctness\nand performance. Moreover, when the collaborative tasks contain both spatial\nand temporal requirements, e.g., as Linear Temporal Logic (LTL) formulas,\nformal methods provide a verifiable framework for task planning. However, since\nthe planning complexity grows exponentially with the number of agents and the\nlength of the task formula, existing studies are mostly limited to small\nartificial cases. To address this issue, a new planning paradigm is proposed in\nthis work for system-wide temporal task formulas that are released online and\ncontinually. It avoids two common bottlenecks in the traditional methods, i.e.,\n(i) the direct translation of the complete task formula to the associated\nB\\\"uchi automaton; and (ii) the synchronized product between the B\\\"uchi\nautomaton and the transition models of all agents. Instead, an adaptive\nplanning algorithm is proposed that computes the product of relaxed\npartially-ordered sets (R-posets) on-the-fly, and assigns these subtasks to the\nagents subject to the ordering constraints. It is shown that the first valid\nplan can be derived with a polynomial time and memory complexity w.r.t. the\nsystem size and the formula length. Our method can take into account task\nformulas with a length of more than 400 and a fleet with more than $400$\nagents, while most existing methods fail at the formula length of 25 within a\nreasonable duration. The proposed method is validated on large fleets of\nservice robots in both simulation and hardware experiments.\n","authors":["Zesen Liu","Meng Guo","Weimin Bao","Zhongkui Li"],"pdf_url":"https://arxiv.org/pdf/2308.11373v2.pdf","comment":"16 pages, 9 figures"},{"id":"http://arxiv.org/abs/2309.14265v2","updated":"2024-04-09T13:33:30Z","published":"2023-09-25T16:23:49Z","title":"Industrial Application of 6D Pose Estimation for Robotic Manipulation in\n Automotive Internal Logistics","summary":" Despite the advances in robotics a large proportion of the of parts handling\ntasks in the automotive industry's internal logistics are not automated but\nstill performed by humans. A key component to competitively automate these\nprocesses is a 6D pose estimation that can handle a large number of different\nparts, is adaptable to new parts with little manual effort, and is sufficiently\naccurate and robust with respect to industry requirements. In this context, the\nquestion arises as to the current status quo with respect to these measures. To\naddress this we built a representative 6D pose estimation pipeline with\nstate-of-the-art components from economically scalable real to synthetic data\ngeneration to pose estimators and evaluated it on automotive parts with regards\nto a realistic sequencing process. 
We found that using the data generation\napproaches, the performance of the trained 6D pose estimators are promising,\nbut do not meet industry requirements. We reveal that the reason for this is\nthe inability of the estimators to provide reliable uncertainties for their\nposes, rather than the ability of to provide sufficiently accurate poses. In\nthis context we further analyzed how RGB- and RGB-D-based approaches compare\nagainst this background and show that they are differently vulnerable to the\ndomain gap induced by synthetic data.\n","authors":["Philipp Quentin","Dino Knoll","Daniel Goehring"],"pdf_url":"https://arxiv.org/pdf/2309.14265v2.pdf","comment":"Accepted for publication at IEEE International Conference on\n Automation Science and Engineering (CASE 2023)"},{"id":"http://arxiv.org/abs/2404.06288v1","updated":"2024-04-09T13:15:23Z","published":"2024-04-09T13:15:23Z","title":"Statistical Modelling of Driving Scenarios in Road Traffic using Fleet\n Data of Production Vehicles","summary":" Ensuring the safety of road vehicles at an acceptable level requires the\nabsence of any unreasonable risk arising from all potential hazards linked to\nthe intended au-tomated driving function and its implementation. The assurance\nthat there are no unreasonable risks stemming from hazardous behaviours\nassociated to functional insufficiencies is denoted as safety of intended\nfunctionality (SOTIF), a concept outlined in the ISO 21448 standard. In this\ncontext, the acquisition of real driving data is considered essential for the\nverification and validation. For this purpose, we are currently developing a\nmethod with which data collect-ed representatively from production vehicles can\nbe modelled into a knowledge-based system in the future. A system that\nrepresents the probabilities of occur-rence of concrete driving scenarios over\nthe statistical population of road traffic and makes them usable. The method\nincludes the qualitative and quantitative ab-straction of the drives recorded\nby the sensors in the vehicles, the possibility of subsequent wireless\ntransmission of the abstracted data from the vehicles and the derivation of the\ndistributions and correlations of scenario parameters. This paper provides a\nsummary of the research project and outlines its central idea. To this end,\namong other things, the needs for statistical information and da-ta from road\ntraffic are elaborated from ISO 21448, the current state of research is\naddressed, and methodical aspects are discussed.\n","authors":["Christian Reichenbächer","Jochen Hipp","Oliver Bringmann"],"pdf_url":"https://arxiv.org/pdf/2404.06288v1.pdf","comment":"12 pages, 4 figures, the article has been accepted for publication\n and presentation during the 9th International ATZ Conference on Automated\n Driving 2024"},{"id":"http://arxiv.org/abs/2403.18236v3","updated":"2024-04-09T12:53:00Z","published":"2024-03-27T03:53:30Z","title":"Multi-AGV Path Planning Method via Reinforcement Learning and Particle\n Filters","summary":" The Reinforcement Learning (RL) algorithm, renowned for its robust learning\ncapability and search stability, has garnered significant attention and found\nextensive application in Automated Guided Vehicle (AGV) path planning. However,\nRL planning algorithms encounter challenges stemming from the substantial\nvariance of neural networks caused by environmental instability and significant\nfluctuations in system structure. These challenges manifest in slow convergence\nspeed and low learning efficiency. 
To tackle this issue, this paper presents\nthe Particle Filter-Double Deep Q-Network (PF-DDQN) approach, which\nincorporates the Particle Filter (PF) into multi-AGV reinforcement learning\npath planning. The PF-DDQN method leverages the imprecise weight values of the\nnetwork as state values to formulate the state space equation. Through the\niterative fusion process of neural networks and particle filters, the DDQN\nmodel is optimized to acquire the optimal true weight values, thus enhancing\nthe algorithm's efficiency. The proposed method's effectiveness and superiority\nare validated through numerical simulations. Overall, the simulation results\ndemonstrate that the proposed algorithm surpasses the traditional DDQN\nalgorithm in terms of path planning superiority and training time indicators by\n92.62% and 76.88%, respectively. In conclusion, the PF-DDQN method addresses\nthe challenges encountered by RL planning algorithms in AGV path planning. By\nintegrating the Particle Filter and optimizing the DDQN model, the proposed\nmethod achieves enhanced efficiency and outperforms the traditional DDQN\nalgorithm in terms of path planning superiority and training time indicators.\n","authors":["Shao Shuo"],"pdf_url":"https://arxiv.org/pdf/2403.18236v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06261v1","updated":"2024-04-09T12:34:28Z","published":"2024-04-09T12:34:28Z","title":"Playing to Vision Foundation Model's Strengths in Stereo Matching","summary":" Stereo matching has become a key technique for 3D environment perception in\nintelligent vehicles. For a considerable time, convolutional neural networks\n(CNNs) have remained the mainstream choice for feature extraction in this\ndomain. Nonetheless, there is a growing consensus that the existing paradigm\nshould evolve towards vision foundation models (VFM), particularly those\ndeveloped based on vision Transformers (ViTs) and pre-trained through\nself-supervision on extensive, unlabeled datasets. While VFMs are adept at\nextracting informative, general-purpose visual features, specifically for dense\nprediction tasks, their performance often lacks in geometric vision tasks. This\nstudy serves as the first exploration of a viable approach for adapting VFMs to\nstereo matching. Our ViT adapter, referred to as ViTAS, is constructed upon\nthree types of modules: spatial differentiation, patch attention fusion, and\ncross-attention. The first module initializes feature pyramids, while the\nlatter two aggregate stereo and multi-scale contextual information into\nfine-grained features, respectively. ViTAStereo, which combines ViTAS with cost\nvolume-based stereo matching back-end processes, achieves the top rank on the\nKITTI Stereo 2012 dataset and outperforms the second-best network StereoBase by\napproximately 7.9% in terms of the percentage of error pixels, with a tolerance\nof 3 pixels. Additional experiments across diverse scenarios further\ndemonstrate its superior generalizability compared to all other\nstate-of-the-art approaches. 
We believe this new paradigm will pave the way for\nthe next generation of stereo matching networks.\n","authors":["Chuang-Wei Liu","Qijun Chen","Rui Fan"],"pdf_url":"https://arxiv.org/pdf/2404.06261v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06256v1","updated":"2024-04-09T12:29:16Z","published":"2024-04-09T12:29:16Z","title":"Label-Efficient 3D Object Detection For Road-Side Units","summary":" Occlusion presents a significant challenge for safety-critical applications\nsuch as autonomous driving. Collaborative perception has recently attracted a\nlarge research interest thanks to the ability to enhance the perception of\nautonomous vehicles via deep information fusion with intelligent roadside units\n(RSU), thus minimizing the impact of occlusion. While significant advancement\nhas been made, the data-hungry nature of these methods creates a major hurdle\nfor their real-world deployment, particularly due to the need for annotated RSU\ndata. Manually annotating the vast amount of RSU data required for training is\nprohibitively expensive, given the sheer number of intersections and the effort\ninvolved in annotating point clouds. We address this challenge by devising a\nlabel-efficient object detection method for RSU based on unsupervised object\ndiscovery. Our paper introduces two new modules: one for object discovery based\non a spatial-temporal aggregation of point clouds, and another for refinement.\nFurthermore, we demonstrate that fine-tuning on a small portion of annotated\ndata allows our object discovery models to narrow the performance gap with, or\neven surpass, fully supervised models. Extensive experiments are carried out in\nsimulated and real-world datasets to evaluate our method.\n","authors":["Minh-Quan Dao","Holger Caesar","Julie Stephany Berrio","Mao Shan","Stewart Worrall","Vincent Frémont","Ezio Malis"],"pdf_url":"https://arxiv.org/pdf/2404.06256v1.pdf","comment":"IV 2024"},{"id":"http://arxiv.org/abs/2404.06229v1","updated":"2024-04-09T11:40:37Z","published":"2024-04-09T11:40:37Z","title":"Towards Autonomous Driving with Small-Scale Cars: A Survey of Recent\n Development","summary":" While engaging with the unfolding revolution in autonomous driving, a\nchallenge presents itself, how can we effectively raise awareness within\nsociety about this transformative trend? While full-scale autonomous driving\nvehicles often come with a hefty price tag, the emergence of small-scale car\nplatforms offers a compelling alternative. These platforms not only serve as\nvaluable educational tools for the broader public and young generations but\nalso function as robust research platforms, contributing significantly to the\nongoing advancements in autonomous driving technology. This survey outlines\nvarious small-scale car platforms, categorizing them and detailing the research\nadvancements accomplished through their usage. The conclusion provides\nproposals for promising future directions in the field.\n","authors":["Dianzhao Li","Paul Auerbach","Ostap Okhrin"],"pdf_url":"https://arxiv.org/pdf/2404.06229v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06179v1","updated":"2024-04-09T10:03:35Z","published":"2024-04-09T10:03:35Z","title":"AI-MOLE: Autonomous Iterative Motion Learning for Unknown Nonlinear\n Dynamics with Extensive Experimental Validation","summary":" This work proposes Autonomous Iterative Motion Learning (AI-MOLE), a method\nthat enables systems with unknown, nonlinear dynamics to autonomously learn to\nsolve reference tracking tasks. 
The method iteratively applies an input\ntrajectory to the unknown dynamics, trains a Gaussian process model based on\nthe experimental data, and utilizes the model to update the input trajectory\nuntil desired tracking performance is achieved. Unlike existing approaches, the\nproposed method determines necessary parameters automatically, i.e., AI-MOLE\nworks plug-and-play and without manual parameter tuning. Furthermore, AI-MOLE\nonly requires input/output information, but can also exploit available state\ninformation to accelerate learning.\n While other approaches are typically only validated in simulation or on a\nsingle real-world testbed using manually tuned parameters, we present the\nunprecedented result of validating the proposed method on three different\nreal-world robots and a total of nine different reference tracking tasks\nwithout requiring any a priori model information or manual parameter tuning.\nOver all systems and tasks, AI-MOLE rapidly learns to track the references\nwithout requiring any manual parameter tuning at all, even if only input/output\ninformation is available.\n","authors":["Michael Meindl","Simon Bachhuber","Thomas Seel"],"pdf_url":"https://arxiv.org/pdf/2404.06179v1.pdf","comment":"9 pages, 6 figures, journal article"},{"id":"http://arxiv.org/abs/2404.06178v1","updated":"2024-04-09T09:58:59Z","published":"2024-04-09T09:58:59Z","title":"Resilient Movement Planning for Continuum Robots","summary":" The paper presents an experimental study of resilient path planning for\ncon-tinuum robots taking into account the multi-objective optimisation problem.\nTo do this, we used two well-known algorithms, namely Genetic algorithm and A*\nalgorithm, for path planning and the Analytical Hierarchy Process al-gorithm\nfor paths evaluation. In our experiment Analytical Hierarchy Process algorithm\nconsiders four different criteria, i.e. distance, motors damage, me-chanical\ndamage and accuracy each considered to contribute to the resilience of a\ncontinuum robot. The use of different criteria is necessary to increasing the\ntime to maintenance operations of the robot. The experiment shows that on the\none hand both algorithms can be used in combination with Analytical Hierarchy\nProcess algorithm for multi criteria path-planning, while Genetic algorithm\nshows superior performance in the comparison of the two algo-rithms.\n","authors":["Oxana Shamilyan","Ievgen Kabin","Zoya Dyka","Peter Langendoerfer"],"pdf_url":"https://arxiv.org/pdf/2404.06178v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06171v1","updated":"2024-04-09T09:50:02Z","published":"2024-04-09T09:50:02Z","title":"Intelligence and Motion Models of Continuum Robots: an Overview","summary":" Many technical solutions are bio-inspired. Octopus-inspired robotic arms\nbelong to continuum robots which are used in minimally invasive surgery or for\ntechnical system restoration in areas difficult-toaccess. Continuum robot\nmissions are bounded with their motions, whereby the motion of the robots is\ncontrolled by humans via wireless communication. In case of a lost connection,\nrobot autonomy is required. Distributed control and distributed decision-making\nmechanisms based on artificial intelligence approaches can be a promising\nsolution to achieve autonomy of technical systems and to increase their\nresilience. However these methods are not well investigated yet. 
Octopuses are\nthe living example of natural distributed intelligence but their learning and\ndecision-making mechanisms are also not fully investigated and understood yet.\nOur major interest is investigating mechanisms of Distributed Artificial\nIntelligence as a basis for improving resilience of complex systems. We decided\nto use a physical continuum robot prototype that is able to perform some basic\nmovements for our research. The idea is to research how a technical system can\nbe empowered to combine movements into sequences of motions by itself. For the\nexperimental investigations a suitable physical prototype has to be selected,\nits motion control has to be implemented and automated. In this paper, we give\nan overview combining different fields of research, such as Distributed\nArtificial Intelligence and continuum robots based on 98 publications. We\nprovide a detailed description of the basic motion control models of continuum\nrobots based on the literature reviewed, discuss different aspects of autonomy\nand give an overview of physical prototypes of continuum robots.\n","authors":["Oxana Shamilyan","Ievgen Kabin","Zoya Dyka","Oleksandr Sudakov","Andrii Cherninskyi","Marcin Brzozowski","Peter Langendoerfer"],"pdf_url":"https://arxiv.org/pdf/2404.06171v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06159v1","updated":"2024-04-09T09:32:00Z","published":"2024-04-09T09:32:00Z","title":"Distributed Artificial Intelligence as a Means to Achieve\n Self-X-Functions for Increasing Resilience: the First Steps","summary":" Using sensors as a means to achieve self-awareness and artificial\nintelligence for decision-making, may be a way to make complex systems\nself-adaptive, autonomous and resilient. Investigating the combination of\ndistributed artificial intelligence methods and bio-inspired robotics can\nprovide results that will be helpful for implementing autonomy of such robots\nand other complex systems. In this paper, we describe Distributed Artificial\nIntelligence application area, the most common examples of continuum robots and\nprovide a description of our first steps towards implementing distributed\ncontrol.\n","authors":["Oxana Shamilyan","Ievgen Kabin","Zoya Dyka","Peter Langendoerfer"],"pdf_url":"https://arxiv.org/pdf/2404.06159v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06155v1","updated":"2024-04-09T09:28:05Z","published":"2024-04-09T09:28:05Z","title":"Efficient and Robust Point Cloud Registration via Heuristics-guided\n Parameter Search","summary":" Estimating the rigid transformation with 6 degrees of freedom based on a\nputative 3D correspondence set is a crucial procedure in point cloud\nregistration. Existing correspondence identification methods usually lead to\nlarge outlier ratios ($>$ 95 $\\%$ is common), underscoring the significance of\nrobust registration methods. Many researchers turn to parameter search-based\nstrategies (e.g., Branch-and-Bround) for robust registration. Although related\nmethods show high robustness, their efficiency is limited to the\nhigh-dimensional search space. This paper proposes a heuristics-guided\nparameter search strategy to accelerate the search while maintaining high\nrobustness. We first sample some correspondences (i.e., heuristics) and then\njust need to sequentially search the feasible regions that make each sample an\ninlier. 
Our strategy largely reduces the search space and can guarantee\naccuracy with only a few inlier samples, therefore enjoying an excellent\ntrade-off between efficiency and robustness. Since directly parameterizing the\n6-dimensional nonlinear feasible region for efficient search is intractable, we\nconstruct a three-stage decomposition pipeline to reparameterize the feasible\nregion, resulting in three lower-dimensional sub-problems that are easily\nsolvable via our strategy. Besides reducing the searching dimension, our\ndecomposition enables the leverage of 1-dimensional interval stabbing at all\nthree stages for searching acceleration. Moreover, we propose a valid sampling\nstrategy to guarantee our sampling effectiveness, and a compatibility\nverification setup to further accelerate our search. Extensive experiments on\nboth simulated and real-world datasets demonstrate that our approach exhibits\ncomparable robustness with state-of-the-art methods while achieving a\nsignificant efficiency boost.\n","authors":["Tianyu Huang","Haoang Li","Liangzu Peng","Yinlong Liu","Yun-Hui Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06155v1.pdf","comment":"21 pages, 16 figures. Accepted to IEEE Transactions on Pattern\n Analysis and Machine Intelligence, 2024"},{"id":"http://arxiv.org/abs/2404.00915v2","updated":"2024-04-09T09:16:29Z","published":"2024-04-01T04:43:39Z","title":"Scalable 3D Registration via Truncated Entry-wise Absolute Residuals","summary":" Given an input set of $3$D point pairs, the goal of outlier-robust $3$D\nregistration is to compute some rotation and translation that align as many\npoint pairs as possible. This is an important problem in computer vision, for\nwhich many highly accurate approaches have been recently proposed. Despite\ntheir impressive performance, these approaches lack scalability, often\noverflowing the $16$GB of memory of a standard laptop to handle roughly\n$30,000$ point pairs. In this paper, we propose a $3$D registration approach\nthat can process more than ten million ($10^7$) point pairs with over $99\\%$\nrandom outliers. Moreover, our method is efficient, entails low memory costs,\nand maintains high accuracy at the same time. We call our method TEAR, as it\ninvolves minimizing an outlier-robust loss that computes Truncated Entry-wise\nAbsolute Residuals. To minimize this loss, we decompose the original\n$6$-dimensional problem into two subproblems of dimensions $3$ and $2$,\nrespectively, solved in succession to global optimality via a customized\nbranch-and-bound method. While branch-and-bound is often slow and unscalable,\nthis does not apply to TEAR as we propose novel bounding functions that are\ntight and computationally efficient. Experiments on various datasets are\nconducted to validate the scalability and efficiency of our method.\n","authors":["Tianyu Huang","Liangzu Peng","René Vidal","Yun-Hui Liu"],"pdf_url":"https://arxiv.org/pdf/2404.00915v2.pdf","comment":"24 pages, 12 figures. Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.06129v1","updated":"2024-04-09T08:56:43Z","published":"2024-04-09T08:56:43Z","title":"Adaptable Recovery Behaviors in Robotics: A Behavior Trees and Motion\n Generators(BTMG) Approach for Failure Management","summary":" In dynamic operational environments, particularly in collaborative robotics,\nthe inevitability of failures necessitates robust and adaptable recovery\nstrategies. 
Traditional automated recovery strategies, while effective for\npredefined scenarios, often lack the flexibility required for on-the-fly task\nmanagement and adaptation to expected failures. Addressing this gap, we propose\na novel approach that models recovery behaviors as adaptable robotic skills,\nleveraging the Behavior Trees and Motion Generators~(BTMG) framework for policy\nrepresentation. This approach distinguishes itself by employing reinforcement\nlearning~(RL) to dynamically refine recovery behavior parameters, enabling a\ntailored response to a wide array of failure scenarios with minimal human\nintervention. We assess our methodology through a series of progressively\nchallenging scenarios within a peg-in-a-hole task, demonstrating the approach's\neffectiveness in enhancing operational efficiency and task success rates in\ncollaborative robotics settings. We validate our approach using a dual-arm KUKA\nrobot.\n","authors":["Faseeh Ahmad","Matthias Mayr","Sulthan Suresh-Fazeela","Volker Kreuger"],"pdf_url":"https://arxiv.org/pdf/2404.06129v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06124v1","updated":"2024-04-09T08:49:01Z","published":"2024-04-09T08:49:01Z","title":"Hierarchical Insights: Exploiting Structural Similarities for Reliable\n 3D Semantic Segmentation","summary":" Safety-critical applications like autonomous driving call for robust 3D\nenvironment perception algorithms which can withstand highly diverse and\nambiguous surroundings. The predictive performance of any classification model\nstrongly depends on the underlying dataset and the prior knowledge conveyed by\nthe annotated labels. While the labels provide a basis for the learning\nprocess, they usually fail to represent inherent relations between the classes\n- representations, which are a natural element of the human perception system.\nWe propose a training strategy which enables a 3D LiDAR semantic segmentation\nmodel to learn structural relationships between the different classes through\nabstraction. We achieve this by implicitly modeling those relationships through\na learning rule for hierarchical multi-label classification (HMC). With a\ndetailed analysis we show, how this training strategy not only improves the\nmodel's confidence calibration, but also preserves additional information for\ndownstream tasks like fusion, prediction and planning.\n","authors":["Mariella Dreissig","Florian Piewak","Joschka Boedecker"],"pdf_url":"https://arxiv.org/pdf/2404.06124v1.pdf","comment":"submitted to IROS 2024"},{"id":"http://arxiv.org/abs/2403.13358v2","updated":"2024-04-09T07:55:41Z","published":"2024-03-20T07:36:43Z","title":"GeRM: A Generalist Robotic Model with Mixture-of-experts for Quadruped\n Robot","summary":" Multi-task robot learning holds significant importance in tackling diverse\nand complex scenarios. However, current approaches are hindered by performance\nissues and difficulties in collecting training datasets. In this paper, we\npropose GeRM (Generalist Robotic Model). We utilize offline reinforcement\nlearning to optimize data utilization strategies to learn from both\ndemonstrations and sub-optimal data, thus surpassing the limitations of human\ndemonstrations. Thereafter, we employ a transformer-based VLA network to\nprocess multi-modal inputs and output actions. 
By introducing the\nMixture-of-Experts structure, GeRM allows faster inference speed with higher\nwhole model capacity, and thus resolves the issue of limited RL parameters,\nenhancing model performance in multi-task learning while controlling\ncomputational costs. Through a series of experiments, we demonstrate that GeRM\noutperforms other methods across all tasks, while also validating its\nefficiency in both training and inference processes. Additionally, we uncover\nits potential to acquire emergent skills. Additionally, we contribute the\nQUARD-Auto dataset, collected automatically to support our training approach\nand foster advancements in multi-task quadruped robot learning. This work\npresents a new paradigm for reducing the cost of collecting robot data and\ndriving progress in the multi-task learning community. You can reach our\nproject and video through the link: https://songwxuan.github.io/GeRM/ .\n","authors":["Wenxuan Song","Han Zhao","Pengxiang Ding","Can Cui","Shangke Lyu","Yaning Fan","Donglin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.13358v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06089v1","updated":"2024-04-09T07:48:49Z","published":"2024-04-09T07:48:49Z","title":"EVE: Enabling Anyone to Train Robot using Augmented Reality","summary":" The increasing affordability of robot hardware is accelerating the\nintegration of robots into everyday activities. However, training a robot to\nautomate a task typically requires physical robots and expensive demonstration\ndata from trained human annotators. Consequently, only those with access to\nphysical robots produce demonstrations to train robots. To mitigate this issue,\nwe introduce EVE, an iOS app that enables everyday users to train robots using\nintuitive augmented reality visualizations without needing a physical robot.\nWith EVE, users can collect demonstrations by specifying waypoints with their\nhands, visually inspecting the environment for obstacles, modifying existing\nwaypoints, and verifying collected trajectories. In a user study ($N=14$,\n$D=30$) consisting of three common tabletop tasks, EVE outperformed three\nstate-of-the-art interfaces in success rate and was comparable to kinesthetic\nteaching-physically moving a real robot-in completion time, usability, motion\nintent communication, enjoyment, and preference ($mean_{p}=0.30$). We conclude\nby enumerating limitations and design considerations for future AR-based\ndemonstration collection systems for robotics.\n","authors":["Jun Wang","Chun-Cheng Chang","Jiafei Duan","Dieter Fox","Ranjay Krishna"],"pdf_url":"https://arxiv.org/pdf/2404.06089v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2402.01370v3","updated":"2024-04-09T07:38:58Z","published":"2024-02-02T12:44:39Z","title":"CC-VPSTO: Chance-Constrained Via-Point-based Stochastic Trajectory\n Optimisation for Safe and Efficient Online Robot Motion Planning","summary":" Safety in the face of uncertainty is a key challenge in robotics. We\nintroduce a real-time capable framework to generate safe and task-efficient\nrobot motions for stochastic control problems. We frame this as a\nchance-constrained optimisation problem constraining the probability of the\ncontrolled system to violate a safety constraint to be below a set threshold.\nTo estimate this probability we propose a Monte--Carlo approximation. We\nsuggest several ways to construct the problem given a fixed number of\nuncertainty samples, such that it is a reliable over-approximation of the\noriginal problem, i.e. 
any solution to the sample-based problem adheres to the\noriginal chance-constraint with high confidence. To solve the resulting\nproblem, we integrate it into our motion planner VP-STO and name the enhanced\nframework Chance-Constrained (CC)-VPSTO. The strengths of our approach lie in\ni) its generality, without assumptions on the underlying uncertainty\ndistribution, system dynamics, cost function, or the form of inequality\nconstraints; and ii) its applicability to MPC-settings. We demonstrate the\nvalidity and efficiency of our approach on both simulation and real-world robot\nexperiments.\n","authors":["Lara Brudermüller","Guillaume Berger","Julius Jankowski","Raunak Bhattacharyya","Raphaël Jungers","Nick Hawes"],"pdf_url":"https://arxiv.org/pdf/2402.01370v3.pdf","comment":"17 pages, 11 figures, submitted to IEEE Transactions on Robotics"},{"id":"http://arxiv.org/abs/2404.06050v1","updated":"2024-04-09T06:27:35Z","published":"2024-04-09T06:27:35Z","title":"Incremental Joint Learning of Depth, Pose and Implicit Scene\n Representation on Monocular Camera in Large-scale Scenes","summary":" Dense scene reconstruction for photo-realistic view synthesis has various\napplications, such as VR/AR, autonomous vehicles. However, most existing\nmethods have difficulties in large-scale scenes due to three core challenges:\n\\textit{(a) inaccurate depth input.} Accurate depth input is impossible to get\nin real-world large-scale scenes. \\textit{(b) inaccurate pose estimation.} Most\nexisting approaches rely on accurate pre-estimated camera poses. \\textit{(c)\ninsufficient scene representation capability.} A single global radiance field\nlacks the capacity to effectively scale to large-scale scenes. To this end, we\npropose an incremental joint learning framework, which can achieve accurate\ndepth, pose estimation, and large-scale scene reconstruction. A vision\ntransformer-based network is adopted as the backbone to enhance performance in\nscale information estimation. For pose estimation, a feature-metric bundle\nadjustment (FBA) method is designed for accurate and robust camera tracking in\nlarge-scale scenes. In terms of implicit scene representation, we propose an\nincremental scene representation method to construct the entire large-scale\nscene as multiple local radiance fields to enhance the scalability of 3D scene\nrepresentation. Extended experiments have been conducted to demonstrate the\neffectiveness and accuracy of our method in depth estimation, pose estimation,\nand large-scale scene reconstruction.\n","authors":["Tianchen Deng","Nailin Wang","Chongdi Wang","Shenghai Yuan","Jingchuan Wang","Danwei Wang","Weidong Chen"],"pdf_url":"https://arxiv.org/pdf/2404.06050v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06012v1","updated":"2024-04-09T04:41:05Z","published":"2024-04-09T04:41:05Z","title":"Diffusion-Based Point Cloud Super-Resolution for mmWave Radar Data","summary":" The millimeter-wave radar sensor maintains stable performance under adverse\nenvironmental conditions, making it a promising solution for all-weather\nperception tasks, such as outdoor mobile robotics. However, the radar point\nclouds are relatively sparse and contain massive ghost points, which greatly\nlimits the development of mmWave radar technology. In this paper, we propose a\nnovel point cloud super-resolution approach for 3D mmWave radar data, named\nRadar-diffusion. Our approach employs the diffusion model defined by\nmean-reverting stochastic differential equations(SDE). 
Using our proposed new\nobjective function with supervision from corresponding LiDAR point clouds, our\napproach efficiently handles radar ghost points and enhances the sparse mmWave\nradar point clouds to dense LiDAR-like point clouds. We evaluate our approach\non two different datasets, and the experimental results show that our method\noutperforms the state-of-the-art baseline methods in 3D radar super-resolution\ntasks. Furthermore, we demonstrate that our enhanced radar point cloud is\ncapable of downstream radar point-based registration tasks.\n","authors":["Kai Luan","Chenghao Shi","Neng Wang","Yuwei Cheng","Huimin Lu","Xieyuanli Chen"],"pdf_url":"https://arxiv.org/pdf/2404.06012v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.05332v5","updated":"2024-04-09T04:38:33Z","published":"2023-12-08T19:33:22Z","title":"MPC-Inspired Reinforcement Learning for Verifiable Model-Free Control","summary":" In this paper, we introduce a new class of parameterized controllers, drawing\ninspiration from Model Predictive Control (MPC). The controller resembles a\nQuadratic Programming (QP) solver of a linear MPC problem, with the parameters\nof the controller being trained via Deep Reinforcement Learning (DRL) rather\nthan derived from system models. This approach addresses the limitations of\ncommon controllers with Multi-Layer Perceptron (MLP) or other general neural\nnetwork architecture used in DRL, in terms of verifiability and performance\nguarantees, and the learned controllers possess verifiable properties like\npersistent feasibility and asymptotic stability akin to MPC. On the other hand,\nnumerical examples illustrate that the proposed controller empirically matches\nMPC and MLP controllers in terms of control performance and has superior\nrobustness against modeling uncertainty and noises. Furthermore, the proposed\ncontroller is significantly more computationally efficient compared to MPC and\nrequires fewer parameters to learn than MLP controllers. Real-world experiments\non vehicle drift maneuvering task demonstrate the potential of these\ncontrollers for robotics and other demanding control tasks.\n","authors":["Yiwen Lu","Zishuo Li","Yihan Zhou","Na Li","Yilin Mo"],"pdf_url":"https://arxiv.org/pdf/2312.05332v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.06071v4","updated":"2024-04-09T04:16:15Z","published":"2022-02-12T14:04:44Z","title":"Deadlock Resolution and Recursive Feasibility in MPC-based Multi-robot\n Trajectory Generation","summary":" Online collision-free trajectory generation within a shared workspace is\nfundamental for most multi-robot applications. However, many widely-used\nmethods based on model predictive control (MPC) lack theoretical guarantees on\nthe feasibility of underlying optimization. Furthermore, when applied in a\ndistributed manner without a central coordinator, deadlocks often occur where\nseveral robots block each other indefinitely. Whereas heuristic methods such as\nintroducing random perturbations exist, no profound analyses are given to\nvalidate these measures. Towards this end, we propose a systematic method\ncalled infinite-horizon model predictive control with deadlock resolution. The\nMPC is formulated as a convex optimization over the proposed modified buffered\nVoronoi with warning band. Based on this formulation, the condition of\ndeadlocks is formally analyzed and proven to be analogous to a force\nequilibrium. 
A detection-resolution scheme is proposed, which can effectively\ndetect deadlocks online before they even happen. Once detected, it utilizes an\nadaptive resolution scheme to resolve deadlocks, under which no stable\ndeadlocks can exist under minor conditions. In addition, the proposed planning\nalgorithm ensures recursive feasibility of the underlying optimization at each\ntime step under both input and model constraints, is concurrent for all robots\nand requires only local communication. Comprehensive simulation and experiment\nstudies are conducted over large-scale multi-robot systems. Significant\nimprovements on success rate are reported, in comparison with other\nstate-of-the-art methods and especially in crowded and high-speed scenarios.\n","authors":["Yuda Chen","Meng Guo","Zhongkui Li"],"pdf_url":"https://arxiv.org/pdf/2202.06071v4.pdf","comment":"16 pages, 15 figures"},{"id":"http://arxiv.org/abs/2404.05953v1","updated":"2024-04-09T02:21:23Z","published":"2024-04-09T02:21:23Z","title":"3D Branch Point Cloud Completion for Robotic Pruning in Apple Orchards","summary":" Robotic branch pruning is a significantly growing research area to cope with\nthe shortage of labor force in the context of agriculture. One fundamental\nrequirement in robotic pruning is the perception of detailed geometry and\ntopology of branches. However, the point clouds obtained in agricultural\nsettings often exhibit incompleteness due to several constraints, thereby\nrestricting the accuracy of downstream robotic pruning. In this work, we\naddressed the issue of point cloud quality through a simulation-based deep\nneural network, leveraging a Real-to-Simulation (Real2Sim) data generation\npipeline that not only eliminates the need for manual parameterization but also\nguarantees the realism of simulated data. The simulation-based neural network\nwas applied to jointly perform point cloud completion and skeletonization on\nreal-world partial branches, without additional real-world training. The\nSim2Real qualitative completion and skeletonization results showed the model's\nremarkable capability for geometry reconstruction and topology prediction.\nAdditionally, we quantitatively evaluated the Sim2Real performance by comparing\nbranch-level trait characterization errors using raw incomplete data and\ncomplete data. The Mean Absolute Error (MAE) reduced by 75% and 8% for branch\ndiameter and branch angle estimation, respectively, using the best complete\ndata, which indicates the effectiveness of the Real2Sim data in a zero-shot\ngeneralization setting. The characterization improvements contributed to the\nprecision and efficacy of robotic branch pruning.\n","authors":["Tian Qiu","Alan Zoubi","Nikolai Spine","Lailiang Cheng","Yu Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.05953v1.pdf","comment":"Submitted to IROS2024"},{"id":"http://arxiv.org/abs/2404.05952v1","updated":"2024-04-09T02:17:19Z","published":"2024-04-09T02:17:19Z","title":"Robot Safe Planning In Dynamic Environments Based On Model Predictive\n Control Using Control Barrier Function","summary":" Implementing obstacle avoidance in dynamic environments is a challenging\nproblem for robots. Model predictive control (MPC) is a popular strategy for\ndealing with this type of problem, and recent work mainly uses control barrier\nfunction (CBF) as hard constraints to ensure that the system state remains in\nthe safe set. 
However, in crowded scenarios, effective solutions may not be\nobtained due to infeasibility problems, resulting in degraded controller\nperformance. We propose a new MPC framework that integrates CBF to tackle the\nissue of obstacle avoidance in dynamic environments, in which the infeasibility\nproblem induced by hard constraints operating over the whole prediction horizon\nis solved by softening the constraints and introducing exact penalty, prompting\nthe robot to actively seek out new paths. At the same time, generalized CBF is\nextended as a single-step safety constraint of the controller to enhance the\nsafety of the robot during navigation. The efficacy of the proposed method is\nfirst shown through simulation experiments, in which a double-integrator system\nand a unicycle system are employed, and the proposed method outperforms other\ncontrollers in terms of safety, feasibility, and navigation efficiency.\nFurthermore, real-world experiment on an MR1000 robot is implemented to\ndemonstrate the effectiveness of the proposed method.\n","authors":["Zetao Lu","Kaijun Feng","Jun Xu","Haoyao Chen","Yunjiang Lou"],"pdf_url":"https://arxiv.org/pdf/2404.05952v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05950v1","updated":"2024-04-09T02:11:35Z","published":"2024-04-09T02:11:35Z","title":"Efficient Multi-Task Reinforcement Learning via Task-Specific Action\n Correction","summary":" Multi-task reinforcement learning (MTRL) demonstrate potential for enhancing\nthe generalization of a robot, enabling it to perform multiple tasks\nconcurrently. However, the performance of MTRL may still be susceptible to\nconflicts between tasks and negative interference. To facilitate efficient\nMTRL, we propose Task-Specific Action Correction (TSAC), a general and\ncomplementary approach designed for simultaneous learning of multiple tasks.\nTSAC decomposes policy learning into two separate policies: a shared policy\n(SP) and an action correction policy (ACP). To alleviate conflicts resulting\nfrom excessive focus on specific tasks' details in SP, ACP incorporates\ngoal-oriented sparse rewards, enabling an agent to adopt a long-term\nperspective and achieve generalization across tasks. Additional rewards\ntransform the original problem into a multi-objective MTRL problem.\nFurthermore, to convert the multi-objective MTRL into a single-objective\nformulation, TSAC assigns a virtual expected budget to the sparse rewards and\nemploys Lagrangian method to transform a constrained single-objective\noptimization into an unconstrained one. Experimental evaluations conducted on\nMeta-World's MT10 and MT50 benchmarks demonstrate that TSAC outperforms\nexisting state-of-the-art methods, achieving significant improvements in both\nsample efficiency and effective action execution.\n","authors":["Jinyuan Feng","Min Chen","Zhiqiang Pu","Tenghai Qiu","Jianqiang Yi"],"pdf_url":"https://arxiv.org/pdf/2404.05950v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05932v1","updated":"2024-04-09T01:32:35Z","published":"2024-04-09T01:32:35Z","title":"Body Design and Gait Generation of Chair-Type Asymmetrical Tripedal\n Low-rigidity Robot","summary":" In this study, a chair-type asymmetric tripedal low-rigidity robot was\ndesigned based on the three-legged chair character in the movie \"Suzume\" and\nits gait was generated. Its body structure consists of three legs that are\nasymmetric to the body, so it cannot be easily balanced. 
In addition, the\nactuator is a servo motor that can only feed-forward rotational angle commands\nand the sensor can only sense the robot's posture quaternion. In such an\nasymmetric and imperfect body structure, we analyzed how gait is generated in\nwalking and stand-up motions by generating gaits with two different methods: a\nmethod using linear completion to connect the postures necessary for the gait\ndiscovered through trial and error using the actual robot, and a method using\nthe gait generated by reinforcement learning in the simulator and reflecting it\nto the actual robot. Both methods were able to generate gait that realized\nwalking and stand-up motions, and interesting gait patterns were observed,\nwhich differed depending on the method, and were confirmed on the actual robot.\nOur code and demonstration videos are available here:\nhttps://github.com/shin0805/Chair-TypeAsymmetricalTripedalRobot.git\n","authors":["Shintaro Inoue","Kento Kawaharazuka","Kei Okada","Masayuki Inaba"],"pdf_url":"https://arxiv.org/pdf/2404.05932v1.pdf","comment":"Accepted at RoboSoft2024, website -\n https://shin0805.github.io/chair-type-tripedal-robot/ , YouTube -\n https://youtu.be/-f8LDlhmdBg"},{"id":"http://arxiv.org/abs/2309.13475v3","updated":"2024-04-09T01:26:58Z","published":"2023-09-23T20:33:38Z","title":"Detecting and Mitigating System-Level Anomalies of Vision-Based\n Controllers","summary":" Autonomous systems, such as self-driving cars and drones, have made\nsignificant strides in recent years by leveraging visual inputs and machine\nlearning for decision-making and control. Despite their impressive performance,\nthese vision-based controllers can make erroneous predictions when faced with\nnovel or out-of-distribution inputs. Such errors can cascade to catastrophic\nsystem failures and compromise system safety. In this work, we introduce a\nrun-time anomaly monitor to detect and mitigate such closed-loop, system-level\nfailures. Specifically, we leverage a reachability-based framework to\nstress-test the vision-based controller offline and mine its system-level\nfailures. This data is then used to train a classifier that is leveraged online\nto flag inputs that might cause system breakdowns. The anomaly detector\nhighlights issues that transcend individual modules and pertain to the safety\nof the overall system. We also design a fallback controller that robustly\nhandles these detected anomalies to preserve system safety. We validate the\nproposed approach on an autonomous aircraft taxiing system that uses a\nvision-based controller for taxiing. Our results show the efficacy of the\nproposed approach in identifying and handling system-level anomalies,\noutperforming methods such as prediction error-based detection, and ensembling,\nthereby enhancing the overall safety and robustness of autonomous systems.\n","authors":["Aryaman Gupta","Kaustav Chakraborty","Somil Bansal"],"pdf_url":"https://arxiv.org/pdf/2309.13475v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.17540v2","updated":"2024-04-09T23:39:23Z","published":"2023-10-26T16:32:34Z","title":"EqDrive: Efficient Equivariant Motion Forecasting with Multi-Modality\n for Autonomous Driving","summary":" Forecasting vehicular motions in autonomous driving requires a deep\nunderstanding of agent interactions and the preservation of motion equivariance\nunder Euclidean geometric transformations. 
Traditional models often lack the\nsophistication needed to handle the intricate dynamics inherent to autonomous\nvehicles and the interaction relationships among agents in the scene. As a\nresult, these models have a lower model capacity, which then leads to higher\nprediction errors and lower training efficiency. In our research, we employ\nEqMotion, a leading equivariant particle, and human prediction model that also\naccounts for invariant agent interactions, for the task of multi-agent vehicle\nmotion forecasting. In addition, we use a multi-modal prediction mechanism to\naccount for multiple possible future paths in a probabilistic manner. By\nleveraging EqMotion, our model achieves state-of-the-art (SOTA) performance\nwith fewer parameters (1.2 million) and a significantly reduced training time\n(less than 2 hours).\n","authors":["Yuping Wang","Jier Chen"],"pdf_url":"https://arxiv.org/pdf/2310.17540v2.pdf","comment":"6 pages, 7 figures, Accepted 2024 International Conference on\n Robotics and Automation"},{"id":"http://arxiv.org/abs/2404.05139v2","updated":"2024-04-09T23:17:07Z","published":"2024-04-08T01:38:43Z","title":"Better Monocular 3D Detectors with LiDAR from the Past","summary":" Accurate 3D object detection is crucial to autonomous driving. Though\nLiDAR-based detectors have achieved impressive performance, the high cost of\nLiDAR sensors precludes their widespread adoption in affordable vehicles.\nCamera-based detectors are cheaper alternatives but often suffer inferior\nperformance compared to their LiDAR-based counterparts due to inherent depth\nambiguities in images. In this work, we seek to improve monocular 3D detectors\nby leveraging unlabeled historical LiDAR data. Specifically, at inference time,\nwe assume that the camera-based detectors have access to multiple unlabeled\nLiDAR scans from past traversals at locations of interest (potentially from\nother high-end vehicles equipped with LiDAR sensors). Under this setup, we\nproposed a novel, simple, and end-to-end trainable framework, termed\nAsyncDepth, to effectively extract relevant features from asynchronous LiDAR\ntraversals of the same location for monocular 3D detectors. We show consistent\nand significant performance gain (up to 9 AP) across multiple state-of-the-art\nmodels and datasets with a negligible additional latency of 9.66 ms and a small\nstorage cost.\n","authors":["Yurong You","Cheng Perng Phoo","Carlos Andres Diaz-Ruiz","Katie Z Luo","Wei-Lun Chao","Mark Campbell","Bharath Hariharan","Kilian Q Weinberger"],"pdf_url":"https://arxiv.org/pdf/2404.05139v2.pdf","comment":"Accepted by ICRA 2024. The code can be found at\n https://github.com/YurongYou/AsyncDepth"},{"id":"http://arxiv.org/abs/2404.03729v2","updated":"2024-04-09T22:53:57Z","published":"2024-04-04T18:00:15Z","title":"JUICER: Data-Efficient Imitation Learning for Robotic Assembly","summary":" While learning from demonstrations is powerful for acquiring visuomotor\npolicies, high-performance imitation without large demonstration datasets\nremains challenging for tasks requiring precise, long-horizon manipulation.\nThis paper proposes a pipeline for improving imitation learning performance\nwith a small human demonstration budget. We apply our approach to assembly\ntasks that require precisely grasping, reorienting, and inserting multiple\nparts over long horizons and multiple task phases. Our pipeline combines\nexpressive policy architectures and various techniques for dataset expansion\nand simulation-based data augmentation. 
These help expand dataset support and\nsupervise the model with locally corrective actions near bottleneck regions\nrequiring high precision. We demonstrate our pipeline on four furniture\nassembly tasks in simulation, enabling a manipulator to assemble up to five\nparts over nearly 2500 time steps directly from RGB images, outperforming\nimitation and data augmentation baselines. Project website:\nhttps://imitation-juicer.github.io/.\n","authors":["Lars Ankile","Anthony Simeonov","Idan Shenfeld","Pulkit Agrawal"],"pdf_url":"https://arxiv.org/pdf/2404.03729v2.pdf","comment":"Project website: https://imitation-juicer.github.io/"},{"id":"http://arxiv.org/abs/2404.06645v1","updated":"2024-04-09T22:47:25Z","published":"2024-04-09T22:47:25Z","title":"GenCHiP: Generating Robot Policy Code for High-Precision and\n Contact-Rich Manipulation Tasks","summary":" Large Language Models (LLMs) have been successful at generating robot policy\ncode, but so far these results have been limited to high-level tasks that do\nnot require precise movement. It is an open question how well such approaches\nwork for tasks that require reasoning over contact forces and working within\ntight success tolerances. We find that, with the right action space, LLMs are\ncapable of successfully generating policies for a variety of contact-rich and\nhigh-precision manipulation tasks, even under noisy conditions, such as\nperceptual errors or grasping inaccuracies. Specifically, we reparameterize the\naction space to include compliance with constraints on the interaction forces\nand stiffnesses involved in reaching a target pose. We validate this approach\non subtasks derived from the Functional Manipulation Benchmark (FMB) and NIST\nTask Board Benchmarks. Exposing this action space alongside methods for\nestimating object poses improves policy generation with an LLM by greater than\n3x and 4x when compared to non-compliant action spaces\n","authors":["Kaylee Burns","Ajinkya Jain","Keegan Go","Fei Xia","Michael Stark","Stefan Schaal","Karol Hausman"],"pdf_url":"https://arxiv.org/pdf/2404.06645v1.pdf","comment":"14 pages, 12 figures"},{"id":"http://arxiv.org/abs/2404.06631v1","updated":"2024-04-09T21:46:14Z","published":"2024-04-09T21:46:14Z","title":"Counting Objects in a Robotic Hand","summary":" A robot performing multi-object grasping needs to sense the number of objects\nin the hand after grasping. The count plays an important role in determining\nthe robot's next move and the outcome and efficiency of the whole pick-place\nprocess. This paper presents a data-driven contrastive learning-based counting\nclassifier with a modified loss function as a simple and effective approach for\nobject counting despite significant occlusion challenges caused by robotic\nfingers and objects. The model was validated against other models with three\ndifferent common shapes (spheres, cylinders, and cubes) in simulation and in a\nreal setup. The proposed contrastive learning-based counting approach achieved\nabove 96\\% accuracy for all three objects in the real setup.\n","authors":["Francis Tsow","Tianze Chen","Yu Sun"],"pdf_url":"https://arxiv.org/pdf/2404.06631v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.00748v4","updated":"2024-04-09T21:19:29Z","published":"2024-03-01T18:48:48Z","title":"Primal-Dual iLQR","summary":" We introduce a new algorithm for solving unconstrained discrete-time optimal\ncontrol problems. 
Our method follows a direct multiple shooting approach, and\nconsists of applying the SQP method together with an $\\ell_2$ augmented\nLagrangian primal-dual merit function. We use the LQR algorithm to efficiently\nsolve the primal-dual Newton-KKT system. As our algorithm is a specialization\nof NPSQP, it inherits its generic properties, including global convergence,\nfast local convergence, and the lack of need for second order corrections or\ndimension expansions, improving on existing direct multiple shooting approaches\nsuch as acados, ALTRO, GNMS, FATROP, and FDDP. As our algorithm avoids\nsequential rollouts of the nonlinear dynamics, it can be combined with\n(S\\\"arkk\\\"a and Garc\\'ia-Fern\\'andez, 2023) to run in $O(\\log(N))$ parallel\ntime per iteration (where $N$ is the number of stages), as well as $O(1)$\nparallel time per line search iteration. Therefore, this paper provides a\npractical, theoretically sound, and highly parallelizable (for example, with a\nGPU) method for solving nonlinear discrete-time optimal control problems.\n","authors":["João Sousa-Pinto","Dominique Orban"],"pdf_url":"https://arxiv.org/pdf/2403.00748v4.pdf","comment":"8 pages, 1 figure, 1 table"},{"id":"http://arxiv.org/abs/2404.06609v1","updated":"2024-04-09T20:40:00Z","published":"2024-04-09T20:40:00Z","title":"GOAT-Bench: A Benchmark for Multi-Modal Lifelong Navigation","summary":" The Embodied AI community has made significant strides in visual navigation\ntasks, exploring targets from 3D coordinates, objects, language descriptions,\nand images. However, these navigation models often handle only a single input\nmodality as the target. With the progress achieved so far, it is time to move\ntowards universal navigation models capable of handling various goal types,\nenabling more effective user interaction with robots. To facilitate this goal,\nwe propose GOAT-Bench, a benchmark for the universal navigation task referred\nto as GO to AnyThing (GOAT). In this task, the agent is directed to navigate to\na sequence of targets specified by the category name, language description, or\nimage in an open-vocabulary fashion. We benchmark monolithic RL and modular\nmethods on the GOAT task, analyzing their performance across modalities, the\nrole of explicit and implicit scene memories, their robustness to noise in goal\nspecifications, and the impact of memory in lifelong scenarios.\n","authors":["Mukul Khanna","Ram Ramrakhya","Gunjan Chhablani","Sriram Yenamandra","Theophile Gervet","Matthew Chang","Zsolt Kira","Devendra Singh Chaplot","Dhruv Batra","Roozbeh Mottaghi"],"pdf_url":"https://arxiv.org/pdf/2404.06609v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06570v1","updated":"2024-04-09T18:43:09Z","published":"2024-04-09T18:43:09Z","title":"MORPHeus: a Multimodal One-armed Robot-assisted Peeling System with\n Human Users In-the-loop","summary":" Meal preparation is an important instrumental activity of daily\nliving~(IADL). While existing research has explored robotic assistance in meal\npreparation tasks such as cutting and cooking, the crucial task of peeling has\nreceived less attention. Robot-assisted peeling, conventionally a bimanual\ntask, is challenging to deploy in the homes of care recipients using two\nwheelchair-mounted robot arms due to ergonomic and transferring challenges.\nThis paper introduces a robot-assisted peeling system utilizing a single\nrobotic arm and an assistive cutting board, inspired by the way individuals\nwith one functional hand prepare meals. 
Our system incorporates a multimodal\nactive perception module to determine whether an area on the food is peeled, a\nhuman-in-the-loop long-horizon planner to perform task planning while catering\nto a user's preference for peeling coverage, and a compliant controller to peel\nthe food items. We demonstrate the system on 12 food items representing the\nextremes of different shapes, sizes, skin thickness, surface textures, skin vs\nflesh colors, and deformability.\n","authors":["Ruolin Ye","Yifei Hu"," Yuhan"," Bian","Luke Kulm","Tapomayukh Bhattacharjee"],"pdf_url":"https://arxiv.org/pdf/2404.06570v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06561v1","updated":"2024-04-09T18:25:21Z","published":"2024-04-09T18:25:21Z","title":"Learning Strategies For Successful Crowd Navigation","summary":" Teaching autonomous mobile robots to successfully navigate human crowds is a\nchallenging task. Not only does it require planning, but it requires\nmaintaining social norms which may differ from one context to another. Here we\nfocus on crowd navigation, using a neural network to learn specific strategies\nin-situ with a robot. This allows us to take into account human behavior and\nreactions toward a real robot as well as learn strategies that are specific to\nvarious scenarios in that context. A CNN takes a top-down image of the scene as\ninput and outputs the next action for the robot to take in terms of speed and\nangle. Here we present the method, experimental results, and quantitatively\nevaluate our approach.\n","authors":["Rajshree Daulatabad","Serena Nath"],"pdf_url":"https://arxiv.org/pdf/2404.06561v1.pdf","comment":"8 pages"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.06512v1","updated":"2024-04-09T17:59:32Z","published":"2024-04-09T17:59:32Z","title":"InternLM-XComposer2-4KHD: A Pioneering Large Vision-Language Model\n Handling Resolutions from 336 Pixels to 4K HD","summary":" The Large Vision-Language Model (LVLM) field has seen significant\nadvancements, yet its progression has been hindered by challenges in\ncomprehending fine-grained visual content due to limited resolution. Recent\nefforts have aimed to enhance the high-resolution understanding capabilities of\nLVLMs, yet they remain capped at approximately 1500 x 1500 pixels and\nconstrained to a relatively narrow resolution range. This paper represents\nInternLM-XComposer2-4KHD, a groundbreaking exploration into elevating LVLM\nresolution capabilities up to 4K HD (3840 x 1600) and beyond. Concurrently,\nconsidering the ultra-high resolution may not be necessary in all scenarios, it\nsupports a wide range of diverse resolutions from 336 pixels to 4K standard,\nsignificantly broadening its scope of applicability. Specifically, this\nresearch advances the patch division paradigm by introducing a novel extension:\ndynamic resolution with automatic patch configuration. It maintains the\ntraining image aspect ratios while automatically varying patch counts and\nconfiguring layouts based on a pre-trained Vision Transformer (ViT) (336 x\n336), leading to dynamic training resolution from 336 pixels to 4K standard.\nOur research demonstrates that scaling training resolution up to 4K HD leads to\nconsistent performance enhancements without hitting the ceiling of potential\nimprovements. InternLM-XComposer2-4KHD shows superb capability that matches or\neven surpasses GPT-4V and Gemini Pro in 10 of the 16 benchmarks. 
The\nInternLM-XComposer2-4KHD model series with 7B parameters are publicly available\nat https://github.com/InternLM/InternLM-XComposer.\n","authors":["Xiaoyi Dong","Pan Zhang","Yuhang Zang","Yuhang Cao","Bin Wang","Linke Ouyang","Songyang Zhang","Haodong Duan","Wenwei Zhang","Yining Li","Hang Yan","Yang Gao","Zhe Chen","Xinyue Zhang","Wei Li","Jingwen Li","Wenhai Wang","Kai Chen","Conghui He","Xingcheng Zhang","Jifeng Dai","Yu Qiao","Dahua Lin","Jiaqi Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06512v1.pdf","comment":"Code and models are publicly available at\n https://github.com/InternLM/InternLM-XComposer"},{"id":"http://arxiv.org/abs/2404.06511v1","updated":"2024-04-09T17:59:31Z","published":"2024-04-09T17:59:31Z","title":"MoReVQA: Exploring Modular Reasoning Models for Video Question Answering","summary":" This paper addresses the task of video question answering (videoQA) via a\ndecomposed multi-stage, modular reasoning framework. Previous modular methods\nhave shown promise with a single planning stage ungrounded in visual content.\nHowever, through a simple and effective baseline, we find that such systems can\nlead to brittle behavior in practice for challenging videoQA settings. Thus,\nunlike traditional single-stage planning methods, we propose a multi-stage\nsystem consisting of an event parser, a grounding stage, and a final reasoning\nstage in conjunction with an external memory. All stages are training-free, and\nperformed using few-shot prompting of large models, creating interpretable\nintermediate outputs at each stage. By decomposing the underlying planning and\ntask complexity, our method, MoReVQA, improves over prior work on standard\nvideoQA benchmarks (NExT-QA, iVQA, EgoSchema, ActivityNet-QA) with\nstate-of-the-art results, and extensions to related tasks (grounded videoQA,\nparagraph captioning).\n","authors":["Juhong Min","Shyamal Buch","Arsha Nagrani","Minsu Cho","Cordelia Schmid"],"pdf_url":"https://arxiv.org/pdf/2404.06511v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.06510v1","updated":"2024-04-09T17:59:04Z","published":"2024-04-09T17:59:04Z","title":"Can Feedback Enhance Semantic Grounding in Large Vision-Language Models?","summary":" Enhancing semantic grounding abilities in Vision-Language Models (VLMs) often\ninvolves collecting domain-specific training data, refining the network\narchitectures, or modifying the training recipes. In this work, we venture into\nan orthogonal direction and explore whether VLMs can improve their semantic\ngrounding by \"receiving\" feedback, without requiring in-domain data,\nfine-tuning, or modifications to the network architectures. We systematically\nanalyze this hypothesis using a feedback mechanism composed of a binary signal.\nWe find that if prompted appropriately, VLMs can utilize feedback both in a\nsingle step and iteratively, showcasing the potential of feedback as an\nalternative technique to improve grounding in internet-scale VLMs. Furthermore,\nVLMs, like LLMs, struggle to self-correct errors out-of-the-box. However, we\nfind that this issue can be mitigated via a binary verification mechanism.\nFinally, we explore the potential and limitations of amalgamating these\nfindings and applying them iteratively to automatically enhance VLMs' grounding\nperformance, showing grounding accuracy consistently improves using automated\nfeedback across all models in all settings investigated. 
Overall, our iterative\nframework improves semantic grounding in VLMs by more than 15 accuracy points\nunder noise-free feedback and up to 5 accuracy points under a simple automated\nbinary verification mechanism. The project website is hosted at\nhttps://andrewliao11.github.io/vlms_feedback\n","authors":["Yuan-Hong Liao","Rafid Mahmood","Sanja Fidler","David Acuna"],"pdf_url":"https://arxiv.org/pdf/2404.06510v1.pdf","comment":"31 pages, 15 figures"},{"id":"http://arxiv.org/abs/2404.06507v1","updated":"2024-04-09T17:55:41Z","published":"2024-04-09T17:55:41Z","title":"Reconstructing Hand-Held Objects in 3D","summary":" Objects manipulated by the hand (i.e., manipulanda) are particularly\nchallenging to reconstruct from in-the-wild RGB images or videos. Not only does\nthe hand occlude much of the object, but also the object is often only visible\nin a small number of image pixels. At the same time, two strong anchors emerge\nin this setting: (1) estimated 3D hands help disambiguate the location and\nscale of the object, and (2) the set of manipulanda is small relative to all\npossible objects. With these insights in mind, we present a scalable paradigm\nfor handheld object reconstruction that builds on recent breakthroughs in large\nlanguage/vision models and 3D object datasets. Our model, MCC-Hand-Object\n(MCC-HO), jointly reconstructs hand and object geometry given a single RGB\nimage and inferred 3D hand as inputs. Subsequently, we use GPT-4(V) to retrieve\na 3D object model that matches the object in the image and rigidly align the\nmodel to the network-inferred geometry; we call this alignment\nRetrieval-Augmented Reconstruction (RAR). Experiments demonstrate that MCC-HO\nachieves state-of-the-art performance on lab and Internet datasets, and we show\nhow RAR can be used to automatically obtain 3D labels for in-the-wild images of\nhand-object interactions.\n","authors":["Jane Wu","Georgios Pavlakos","Georgia Gkioxari","Jitendra Malik"],"pdf_url":"https://arxiv.org/pdf/2404.06507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17048v3","updated":"2024-04-09T17:54:12Z","published":"2023-11-28T18:55:37Z","title":"Zero-shot Referring Expression Comprehension via Structural Similarity\n Between Images and Captions","summary":" Zero-shot referring expression comprehension aims at localizing bounding\nboxes in an image corresponding to provided textual prompts, which requires:\n(i) a fine-grained disentanglement of complex visual scene and textual context,\nand (ii) a capacity to understand relationships among disentangled entities.\nUnfortunately, existing large vision-language alignment (VLA) models, e.g.,\nCLIP, struggle with both aspects so cannot be directly used for this task. To\nmitigate this gap, we leverage large foundation models to disentangle both\nimages and texts into triplets in the format of (subject, predicate, object).\nAfter that, grounding is accomplished by calculating the structural similarity\nmatrix between visual and textual triplets with a VLA model, and subsequently\npropagate it to an instance-level similarity matrix. Furthermore, to equip VLA\nmodels with the ability of relationship understanding, we design a\ntriplet-matching objective to fine-tune the VLA models on a collection of\ncurated dataset containing abundant entity relationships. Experiments\ndemonstrate that our visual grounding performance increase of up to 19.5% over\nthe SOTA zero-shot model on RefCOCO/+/g. 
On the more challenging Who's Waldo\ndataset, our zero-shot approach achieves comparable accuracy to the fully\nsupervised model. Code is available at\nhttps://github.com/Show-han/Zeroshot_REC.\n","authors":["Zeyu Han","Fangrui Zhu","Qianru Lao","Huaizu Jiang"],"pdf_url":"https://arxiv.org/pdf/2311.17048v3.pdf","comment":"CVPR 2024, Code available at https://github.com/Show-han/Zeroshot_REC"},{"id":"http://arxiv.org/abs/2212.08731v3","updated":"2024-04-09T17:52:49Z","published":"2022-12-16T22:03:37Z","title":"Multi-person 3D pose estimation from unlabelled data","summary":" Its numerous applications make multi-human 3D pose estimation a remarkably\nimpactful area of research. Nevertheless, assuming a multiple-view system\ncomposed of several regular RGB cameras, 3D multi-pose estimation presents\nseveral challenges. First of all, each person must be uniquely identified in\nthe different views to separate the 2D information provided by the cameras.\nSecondly, the 3D pose estimation process from the multi-view 2D information of\neach person must be robust against noise and potential occlusions in the\nscenario. In this work, we address these two challenges with the help of deep\nlearning. Specifically, we present a model based on Graph Neural Networks\ncapable of predicting the cross-view correspondence of the people in the\nscenario along with a Multilayer Perceptron that takes the 2D points to yield\nthe 3D poses of each person. These two models are trained in a self-supervised\nmanner, thus avoiding the need for large datasets with 3D annotations.\n","authors":["Daniel Rodriguez-Criado","Pilar Bachiller","George Vogiatzis","Luis J. Manso"],"pdf_url":"https://arxiv.org/pdf/2212.08731v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06493v1","updated":"2024-04-09T17:48:52Z","published":"2024-04-09T17:48:52Z","title":"Flying With Photons: Rendering Novel Views of Propagating Light","summary":" We present an imaging and neural rendering technique that seeks to synthesize\nvideos of light propagating through a scene from novel, moving camera\nviewpoints. Our approach relies on a new ultrafast imaging setup to capture a\nfirst-of-its kind, multi-viewpoint video dataset with picosecond-level temporal\nresolution. Combined with this dataset, we introduce an efficient neural volume\nrendering framework based on the transient field. This field is defined as a\nmapping from a 3D point and 2D direction to a high-dimensional, discrete-time\nsignal that represents time-varying radiance at ultrafast timescales. Rendering\nwith transient fields naturally accounts for effects due to the finite speed of\nlight, including viewpoint-dependent appearance changes caused by light\npropagation delays to the camera. We render a range of complex effects,\nincluding scattering, specular reflection, refraction, and diffraction.\nAdditionally, we demonstrate removing viewpoint-dependent propagation delays\nusing a time warping procedure, rendering of relativistic effects, and video\nsynthesis of direct and global components of light transport.\n","authors":["Anagh Malik","Noah Juravsky","Ryan Po","Gordon Wetzstein","Kiriakos N. Kutulakos","David B. 
Lindell"],"pdf_url":"https://arxiv.org/pdf/2404.06493v1.pdf","comment":"Project page: https://anaghmalik.com/FlyingWithPhotons/"},{"id":"http://arxiv.org/abs/2303.12054v4","updated":"2024-04-09T17:44:24Z","published":"2023-03-21T17:45:38Z","title":"Influencer Backdoor Attack on Semantic Segmentation","summary":" When a small number of poisoned samples are injected into the training\ndataset of a deep neural network, the network can be induced to exhibit\nmalicious behavior during inferences, which poses potential threats to\nreal-world applications. While they have been intensively studied in\nclassification, backdoor attacks on semantic segmentation have been largely\noverlooked. Unlike classification, semantic segmentation aims to classify every\npixel within a given image. In this work, we explore backdoor attacks on\nsegmentation models to misclassify all pixels of a victim class by injecting a\nspecific trigger on non-victim pixels during inferences, which is dubbed\nInfluencer Backdoor Attack (IBA). IBA is expected to maintain the\nclassification accuracy of non-victim pixels and mislead classifications of all\nvictim pixels in every single inference and could be easily applied to\nreal-world scenes. Based on the context aggregation ability of segmentation\nmodels, we proposed a simple, yet effective, Nearest-Neighbor trigger injection\nstrategy. We also introduce an innovative Pixel Random Labeling strategy which\nmaintains optimal performance even when the trigger is placed far from the\nvictim pixels. Our extensive experiments reveal that current segmentation\nmodels do suffer from backdoor attacks, demonstrate IBA real-world\napplicability, and show that our proposed techniques can further increase\nattack performance.\n","authors":["Haoheng Lan","Jindong Gu","Philip Torr","Hengshuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2303.12054v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06483v1","updated":"2024-04-09T17:34:19Z","published":"2024-04-09T17:34:19Z","title":"RhythmMamba: Fast Remote Physiological Measurement with Arbitrary Length\n Videos","summary":" Remote photoplethysmography (rPPG) is a non-contact method for detecting\nphysiological signals from facial videos, holding great potential in various\napplications such as healthcare, affective computing, and anti-spoofing.\nExisting deep learning methods struggle to address two core issues of rPPG\nsimultaneously: extracting weak rPPG signals from video segments with large\nspatiotemporal redundancy and understanding the periodic patterns of rPPG among\nlong contexts. This represents a trade-off between computational complexity and\nthe ability to capture long-range dependencies, posing a challenge for rPPG\nthat is suitable for deployment on mobile devices. Based on the in-depth\nexploration of Mamba's comprehension of spatial and temporal information, this\npaper introduces RhythmMamba, an end-to-end Mamba-based method that employs\nmulti-temporal Mamba to constrain both periodic patterns and short-term trends,\ncoupled with frequency domain feed-forward to enable Mamba to robustly\nunderstand the quasi-periodic patterns of rPPG. Extensive experiments show that\nRhythmMamba achieves state-of-the-art performance with reduced parameters and\nlower computational complexity. The proposed RhythmMamba can be applied to\nvideo segments of any length without performance degradation. 
The codes are\navailable at https://github.com/zizheng-guo/RhythmMamba.\n","authors":["Bochao Zou","Zizheng Guo","Xiaocheng Hu","Huimin Ma"],"pdf_url":"https://arxiv.org/pdf/2404.06483v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2402.12788"},{"id":"http://arxiv.org/abs/2404.06479v1","updated":"2024-04-09T17:30:18Z","published":"2024-04-09T17:30:18Z","title":"Text-Based Reasoning About Vector Graphics","summary":" While large multimodal models excel in broad vision-language benchmarks, they\noften struggle with tasks requiring precise perception of low-level visual\ndetails, such as comparing line lengths or solving simple mazes. In particular,\nthis failure mode persists in question-answering tasks about vector graphics --\nimages composed purely of 2D objects and shapes. To address this challenge, we\npropose the Visually Descriptive Language Model (VDLM), which performs\ntext-based reasoning about vector graphics. VDLM leverages Scalable Vector\nGraphics (SVG) for a more precise visual description and first uses an\noff-the-shelf raster-to-SVG algorithm for encoding. Since existing language\nmodels cannot understand raw SVGs in a zero-shot setting, VDLM then bridges SVG\nwith pretrained language models through a newly introduced intermediate\nsymbolic representation, Primal Visual Description (PVD), comprising primitive\nattributes (e.g., shape, position, measurement) with their corresponding\npredicted values. PVD is task-agnostic and represents visual primitives that\nare universal across all vector graphics. It can be learned with procedurally\ngenerated (SVG, PVD) pairs and also enables the direct use of LLMs for\ngeneralization to complex reasoning tasks. By casting an image to a text-based\nrepresentation, we can leverage the power of language models to learn alignment\nfrom SVG to visual primitives and generalize to unseen question-answering\ntasks. Empirical results show that VDLM achieves stronger zero-shot performance\ncompared to state-of-the-art LMMs, such as GPT-4V, in various low-level\nmultimodal perception and reasoning tasks on vector graphics. We additionally\npresent extensive analyses on VDLM's performance, demonstrating that our\nframework offers better interpretability due to its disentangled perception and\nreasoning processes. Project page: https://mikewangwzhl.github.io/VDLM/\n","authors":["Zhenhailong Wang","Joy Hsu","Xingyao Wang","Kuan-Hao Huang","Manling Li","Jiajun Wu","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2404.06479v1.pdf","comment":"Project page: https://mikewangwzhl.github.io/VDLM/"},{"id":"http://arxiv.org/abs/2404.06470v1","updated":"2024-04-09T17:17:48Z","published":"2024-04-09T17:17:48Z","title":"Learning State-Invariant Representations of Objects from Image\n Collections with State, Pose, and Viewpoint Changes","summary":" We add one more invariance - state invariance - to the more commonly used\nother invariances for learning object representations for recognition and\nretrieval. By state invariance, we mean robust with respect to changes in the\nstructural form of the object, such as when an umbrella is folded, or when an\nitem of clothing is tossed on the floor. Since humans generally have no\ndifficulty in recognizing objects despite such state changes, we are naturally\nfaced with the question of whether it is possible to devise a neural\narchitecture with similar abilities. 
To that end, we present a novel dataset,\nObjectsWithStateChange, that captures state and pose variations in the object\nimages recorded from arbitrary viewpoints. We believe that this dataset will\nfacilitate research in fine-grained object recognition and retrieval of objects\nthat are capable of state changes. The goal of such research would be to train\nmodels capable of generating object embeddings that remain invariant to state\nchanges while also staying invariant to transformations induced by changes in\nviewpoint, pose, illumination, etc. To demonstrate the usefulness of the\nObjectsWithStateChange dataset, we also propose a curriculum learning strategy\nthat uses the similarity relationships in the learned embedding space after\neach epoch to guide the training process. The model learns discriminative\nfeatures by comparing visually similar objects within and across different\ncategories, encouraging it to differentiate between objects that may be\nchallenging to distinguish due to changes in their state. We believe that this\nstrategy enhances the model's ability to capture discriminative features for\nfine-grained tasks that may involve objects with state changes, leading to\nperformance improvements on object-level tasks not only on our new dataset, but\nalso on two other challenging multi-view datasets such as ModelNet40 and\nObjectPI.\n","authors":["Rohan Sarkar","Avinash Kak"],"pdf_url":"https://arxiv.org/pdf/2404.06470v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2304.06140v3","updated":"2024-04-09T17:09:03Z","published":"2023-04-12T19:47:13Z","title":"An Edit Friendly DDPM Noise Space: Inversion and Manipulations","summary":" Denoising diffusion probabilistic models (DDPMs) employ a sequence of white\nGaussian noise samples to generate an image. In analogy with GANs, those noise\nmaps could be considered as the latent code associated with the generated\nimage. However, this native noise space does not possess a convenient\nstructure, and is thus challenging to work with in editing tasks. Here, we\npropose an alternative latent noise space for DDPM that enables a wide range of\nediting operations via simple means, and present an inversion method for\nextracting these edit-friendly noise maps for any given image (real or\nsynthetically generated). As opposed to the native DDPM noise space, the\nedit-friendly noise maps do not have a standard normal distribution and are not\nstatistically independent across timesteps. However, they allow perfect\nreconstruction of any desired image, and simple transformations on them\ntranslate into meaningful manipulations of the output image (e.g. shifting,\ncolor edits). Moreover, in text-conditional models, fixing those noise maps\nwhile changing the text prompt, modifies semantics while retaining structure.\nWe illustrate how this property enables text-based editing of real images via\nthe diverse DDPM sampling scheme (in contrast to the popular non-diverse DDIM\ninversion). We also show how it can be used within existing diffusion-based\nediting methods to improve their quality and diversity. Webpage:\nhttps://inbarhub.github.io/DDPM_inversion\n","authors":["Inbar Huberman-Spiegelglas","Vladimir Kulikov","Tomer Michaeli"],"pdf_url":"https://arxiv.org/pdf/2304.06140v3.pdf","comment":"CVPR 2024. 
Code and examples are available at\n https://github.com/inbarhub/DDPM_inversion"},{"id":"http://arxiv.org/abs/2404.06455v1","updated":"2024-04-09T16:55:23Z","published":"2024-04-09T16:55:23Z","title":"A comparative analysis of deep learning models for lung segmentation on\n X-ray images","summary":" Robust and highly accurate lung segmentation in X-rays is crucial in medical\nimaging. This study evaluates deep learning solutions for this task, ranking\nexisting methods and analyzing their performance under diverse image\nmodifications. Out of 61 analyzed papers, only nine offered implementation or\npre-trained models, enabling assessment of three prominent methods: Lung VAE,\nTransResUNet, and CE-Net. The analysis revealed that CE-Net performs best,\ndemonstrating the highest values in dice similarity coefficient and\nintersection over union metric.\n","authors":["Weronika Hryniewska-Guzik","Jakub Bilski","Bartosz Chrostowski","Jakub Drak Sbahi","Przemysław Biecek"],"pdf_url":"https://arxiv.org/pdf/2404.06455v1.pdf","comment":"published at the Polish Conference on Artificial Intelligence\n (PP-RAI), 2024"},{"id":"http://arxiv.org/abs/2404.06453v1","updated":"2024-04-09T16:54:19Z","published":"2024-04-09T16:54:19Z","title":"PURE: Turning Polysemantic Neurons Into Pure Features by Identifying\n Relevant Circuits","summary":" The field of mechanistic interpretability aims to study the role of\nindividual neurons in Deep Neural Networks. Single neurons, however, have the\ncapability to act polysemantically and encode for multiple (unrelated)\nfeatures, which renders their interpretation difficult. We present a method for\ndisentangling polysemanticity of any Deep Neural Network by decomposing a\npolysemantic neuron into multiple monosemantic \"virtual\" neurons. This is\nachieved by identifying the relevant sub-graph (\"circuit\") for each \"pure\"\nfeature. We demonstrate how our approach allows us to find and disentangle\nvarious polysemantic units of ResNet models trained on ImageNet. While\nevaluating feature visualizations using CLIP, our method effectively\ndisentangles representations, improving upon methods based on neuron\nactivations. Our code is available at https://github.com/maxdreyer/PURE.\n","authors":["Maximilian Dreyer","Erblina Purelku","Johanna Vielhaben","Wojciech Samek","Sebastian Lapuschkin"],"pdf_url":"https://arxiv.org/pdf/2404.06453v1.pdf","comment":"14 pages (4 pages manuscript, 2 pages references, 8 pages appendix)"},{"id":"http://arxiv.org/abs/2404.06451v1","updated":"2024-04-09T16:53:43Z","published":"2024-04-09T16:53:43Z","title":"SmartControl: Enhancing ControlNet for Handling Rough Visual Conditions","summary":" Human visual imagination usually begins with analogies or rough sketches. For\nexample, given an image with a girl playing guitar before a building, one may\nanalogously imagine how it seems like if Iron Man playing guitar before Pyramid\nin Egypt. Nonetheless, visual condition may not be precisely aligned with the\nimaginary result indicated by text prompt, and existing layout-controllable\ntext-to-image (T2I) generation models is prone to producing degraded generated\nresults with obvious artifacts. To address this issue, we present a novel T2I\ngeneration method dubbed SmartControl, which is designed to modify the rough\nvisual conditions for adapting to text prompt. The key idea of our SmartControl\nis to relax the visual condition on the areas that are conflicted with text\nprompts. 
In specific, a Control Scale Predictor (CSP) is designed to identify\nthe conflict regions and predict the local control scales, while a dataset with\ntext prompts and rough visual conditions is constructed for training CSP. It is\nworth noting that, even with a limited number (e.g., 1,000~2,000) of training\nsamples, our SmartControl can generalize well to unseen objects. Extensive\nexperiments on four typical visual condition types clearly show the efficacy of\nour SmartControl against state-of-the-arts. Source code, pre-trained models,\nand datasets are available at https://github.com/liuxiaoyu1104/SmartControl.\n","authors":["Xiaoyu Liu","Yuxiang Wei","Ming Liu","Xianhui Lin","Peiran Ren","Xuansong Xie","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.06451v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06447v1","updated":"2024-04-09T16:49:42Z","published":"2024-04-09T16:49:42Z","title":"The Central Spanning Tree Problem","summary":" Spanning trees are an important primitive in many data analysis tasks, when a\ndata set needs to be summarized in terms of its \"skeleton\", or when a\ntree-shaped graph over all observations is required for downstream processing.\nPopular definitions of spanning trees include the minimum spanning tree and the\noptimum distance spanning tree, a.k.a. the minimum routing cost tree. When\nsearching for the shortest spanning tree but admitting additional branching\npoints, even shorter spanning trees can be realized: Steiner trees.\nUnfortunately, both minimum spanning and Steiner trees are not robust with\nrespect to noise in the observations; that is, small perturbations of the\noriginal data set often lead to drastic changes in the associated spanning\ntrees. In response, we make two contributions when the data lies in a Euclidean\nspace: on the theoretical side, we introduce a new optimization problem, the\n\"(branched) central spanning tree\", which subsumes all previously mentioned\ndefinitions as special cases. On the practical side, we show empirically that\nthe (branched) central spanning tree is more robust to noise in the data, and\nas such is better suited to summarize a data set in terms of its skeleton. We\nalso propose a heuristic to address the NP-hard optimization problem, and\nillustrate its use on single cell RNA expression data from biology and 3D point\nclouds of plants.\n","authors":["Enrique Fita Sanmartín","Christoph Schnörr","Fred A. Hamprecht"],"pdf_url":"https://arxiv.org/pdf/2404.06447v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06443v1","updated":"2024-04-09T16:45:34Z","published":"2024-04-09T16:45:34Z","title":"Multi-scale Dynamic and Hierarchical Relationship Modeling for Facial\n Action Units Recognition","summary":" Human facial action units (AUs) are mutually related in a hierarchical\nmanner, as not only they are associated with each other in both spatial and\ntemporal domains but also AUs located in the same/close facial regions show\nstronger relationships than those of different facial regions. While none of\nexisting approach thoroughly model such hierarchical inter-dependencies among\nAUs, this paper proposes to comprehensively model multi-scale AU-related\ndynamic and hierarchical spatio-temporal relationship among AUs for their\noccurrences recognition. 
Specifically, we first propose a novel multi-scale\ntemporal differencing network with an adaptive weighting block to explicitly\ncapture facial dynamics across frames at different spatial scales, which\nspecifically considers the heterogeneity of range and magnitude in different\nAUs' activation. Then, a two-stage strategy is introduced to hierarchically\nmodel the relationship among AUs based on their spatial distribution (i.e.,\nlocal and cross-region AU relationship modelling). Experimental results\nachieved on BP4D and DISFA show that our approach is the new state-of-the-art\nin the field of AU occurrence recognition. Our code is publicly available at\nhttps://github.com/CVI-SZU/MDHR.\n","authors":["Zihan Wang","Siyang Song","Cheng Luo","Songhe Deng","Weicheng Xie","Linlin Shen"],"pdf_url":"https://arxiv.org/pdf/2404.06443v1.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2404.06442v1","updated":"2024-04-09T16:42:54Z","published":"2024-04-09T16:42:54Z","title":"QueSTMaps: Queryable Semantic Topological Maps for 3D Scene\n Understanding","summary":" Understanding the structural organisation of 3D indoor scenes in terms of\nrooms is often accomplished via floorplan extraction. Robotic tasks such as\nplanning and navigation require a semantic understanding of the scene as well.\nThis is typically achieved via object-level semantic segmentation. However,\nsuch methods struggle to segment out topological regions like \"kitchen\" in the\nscene. In this work, we introduce a two-step pipeline. First, we extract a\ntopological map, i.e., floorplan of the indoor scene using a novel\nmulti-channel occupancy representation. Then, we generate CLIP-aligned features\nand semantic labels for every room instance based on the objects it contains\nusing a self-attention transformer. Our language-topology alignment supports\nnatural language querying, e.g., a \"place to cook\" locates the \"kitchen\". We\noutperform the current state-of-the-art on room segmentation by ~20% and room\nclassification by ~12%. Our detailed qualitative analysis and ablation studies\nprovide insights into the problem of joint structural and semantic 3D scene\nunderstanding.\n","authors":["Yash Mehan","Kumaraditya Gupta","Rohit Jayanti","Anirudh Govil","Sourav Garg","Madhava Krishna"],"pdf_url":"https://arxiv.org/pdf/2404.06442v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2110.12962v2","updated":"2024-04-09T16:39:00Z","published":"2021-10-25T13:56:00Z","title":"Event Data Association via Robust Model Fitting for Event-based Object\n Tracking","summary":" Event-based approaches, which are based on bio-inspired asynchronous event\ncameras, have achieved promising performance on various computer vision tasks.\nHowever, the study of the fundamental event data association problem is still\nin its infancy. In this paper, we propose a novel Event Data Association\n(called EDA) approach to explicitly address the event association and fusion\nproblem. The proposed EDA seeks for event trajectories that best fit the event\ndata, in order to perform unifying data association and information fusion. In\nEDA, we first asynchronously fuse the event data based on its information\nentropy. Then, we introduce a deterministic model hypothesis generation\nstrategy, which effectively generates model hypotheses from the fused events,\nto represent the corresponding event trajectories. 
After that, we present a\ntwo-stage weighting algorithm, which robustly weighs and selects true models\nfrom the generated model hypotheses, through multi-structural geometric model\nfitting. Meanwhile, we also propose an adaptive model selection strategy to\nautomatically determine the number of the true models. Finally, we use the\nselected true models to associate and fuse the event data, without being\naffected by sensor noise and irrelevant structures. We evaluate the performance\nof the proposed EDA on the object tracking task. The experimental results show\nthe effectiveness of EDA under challenging scenarios, such as high speed,\nmotion blur, and high dynamic range conditions.\n","authors":["Haosheng Chen","Shuyuan Lin","Yan Yan","Hanzi Wang","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2110.12962v2.pdf","comment":"32 pages, 7 figures"},{"id":"http://arxiv.org/abs/2403.02408v2","updated":"2024-04-09T16:35:41Z","published":"2024-03-04T19:06:13Z","title":"A Spatio-temporal Aligned SUNet Model for Low-light Video Enhancement","summary":" Distortions caused by low-light conditions are not only visually unpleasant\nbut also degrade the performance of computer vision tasks. The restoration and\nenhancement have proven to be highly beneficial. However, there are only a\nlimited number of enhancement methods explicitly designed for videos acquired\nin low-light conditions. We propose a Spatio-Temporal Aligned SUNet (STA-SUNet)\nmodel using a Swin Transformer as a backbone to capture low light video\nfeatures and exploit their spatio-temporal correlations. The STA-SUNet model is\ntrained on a novel, fully registered dataset (BVI), which comprises dynamic\nscenes captured under varying light conditions. It is further analysed\ncomparatively against various other models over three test datasets. The model\ndemonstrates superior adaptivity across all datasets, obtaining the highest\nPSNR and SSIM values. It is particularly effective in extreme low-light\nconditions, yielding fairly good visualisation results.\n","authors":["Ruirui Lin","Nantheera Anantrasirichai","Alexandra Malyugina","David Bull"],"pdf_url":"https://arxiv.org/pdf/2403.02408v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03108v3","updated":"2024-04-09T16:31:33Z","published":"2023-07-06T16:27:39Z","title":"DIAGNOSIS: Detecting Unauthorized Data Usages in Text-to-image Diffusion\n Models","summary":" Recent text-to-image diffusion models have shown surprising performance in\ngenerating high-quality images. However, concerns have arisen regarding the\nunauthorized data usage during the training or fine-tuning process. One example\nis when a model trainer collects a set of images created by a particular artist\nand attempts to train a model capable of generating similar images without\nobtaining permission and giving credit to the artist. To address this issue, we\npropose a method for detecting such unauthorized data usage by planting the\ninjected memorization into the text-to-image diffusion models trained on the\nprotected dataset. Specifically, we modify the protected images by adding\nunique contents on these images using stealthy image warping functions that are\nnearly imperceptible to humans but can be captured and memorized by diffusion\nmodels. By analyzing whether the model has memorized the injected content\n(i.e., whether the generated images are processed by the injected\npost-processing function), we can detect models that had illegally utilized the\nunauthorized data. 
Experiments on Stable Diffusion and VQ Diffusion with\ndifferent model training or fine-tuning methods (i.e, LoRA, DreamBooth, and\nstandard training) demonstrate the effectiveness of our proposed method in\ndetecting unauthorized data usages. Code:\nhttps://github.com/ZhentingWang/DIAGNOSIS.\n","authors":["Zhenting Wang","Chen Chen","Lingjuan Lyu","Dimitris N. Metaxas","Shiqing Ma"],"pdf_url":"https://arxiv.org/pdf/2307.03108v3.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2404.06437v1","updated":"2024-04-09T16:28:54Z","published":"2024-04-09T16:28:54Z","title":"Seasonal Fire Prediction using Spatio-Temporal Deep Neural Networks","summary":" With climate change expected to exacerbate fire weather conditions, the\naccurate anticipation of wildfires on a global scale becomes increasingly\ncrucial for disaster mitigation. In this study, we utilize SeasFire, a\ncomprehensive global wildfire dataset with climate, vegetation, oceanic\nindices, and human-related variables, to enable seasonal wildfire forecasting\nwith machine learning. For the predictive analysis, we train deep learning\nmodels with different architectures that capture the spatio-temporal context\nleading to wildfires. Our investigation focuses on assessing the effectiveness\nof these models in predicting the presence of burned areas at varying\nforecasting time horizons globally, extending up to six months into the future,\nand on how different spatial or/and temporal context affects the performance of\nthe models. Our findings demonstrate the great potential of deep learning\nmodels in seasonal fire forecasting; longer input time-series leads to more\nrobust predictions across varying forecasting horizons, while integrating\nspatial information to capture wildfire spatio-temporal dynamics boosts\nperformance. Finally, our results hint that in order to enhance performance at\nlonger forecasting horizons, a larger receptive field spatially needs to be\nconsidered.\n","authors":["Dimitrios Michail","Lefki-Ioanna Panagiotou","Charalampos Davalas","Ioannis Prapas","Spyros Kondylatos","Nikolaos Ioannis Bountos","Ioannis Papoutsis"],"pdf_url":"https://arxiv.org/pdf/2404.06437v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06430v1","updated":"2024-04-09T16:23:01Z","published":"2024-04-09T16:23:01Z","title":"pfl-research: simulation framework for accelerating research in Private\n Federated Learning","summary":" Federated learning (FL) is an emerging machine learning (ML) training\nparadigm where clients own their data and collaborate to train a global model,\nwithout revealing any data to the server and other participants. Researchers\ncommonly perform experiments in a simulation environment to quickly iterate on\nideas. However, existing open-source tools do not offer the efficiency required\nto simulate FL on larger and more realistic FL datasets. We introduce\npfl-research, a fast, modular, and easy-to-use Python framework for simulating\nFL. It supports TensorFlow, PyTorch, and non-neural network models, and is\ntightly integrated with state-of-the-art privacy algorithms. We study the speed\nof open-source FL frameworks and show that pfl-research is 7-72$\\times$ faster\nthan alternative open-source frameworks on common cross-device setups. Such\nspeedup will significantly boost the productivity of the FL research community\nand enable testing hypotheses on realistic FL datasets that were previously too\nresource intensive. 
We release a suite of benchmarks that evaluates an\nalgorithm's overall performance on a diverse set of realistic scenarios. The\ncode is available on GitHub at https://github.com/apple/pfl-research.\n","authors":["Filip Granqvist","Congzheng Song","Áine Cahill","Rogier van Dalen","Martin Pelikan","Yi Sheng Chan","Xiaojun Feng","Natarajan Krishnaswami","Vojta Jina","Mona Chitnis"],"pdf_url":"https://arxiv.org/pdf/2404.06430v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06429v1","updated":"2024-04-09T16:20:03Z","published":"2024-04-09T16:20:03Z","title":"Magic-Boost: Boost 3D Generation with Mutli-View Conditioned Diffusion","summary":" Benefiting from the rapid development of 2D diffusion models, 3D content\ncreation has made significant progress recently. One promising solution\ninvolves the fine-tuning of pre-trained 2D diffusion models to harness their\ncapacity for producing multi-view images, which are then lifted into accurate\n3D models via methods like fast-NeRFs or large reconstruction models. However,\nas inconsistency still exists and limited generated resolution, the generation\nresults of such methods still lack intricate textures and complex geometries.\nTo solve this problem, we propose Magic-Boost, a multi-view conditioned\ndiffusion model that significantly refines coarse generative results through a\nbrief period of SDS optimization ($\\sim15$min). Compared to the previous text\nor single image based diffusion models, Magic-Boost exhibits a robust\ncapability to generate images with high consistency from pseudo synthesized\nmulti-view images. It provides precise SDS guidance that well aligns with the\nidentity of the input images, enriching the local detail in both geometry and\ntexture of the initial generative results. Extensive experiments show\nMagic-Boost greatly enhances the coarse inputs and generates high-quality 3D\nassets with rich geometric and textural details. (Project Page:\nhttps://magic-research.github.io/magic-boost/)\n","authors":["Fan Yang","Jianfeng Zhang","Yichun Shi","Bowen Chen","Chenxu Zhang","Huichao Zhang","Xiaofeng Yang","Jiashi Feng","Guosheng Lin"],"pdf_url":"https://arxiv.org/pdf/2404.06429v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06425v1","updated":"2024-04-09T16:15:03Z","published":"2024-04-09T16:15:03Z","title":"ZeST: Zero-Shot Material Transfer from a Single Image","summary":" We propose ZeST, a method for zero-shot material transfer to an object in the\ninput image given a material exemplar image. ZeST leverages existing diffusion\nadapters to extract implicit material representation from the exemplar image.\nThis representation is used to transfer the material using pre-trained\ninpainting diffusion model on the object in the input image using depth\nestimates as geometry cue and grayscale object shading as illumination cues.\nThe method works on real images without any training resulting a zero-shot\napproach. Both qualitative and quantitative results on real and synthetic\ndatasets demonstrate that ZeST outputs photorealistic images with transferred\nmaterials. We also show the application of ZeST to perform multiple edits and\nrobust material assignment under different illuminations. 
Project Page:\nhttps://ttchengab.github.io/zest\n","authors":["Ta-Ying Cheng","Prafull Sharma","Andrew Markham","Niki Trigoni","Varun Jampani"],"pdf_url":"https://arxiv.org/pdf/2404.06425v1.pdf","comment":"Project Page: https://ttchengab.github.io/zest"},{"id":"http://arxiv.org/abs/2404.06406v1","updated":"2024-04-09T15:54:03Z","published":"2024-04-09T15:54:03Z","title":"Emergent Dynamics in Neural Cellular Automata","summary":" Neural Cellular Automata (NCA) models are trainable variations of traditional\nCellular Automata (CA). Emergent motion in the patterns created by NCA has been\nsuccessfully applied to synthesize dynamic textures. However, the conditions\nrequired for an NCA to display dynamic patterns remain unexplored. Here, we\ninvestigate the relationship between the NCA architecture and the emergent\ndynamics of the trained models. Specifically, we vary the number of channels in\nthe cell state and the number of hidden neurons in the MultiLayer Perceptron\n(MLP), and draw a relationship between the combination of these two variables\nand the motion strength between successive frames. Our analysis reveals that\nthe disparity and proportionality between these two variables have a strong\ncorrelation with the emergent dynamics in the NCA output. We thus propose a\ndesign principle for creating dynamic NCA.\n","authors":["Yitao Xu","Ehsan Pajouheshgar","Sabine Süsstrunk"],"pdf_url":"https://arxiv.org/pdf/2404.06406v1.pdf","comment":"2 pages"},{"id":"http://arxiv.org/abs/2312.09168v3","updated":"2024-04-09T15:47:56Z","published":"2023-12-14T17:34:53Z","title":"DiffusionLight: Light Probes for Free by Painting a Chrome Ball","summary":" We present a simple yet effective technique to estimate lighting in a single\ninput image. Current techniques rely heavily on HDR panorama datasets to train\nneural networks to regress an input with limited field-of-view to a full\nenvironment map. However, these approaches often struggle with real-world,\nuncontrolled settings due to the limited diversity and size of their datasets.\nTo address this problem, we leverage diffusion models trained on billions of\nstandard images to render a chrome ball into the input image. Despite its\nsimplicity, this task remains challenging: the diffusion models often insert\nincorrect or inconsistent objects and cannot readily generate images in HDR\nformat. Our research uncovers a surprising relationship between the appearance\nof chrome balls and the initial diffusion noise map, which we utilize to\nconsistently generate high-quality chrome balls. We further fine-tune an LDR\ndiffusion model (Stable Diffusion XL) with LoRA, enabling it to perform\nexposure bracketing for HDR light estimation. Our method produces convincing\nlight estimates across diverse settings and demonstrates superior\ngeneralization to in-the-wild scenarios.\n","authors":["Pakkapon Phongthawee","Worameth Chinchuthakun","Nontaphat Sinsunthithet","Amit Raj","Varun Jampani","Pramook Khungurn","Supasorn Suwajanakorn"],"pdf_url":"https://arxiv.org/pdf/2312.09168v3.pdf","comment":"CVPR 2024 Oral. For more information and code, please visit our\n website https://diffusionlight.github.io/"},{"id":"http://arxiv.org/abs/2204.03330v2","updated":"2024-04-09T15:44:05Z","published":"2022-04-07T09:56:36Z","title":"Learning Local and Global Temporal Contexts for Video Semantic\n Segmentation","summary":" Contextual information plays a core role for video semantic segmentation\n(VSS). 
This paper summarizes contexts for VSS in two-fold: local temporal\ncontexts (LTC) which define the contexts from neighboring frames, and global\ntemporal contexts (GTC) which represent the contexts from the whole video. As\nfor LTC, it includes static and motional contexts, corresponding to static and\nmoving content in neighboring frames, respectively. Previously, both static and\nmotional contexts have been studied. However, there is no research about\nsimultaneously learning static and motional contexts (highly complementary).\nHence, we propose a Coarse-to-Fine Feature Mining (CFFM) technique to learn a\nunified presentation of LTC. CFFM contains two parts: Coarse-to-Fine Feature\nAssembling (CFFA) and Cross-frame Feature Mining (CFM). CFFA abstracts static\nand motional contexts, and CFM mines useful information from nearby frames to\nenhance target features. To further exploit more temporal contexts, we propose\nCFFM++ by additionally learning GTC from the whole video. Specifically, we\nuniformly sample certain frames from the video and extract global contextual\nprototypes by k-means. The information within those prototypes is mined by CFM\nto refine target features. Experimental results on popular benchmarks\ndemonstrate that CFFM and CFFM++ perform favorably against state-of-the-art\nmethods. Our code is available at https://github.com/GuoleiSun/VSS-CFFM\n","authors":["Guolei Sun","Yun Liu","Henghui Ding","Min Wu","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2204.03330v2.pdf","comment":"Accepted to TPAMI, an extended version of a paper published in CVPR\n 2022"},{"id":"http://arxiv.org/abs/2401.16110v2","updated":"2024-04-09T15:33:10Z","published":"2024-01-29T12:31:13Z","title":"SGV3D:Towards Scenario Generalization for Vision-based Roadside 3D\n Object Detection","summary":" Roadside perception can greatly increase the safety of autonomous vehicles by\nextending their perception ability beyond the visual range and addressing blind\nspots. However, current state-of-the-art vision-based roadside detection\nmethods possess high accuracy on labeled scenes but have inferior performance\non new scenes. This is because roadside cameras remain stationary after\ninstallation and can only collect data from a single scene, resulting in the\nalgorithm overfitting these roadside backgrounds and camera poses. To address\nthis issue, in this paper, we propose an innovative Scenario Generalization\nFramework for Vision-based Roadside 3D Object Detection, dubbed SGV3D.\nSpecifically, we employ a Background-suppressed Module (BSM) to mitigate\nbackground overfitting in vision-centric pipelines by attenuating background\nfeatures during the 2D to bird's-eye-view projection. Furthermore, by\nintroducing the Semi-supervised Data Generation Pipeline (SSDG) using unlabeled\nimages from new scenes, diverse instance foregrounds with varying camera poses\nare generated, addressing the risk of overfitting specific camera poses. We\nevaluate our method on two large-scale roadside benchmarks. Our method\nsurpasses all previous methods by a significant margin in new scenes, including\n+42.57% for vehicle, +5.87% for pedestrian, and +14.89% for cyclist compared to\nBEVHeight on the DAIR-V2X-I heterologous benchmark. On the larger-scale Rope3D\nheterologous benchmark, we achieve notable gains of 14.48% for car and 12.41%\nfor large vehicle. We aspire to contribute insights on the exploration of\nroadside perception techniques, emphasizing their capability for scenario\ngeneralization. 
The code will be available at\nhttps://github.com/yanglei18/SGV3D\n","authors":["Lei Yang","Xinyu Zhang","Jun Li","Li Wang","Chuang Zhang","Li Ju","Zhiwei Li","Yang Shen"],"pdf_url":"https://arxiv.org/pdf/2401.16110v2.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2404.06389v1","updated":"2024-04-09T15:31:48Z","published":"2024-04-09T15:31:48Z","title":"Raster Forge: Interactive Raster Manipulation Library and GUI for Python","summary":" Raster Forge is a Python library and graphical user interface for raster data\nmanipulation and analysis. The tool is focused on remote sensing applications,\nparticularly in wildfire management. It allows users to import, visualize, and\nprocess raster layers for tasks such as image compositing or topographical\nanalysis. For wildfire management, it generates fuel maps using predefined\nmodels. Its impact extends from disaster management to hydrological modeling,\nagriculture, and environmental monitoring. Raster Forge can be a valuable asset\nfor geoscientists and researchers who rely on raster data analysis, enhancing\ngeospatial data processing and visualization across various disciplines.\n","authors":["Afonso Oliveira","Nuno Fachada","João P. Matos-Carvalho"],"pdf_url":"https://arxiv.org/pdf/2404.06389v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20330v2","updated":"2024-04-09T15:17:50Z","published":"2024-03-29T17:59:34Z","title":"Are We on the Right Way for Evaluating Large Vision-Language Models?","summary":" Large vision-language models (LVLMs) have recently achieved rapid progress,\nsparking numerous studies to evaluate their multi-modal capabilities. However,\nwe dig into current evaluation works and identify two primary issues: 1) Visual\ncontent is unnecessary for many samples. The answers can be directly inferred\nfrom the questions and options, or the world knowledge embedded in LLMs. This\nphenomenon is prevalent across current benchmarks. For instance, GeminiPro\nachieves 42.9% on the MMMU benchmark without any visual input, and outperforms\nthe random choice baseline across six benchmarks over 24% on average. 2)\nUnintentional data leakage exists in LLM and LVLM training. LLM and LVLM could\nstill answer some visual-necessary questions without visual content, indicating\nthe memorizing of these samples within large-scale training data. For example,\nSphinx-X-MoE gets 43.6% on MMMU without accessing images, surpassing its LLM\nbackbone with 17.9%. Both problems lead to misjudgments of actual multi-modal\ngains and potentially misguide the study of LVLM. To this end, we present\nMMStar, an elite vision-indispensable multi-modal benchmark comprising 1,500\nsamples meticulously selected by humans. MMStar benchmarks 6 core capabilities\nand 18 detailed axes, aiming to evaluate LVLMs' multi-modal capacities with\ncarefully balanced and purified samples. These samples are first roughly\nselected from current benchmarks with an automated pipeline, human review is\nthen involved to ensure each curated sample exhibits visual dependency, minimal\ndata leakage, and requires advanced multi-modal capabilities. Moreover, two\nmetrics are developed to measure data leakage and actual performance gain in\nmulti-modal training. 
We evaluate 16 leading LVLMs on MMStar to assess their\nmulti-modal capabilities, and on 7 benchmarks with the proposed metrics to\ninvestigate their data leakage and actual multi-modal gain.\n","authors":["Lin Chen","Jinsong Li","Xiaoyi Dong","Pan Zhang","Yuhang Zang","Zehui Chen","Haodong Duan","Jiaqi Wang","Yu Qiao","Dahua Lin","Feng Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.20330v2.pdf","comment":"Project page: https://mmstar-benchmark.github.io/"},{"id":"http://arxiv.org/abs/2403.04198v2","updated":"2024-04-09T15:07:08Z","published":"2024-03-07T03:59:47Z","title":"CN-RMA: Combined Network with Ray Marching Aggregation for 3D Indoors\n Object Detection from Multi-view Images","summary":" This paper introduces CN-RMA, a novel approach for 3D indoor object detection\nfrom multi-view images. We observe the key challenge as the ambiguity of image\nand 3D correspondence without explicit geometry to provide occlusion\ninformation. To address this issue, CN-RMA leverages the synergy of 3D\nreconstruction networks and 3D object detection networks, where the\nreconstruction network provides a rough Truncated Signed Distance Function\n(TSDF) and guides image features to vote to 3D space correctly in an end-to-end\nmanner. Specifically, we associate weights to sampled points of each ray\nthrough ray marching, representing the contribution of a pixel in an image to\ncorresponding 3D locations. Such weights are determined by the predicted signed\ndistances so that image features vote only to regions near the reconstructed\nsurface. Our method achieves state-of-the-art performance in 3D object\ndetection from multi-view images, as measured by mAP@0.25 and mAP@0.5 on the\nScanNet and ARKitScenes datasets. The code and models are released at\nhttps://github.com/SerCharles/CN-RMA.\n","authors":["Guanlin Shen","Jingwei Huang","Zhihua Hu","Bin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.04198v2.pdf","comment":"CVPR2024 poster paper, 8 pages of main part, and 4 pages of\n supplementary material"},{"id":"http://arxiv.org/abs/2311.06798v2","updated":"2024-04-09T15:07:02Z","published":"2023-11-12T10:21:04Z","title":"MetaMix: Meta-state Precision Searcher for Mixed-precision Activation\n Quantization","summary":" Mixed-precision quantization of efficient networks often suffer from\nactivation instability encountered in the exploration of bit selections. To\naddress this problem, we propose a novel method called MetaMix which consists\nof bit selection and weight training phases. The bit selection phase iterates\ntwo steps, (1) the mixed-precision-aware weight update, and (2) the bit-search\ntraining with the fixed mixed-precision-aware weights, both of which combined\nreduce activation instability in mixed-precision quantization and contribute to\nfast and high-quality bit selection. The weight training phase exploits the\nweights and step sizes trained in the bit selection phase and fine-tunes them\nthereby offering fast training. Our experiments with efficient and\nhard-to-quantize networks, i.e., MobileNet v2 and v3, and ResNet-18 on ImageNet\nshow that our proposed method pushes the boundary of mixed-precision\nquantization, in terms of accuracy vs. operations, by outperforming both mixed-\nand single-precision SOTA methods.\n","authors":["Han-Byul Kim","Joo Hyung Lee","Sungjoo Yoo","Hong-Seok Kim"],"pdf_url":"https://arxiv.org/pdf/2311.06798v2.pdf","comment":"Proc. 
The 38th Annual AAAI Conference on Artificial Intelligence\n (AAAI)"},{"id":"http://arxiv.org/abs/2404.06369v1","updated":"2024-04-09T15:05:48Z","published":"2024-04-09T15:05:48Z","title":"VISION2UI: A Real-World Dataset with Layout for Code Generation from UI\n Designs","summary":" Automatically generating UI code from webpage design visions can\nsignificantly alleviate the burden of developers, enabling beginner developers\nor designers to directly generate Web pages from design diagrams. Currently,\nprior research has accomplished the objective of generating UI code from\nrudimentary design visions or sketches through designing deep neural networks.\nInspired by the groundbreaking advancements achieved by Multimodal Large\nLanguage Models (MLLMs), the automatic generation of UI code from high-fidelity\ndesign images is now emerging as a viable possibility. Nevertheless, our\ninvestigation reveals that existing MLLMs are hampered by the scarcity of\nauthentic, high-quality, and large-scale datasets, leading to unsatisfactory\nperformance in automated UI code generation. To mitigate this gap, we present a\nnovel dataset, termed VISION2UI, extracted from real-world scenarios, augmented\nwith comprehensive layout information, tailored specifically for finetuning\nMLLMs in UI code generation. Specifically, this dataset is derived through a\nseries of operations, encompassing collecting, cleaning, and filtering of the\nopen-source Common Crawl dataset. In order to uphold its quality, a neural\nscorer trained on labeled samples is utilized to refine the data, retaining\nhigher-quality instances. Ultimately, this process yields a dataset comprising\n2,000 (Much more is coming soon) parallel samples encompassing design visions\nand UI code. The dataset is available at\nhttps://huggingface.co/datasets/xcodemind/vision2ui.\n","authors":["Yi Gui","Zhen Li","Yao Wan","Yemin Shi","Hongyu Zhang","Yi Su","Shaoling Dong","Xing Zhou","Wenbin Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.06369v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06365v1","updated":"2024-04-09T15:02:01Z","published":"2024-04-09T15:02:01Z","title":"Dynamic Resolution Guidance for Facial Expression Recognition","summary":" Facial expression recognition (FER) is vital for human-computer interaction\nand emotion analysis, yet recognizing expressions in low-resolution images\nremains challenging. This paper introduces a practical method called Dynamic\nResolution Guidance for Facial Expression Recognition (DRGFER) to effectively\nrecognize facial expressions in images with varying resolutions without\ncompromising FER model accuracy. Our framework comprises two main components:\nthe Resolution Recognition Network (RRN) and the Multi-Resolution Adaptation\nFacial Expression Recognition Network (MRAFER). The RRN determines image\nresolution, outputs a binary vector, and the MRAFER assigns images to suitable\nfacial expression recognition networks based on resolution. We evaluated DRGFER\non widely-used datasets RAFDB and FERPlus, demonstrating that our method\nretains optimal model performance at each resolution and outperforms\nalternative resolution approaches. 
The proposed framework exhibits robustness\nagainst resolution variations and facial expressions, offering a promising\nsolution for real-world applications.\n","authors":["Jie Ou","Xu Li","Tianxiang Jiang","Yuanlun Xie"],"pdf_url":"https://arxiv.org/pdf/2404.06365v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06362v1","updated":"2024-04-09T14:56:34Z","published":"2024-04-09T14:56:34Z","title":"Test-Time Adaptation with SaLIP: A Cascade of SAM and CLIP for Zero shot\n Medical Image Segmentation","summary":" The Segment Anything Model (SAM) and CLIP are remarkable vision foundation\nmodels (VFMs). SAM, a prompt driven segmentation model, excels in segmentation\ntasks across diverse domains, while CLIP is renowned for its zero shot\nrecognition capabilities. However, their unified potential has not yet been\nexplored in medical image segmentation. To adapt SAM to medical imaging,\nexisting methods primarily rely on tuning strategies that require extensive\ndata or prior prompts tailored to the specific task, making it particularly\nchallenging when only a limited number of data samples are available. This work\npresents an in depth exploration of integrating SAM and CLIP into a unified\nframework for medical image segmentation. Specifically, we propose a simple\nunified framework, SaLIP, for organ segmentation. Initially, SAM is used for\npart based segmentation within the image, followed by CLIP to retrieve the mask\ncorresponding to the region of interest (ROI) from the pool of SAM generated\nmasks. Finally, SAM is prompted by the retrieved ROI to segment a specific\norgan. Thus, SaLIP is training and fine tuning free and does not rely on domain\nexpertise or labeled data for prompt engineering. Our method shows substantial\nenhancements in zero shot segmentation, showcasing notable improvements in DICE\nscores across diverse segmentation tasks like brain (63.46%), lung (50.11%),\nand fetal head (30.82%), when compared to un prompted SAM. Code and text\nprompts will be available online.\n","authors":["Sidra Aleem","Fangyijie Wang","Mayug Maniparambil","Eric Arazo","Julia Dietlmeier","Kathleen Curran","Noel E. O'Connor","Suzanne Little"],"pdf_url":"https://arxiv.org/pdf/2404.06362v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06353v1","updated":"2024-04-09T14:44:12Z","published":"2024-04-09T14:44:12Z","title":"High Noise Scheduling is a Must","summary":" Consistency models possess high capabilities for image generation, advancing\nsampling steps to a single step through their advanced techniques. Current\nadvancements move one step forward consistency training techniques and\neliminates the limitation of distillation training. Even though the proposed\ncurriculum and noise scheduling in improved training techniques yield better\nresults than basic consistency models, it lacks well balanced noise\ndistribution and its consistency between curriculum. In this study, it is\ninvestigated the balance between high and low noise levels in noise\ndistribution and offered polynomial noise distribution to maintain the\nstability. This proposed polynomial noise distribution is also supported with a\npredefined Karras noises to prevent unique noise levels arises with Karras\nnoise generation algorithm. Furthermore, by elimination of learned noisy steps\nwith a curriculum based on sinusoidal function increase the performance of the\nmodel in denoising. 
To make a fair comparison with the latest released\nconsistency model training techniques, experiments are conducted with same\nhyper-parameters except curriculum and noise distribution. The models utilized\nduring experiments are determined with low depth to prove the robustness of our\nproposed technique. The results show that the polynomial noise distribution\noutperforms the model trained with log-normal noise distribution, yielding a\n33.54 FID score after 100,000 training steps with constant discretization\nsteps. Additionally, the implementation of a sinusoidal-based curriculum\nenhances denoising performance, resulting in a FID score of 30.48.\n","authors":["Mahmut S. Gokmen","Cody Bumgardner","Jie Zhang","Ge Wang","Jin Chen"],"pdf_url":"https://arxiv.org/pdf/2404.06353v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06352v1","updated":"2024-04-09T14:43:19Z","published":"2024-04-09T14:43:19Z","title":"DaF-BEVSeg: Distortion-aware Fisheye Camera based Bird's Eye View\n Segmentation with Occlusion Reasoning","summary":" Semantic segmentation is an effective way to perform scene understanding.\nRecently, segmentation in 3D Bird's Eye View (BEV) space has become popular as\nits directly used by drive policy. However, there is limited work on BEV\nsegmentation for surround-view fisheye cameras, commonly used in commercial\nvehicles. As this task has no real-world public dataset and existing synthetic\ndatasets do not handle amodal regions due to occlusion, we create a synthetic\ndataset using the Cognata simulator comprising diverse road types, weather, and\nlighting conditions. We generalize the BEV segmentation to work with any camera\nmodel; this is useful for mixing diverse cameras. We implement a baseline by\napplying cylindrical rectification on the fisheye images and using a standard\nLSS-based BEV segmentation model. We demonstrate that we can achieve better\nperformance without undistortion, which has the adverse effects of increased\nruntime due to pre-processing, reduced field-of-view, and resampling artifacts.\nFurther, we introduce a distortion-aware learnable BEV pooling strategy that is\nmore effective for the fisheye cameras. We extend the model with an occlusion\nreasoning module, which is critical for estimating in BEV space. Qualitative\nperformance of DaF-BEVSeg is showcased in the video at\nhttps://streamable.com/ge4v51.\n","authors":["Senthil Yogamani","David Unger","Venkatraman Narayanan","Varun Ravi Kumar"],"pdf_url":"https://arxiv.org/pdf/2404.06352v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06351v1","updated":"2024-04-09T14:42:31Z","published":"2024-04-09T14:42:31Z","title":"HPNet: Dynamic Trajectory Forecasting with Historical Prediction\n Attention","summary":" Predicting the trajectories of road agents is essential for autonomous\ndriving systems. The recent mainstream methods follow a static paradigm, which\npredicts the future trajectory by using a fixed duration of historical frames.\nThese methods make the predictions independently even at adjacent time steps,\nwhich leads to potential instability and temporal inconsistency. As successive\ntime steps have largely overlapping historical frames, their forecasting should\nhave intrinsic correlation, such as overlapping predicted trajectories should\nbe consistent, or be different but share the same motion goal depending on the\nroad situation. Motivated by this, in this work, we introduce HPNet, a novel\ndynamic trajectory forecasting method. 
Aiming for stable and accurate\ntrajectory forecasting, our method leverages not only historical frames\nincluding maps and agent states, but also historical predictions. Specifically,\nwe newly design a Historical Prediction Attention module to automatically\nencode the dynamic relationship between successive predictions. Besides, it\nalso extends the attention range beyond the currently visible window\nbenefitting from the use of historical predictions. The proposed Historical\nPrediction Attention together with the Agent Attention and Mode Attention is\nfurther formulated as the Triple Factorized Attention module, serving as the\ncore design of HPNet.Experiments on the Argoverse and INTERACTION datasets show\nthat HPNet achieves state-of-the-art performance, and generates accurate and\nstable future trajectories. Our code are available at\nhttps://github.com/XiaolongTang23/HPNet.\n","authors":["Xiaolong Tang","Meina Kan","Shiguang Shan","Zhilong Ji","Jinfeng Bai","Xilin Chen"],"pdf_url":"https://arxiv.org/pdf/2404.06351v1.pdf","comment":"accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2404.06350v1","updated":"2024-04-09T14:40:54Z","published":"2024-04-09T14:40:54Z","title":"Rolling Shutter Correction with Intermediate Distortion Flow Estimation","summary":" This paper proposes to correct the rolling shutter (RS) distorted images by\nestimating the distortion flow from the global shutter (GS) to RS directly.\nExisting methods usually perform correction using the undistortion flow from\nthe RS to GS. They initially predict the flow from consecutive RS frames,\nsubsequently rescaling it as the displacement fields from the RS frame to the\nunderlying GS image using time-dependent scaling factors. Following this,\nRS-aware forward warping is employed to convert the RS image into its GS\ncounterpart. Nevertheless, this strategy is prone to two shortcomings. First,\nthe undistortion flow estimation is rendered inaccurate by merely linear\nscaling the flow, due to the complex non-linear motion nature. Second, RS-aware\nforward warping often results in unavoidable artifacts. To address these\nlimitations, we introduce a new framework that directly estimates the\ndistortion flow and rectifies the RS image with the backward warping operation.\nMore specifically, we first propose a global correlation-based flow attention\nmechanism to estimate the initial distortion flow and GS feature jointly, which\nare then refined by the following coarse-to-fine decoder layers. Additionally,\na multi-distortion flow prediction strategy is integrated to mitigate the issue\nof inaccurate flow estimation further. Experimental results validate the\neffectiveness of the proposed method, which outperforms state-of-the-art\napproaches on various benchmarks while maintaining high efficiency. The project\nis available at \\url{https://github.com/ljzycmd/DFRSC}.\n","authors":["Mingdeng Cao","Sidi Yang","Yujiu Yang","Yinqiang Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.06350v1.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2403.20035v2","updated":"2024-04-09T14:29:10Z","published":"2024-03-29T08:03:42Z","title":"UltraLight VM-UNet: Parallel Vision Mamba Significantly Reduces\n Parameters for Skin Lesion Segmentation","summary":" Traditionally for improving the segmentation performance of models, most\napproaches prefer to use adding more complex modules. 
And this is not suitable\nfor the medical field, especially for mobile medical devices, where\ncomputationally loaded models are not suitable for real clinical environments\ndue to computational resource constraints. Recently, state-space models (SSMs),\nrepresented by Mamba, have become a strong competitor to traditional CNNs and\nTransformers. In this paper, we deeply explore the key elements of parameter\ninfluence in Mamba and propose an UltraLight Vision Mamba UNet (UltraLight\nVM-UNet) based on this. Specifically, we propose a method for processing\nfeatures in parallel Vision Mamba, named PVM Layer, which achieves excellent\nperformance with the lowest computational load while keeping the overall number\nof processing channels constant. We conducted comparisons and ablation\nexperiments with several state-of-the-art lightweight models on three skin\nlesion public datasets and demonstrated that the UltraLight VM-UNet exhibits\nthe same strong performance competitiveness with parameters of only 0.049M and\nGFLOPs of 0.060. In addition, this study deeply explores the key elements of\nparameter influence in Mamba, which will lay a theoretical foundation for Mamba\nto possibly become a new mainstream module for lightweighting in the future.\nThe code is available from https://github.com/wurenkai/UltraLight-VM-UNet .\n","authors":["Renkai Wu","Yinghao Liu","Pengchen Liang","Qing Chang"],"pdf_url":"https://arxiv.org/pdf/2403.20035v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06337v1","updated":"2024-04-09T14:22:50Z","published":"2024-04-09T14:22:50Z","title":"Matching 2D Images in 3D: Metric Relative Pose from Metric\n Correspondences","summary":" Given two images, we can estimate the relative camera pose between them by\nestablishing image-to-image correspondences. Usually, correspondences are\n2D-to-2D and the pose we estimate is defined only up to scale. Some\napplications, aiming at instant augmented reality anywhere, require\nscale-metric pose estimates, and hence, they rely on external depth estimators\nto recover the scale. We present MicKey, a keypoint matching pipeline that is\nable to predict metric correspondences in 3D camera space. By learning to match\n3D coordinates across images, we are able to infer the metric relative pose\nwithout depth measurements. Depth measurements are also not required for\ntraining, nor are scene reconstructions or image overlap information. MicKey is\nsupervised only by pairs of images and their relative poses. MicKey achieves\nstate-of-the-art performance on the Map-Free Relocalisation benchmark while\nrequiring less supervision than competing approaches.\n","authors":["Axel Barroso-Laguna","Sowmya Munukutla","Victor Adrian Prisacariu","Eric Brachmann"],"pdf_url":"https://arxiv.org/pdf/2404.06337v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.04818v4","updated":"2024-04-09T14:15:32Z","published":"2023-11-08T16:42:14Z","title":"Cross-Silo Federated Learning Across Divergent Domains with Iterative\n Parameter Alignment","summary":" Learning from the collective knowledge of data dispersed across private\nsources can provide neural networks with enhanced generalization capabilities.\nFederated learning, a method for collaboratively training a machine learning\nmodel across remote clients, achieves this by combining client models via the\norchestration of a central server. 
However, current approaches face two\ncritical limitations: i) they struggle to converge when client domains are\nsufficiently different, and ii) current aggregation techniques produce an\nidentical global model for each client. In this work, we address these issues\nby reformulating the typical federated learning setup: rather than learning a\nsingle global model, we learn N models each optimized for a common objective.\nTo achieve this, we apply a weighted distance minimization to model parameters\nshared in a peer-to-peer topology. The resulting framework, Iterative Parameter\nAlignment, applies naturally to the cross-silo setting, and has the following\nproperties: (i) a unique solution for each participant, with the option to\nglobally converge each model in the federation, and (ii) an optional\nearly-stopping mechanism to elicit fairness among peers in collaborative\nlearning settings. These characteristics jointly provide a flexible new\nframework for iteratively learning from peer models trained on disparate\ndatasets. We find that the technique achieves competitive results on a variety\nof data partitions compared to state-of-the-art approaches. Further, we show\nthat the method is robust to divergent domains (i.e. disjoint classes across\npeers) where existing approaches struggle.\n","authors":["Matt Gorbett","Hossein Shirazi","Indrakshi Ray"],"pdf_url":"https://arxiv.org/pdf/2311.04818v4.pdf","comment":"Published at IEEE Big Data 2023"},{"id":"http://arxiv.org/abs/2402.18078v2","updated":"2024-04-09T14:12:02Z","published":"2024-02-28T06:07:07Z","title":"Coarse-to-Fine Latent Diffusion for Pose-Guided Person Image Synthesis","summary":" Diffusion model is a promising approach to image generation and has been\nemployed for Pose-Guided Person Image Synthesis (PGPIS) with competitive\nperformance. While existing methods simply align the person appearance to the\ntarget pose, they are prone to overfitting due to the lack of a high-level\nsemantic understanding on the source person image. In this paper, we propose a\nnovel Coarse-to-Fine Latent Diffusion (CFLD) method for PGPIS. In the absence\nof image-caption pairs and textual prompts, we develop a novel training\nparadigm purely based on images to control the generation process of a\npre-trained text-to-image diffusion model. A perception-refined decoder is\ndesigned to progressively refine a set of learnable queries and extract\nsemantic understanding of person images as a coarse-grained prompt. This allows\nfor the decoupling of fine-grained appearance and pose information controls at\ndifferent stages, and thus circumventing the potential overfitting problem. To\ngenerate more realistic texture details, a hybrid-granularity attention module\nis proposed to encode multi-scale fine-grained appearance features as bias\nterms to augment the coarse-grained prompt. Both quantitative and qualitative\nexperimental results on the DeepFashion benchmark demonstrate the superiority\nof our method over the state of the arts for PGPIS. 
Code is available at\nhttps://github.com/YanzuoLu/CFLD.\n","authors":["Yanzuo Lu","Manlin Zhang","Andy J Ma","Xiaohua Xie","Jian-Huang Lai"],"pdf_url":"https://arxiv.org/pdf/2402.18078v2.pdf","comment":"Accepted by CVPR 2024 (Highlight)"},{"id":"http://arxiv.org/abs/2401.01558v2","updated":"2024-04-09T13:59:18Z","published":"2024-01-03T06:18:30Z","title":"One-Step Late Fusion Multi-view Clustering with Compressed Subspace","summary":" Late fusion multi-view clustering (LFMVC) has become a rapidly growing class\nof methods in the multi-view clustering (MVC) field, owing to its excellent\ncomputational speed and clustering performance. One bottleneck faced by\nexisting late fusion methods is that they are usually aligned to the average\nkernel function, which makes the clustering performance highly dependent on the\nquality of datasets. Another problem is that they require subsequent k-means\nclustering after obtaining the consensus partition matrix to get the final\ndiscrete labels, and the resulting separation of the label learning and cluster\nstructure optimization processes limits the integrity of these models. To\naddress the above issues, we propose an integrated framework named One-Step\nLate Fusion Multi-view Clustering with Compressed Subspace (OS-LFMVC-CS).\nSpecifically, we use the consensus subspace to align the partition matrix while\noptimizing the partition fusion, and utilize the fused partition matrix to\nguide the learning of discrete labels. A six-step iterative optimization\napproach with verified convergence is proposed. Sufficient experiments on\nmultiple datasets validate the effectiveness and efficiency of our proposed\nmethod.\n","authors":["Qiyuan Ou","Pei Zhang","Sihang Zhou","En Zhu"],"pdf_url":"https://arxiv.org/pdf/2401.01558v2.pdf","comment":"Accepted by ICASSP2024"},{"id":"http://arxiv.org/abs/2403.17881v2","updated":"2024-04-09T13:56:06Z","published":"2024-03-26T17:12:34Z","title":"Deepfake Generation and Detection: A Benchmark and Survey","summary":" In addition to the advancements in deepfake generation, corresponding\ndetection technologies need to continuously evolve to regulate the potential\nmisuse of deepfakes, such as for privacy invasion and phishing attacks. This\nsurvey comprehensively reviews the latest developments in deepfake generation\nand detection, summarizing and analyzing the current state of the art in this\nrapidly evolving field. We first unify task definitions, comprehensively\nintroduce datasets and metrics, and discuss the development of generation and\ndetection technology frameworks. Then, we discuss the development of several\nrelated sub-fields and focus on researching four mainstream deepfake fields:\npopular face swap, face reenactment, talking face generation, and facial\nattribute editing, as well as foreign detection. Subsequently, we\ncomprehensively benchmark representative methods on popular datasets for each\nfield, fully evaluating the latest and influential works published in top\nconferences/journals. Finally, we analyze the challenges and future research\ndirections of the discussed fields. 
We closely follow the latest developments\nin https://github.com/flyingby/Awesome-Deepfake-Generation-and-Detection.\n","authors":["Gan Pei","Jiangning Zhang","Menghan Hu","Zhenyu Zhang","Chengjie Wang","Yunsheng Wu","Guangtao Zhai","Jian Yang","Chunhua Shen","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2403.17881v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05334v3","updated":"2024-04-09T13:54:48Z","published":"2023-09-11T09:32:45Z","title":"MultIOD: Rehearsal-free Multihead Incremental Object Detector","summary":" Class-Incremental learning (CIL) refers to the ability of artificial agents\nto integrate new classes as they appear in a stream. It is particularly\ninteresting in evolving environments where agents have limited access to memory\nand computational resources. The main challenge of incremental learning is\ncatastrophic forgetting, the inability of neural networks to retain past\nknowledge when learning a new one. Unfortunately, most existing\nclass-incremental methods for object detection are applied to two-stage\nalgorithms such as Faster-RCNN, and rely on rehearsal memory to retain past\nknowledge. We argue that those are not suitable in resource-limited\nenvironments, and more effort should be dedicated to anchor-free and\nrehearsal-free object detection. In this paper, we propose MultIOD, a\nclass-incremental object detector based on CenterNet. Our contributions are:\n(1) we propose a multihead feature pyramid and multihead detection architecture\nto efficiently separate class representations, (2) we employ transfer learning\nbetween classes learned initially and those learned incrementally to tackle\ncatastrophic forgetting, and (3) we use a class-wise non-max-suppression as a\npost-processing technique to remove redundant boxes. Results show that our\nmethod outperforms state-of-the-art methods on two Pascal VOC datasets, while\nonly saving the model in its current state, contrary to other\ndistillation-based counterparts.\n","authors":["Eden Belouadah","Arnaud Dapogny","Kevin Bailly"],"pdf_url":"https://arxiv.org/pdf/2309.05334v3.pdf","comment":"Accepted at the archival track of the Workshop on Continual Learning\n in Computer Vision (CVPR 2024)"},{"id":"http://arxiv.org/abs/2401.17053v3","updated":"2024-04-09T13:47:18Z","published":"2024-01-30T14:34:19Z","title":"BlockFusion: Expandable 3D Scene Generation using Latent Tri-plane\n Extrapolation","summary":" We present BlockFusion, a diffusion-based model that generates 3D scenes as\nunit blocks and seamlessly incorporates new blocks to extend the scene.\nBlockFusion is trained using datasets of 3D blocks that are randomly cropped\nfrom complete 3D scene meshes. Through per-block fitting, all training blocks\nare converted into the hybrid neural fields: with a tri-plane containing the\ngeometry features, followed by a Multi-layer Perceptron (MLP) for decoding the\nsigned distance values. A variational auto-encoder is employed to compress the\ntri-planes into the latent tri-plane space, on which the denoising diffusion\nprocess is performed. Diffusion applied to the latent representations allows\nfor high-quality and diverse 3D scene generation. To expand a scene during\ngeneration, one needs only to append empty blocks to overlap with the current\nscene and extrapolate existing latent tri-planes to populate new blocks. The\nextrapolation is done by conditioning the generation process with the feature\nsamples from the overlapping tri-planes during the denoising iterations. 
Latent\ntri-plane extrapolation produces semantically and geometrically meaningful\ntransitions that harmoniously blend with the existing scene. A 2D layout\nconditioning mechanism is used to control the placement and arrangement of\nscene elements. Experimental results indicate that BlockFusion is capable of\ngenerating diverse, geometrically consistent and unbounded large 3D scenes with\nunprecedented high-quality shapes in both indoor and outdoor scenarios.\n","authors":["Zhennan Wu","Yang Li","Han Yan","Taizhang Shang","Weixuan Sun","Senbo Wang","Ruikai Cui","Weizhe Liu","Hiroyuki Sato","Hongdong Li","Pan Ji"],"pdf_url":"https://arxiv.org/pdf/2401.17053v3.pdf","comment":"Video: https://www.youtube.com/watch?v=PxIBtd6G0mA"},{"id":"http://arxiv.org/abs/2403.03309v4","updated":"2024-04-09T13:44:54Z","published":"2024-03-05T20:21:49Z","title":"Learning Zero-Shot Material States Segmentation, by Implanting Natural\n Image Patterns in Synthetic Data","summary":" Visual understanding and segmentation of materials and their states is\nfundamental to understanding the physical world. The myriad textures, shapes,\nand often blurry boundaries formed by materials make this task particularly\nhard to generalize. Whether it's identifying wet regions of a surface, minerals\nin rocks, infected regions in plants, or pollution in water, each material\nstate has its own unique form. For neural nets to learn general class-agnostic\nmaterial segmentation, it is necessary to first collect and annotate data that\ncaptures this complexity. Collecting and manually annotating real-world images\nis limited by the cost and precision of manual labor. In contrast, synthetic\nCGI data is highly accurate and almost cost-free, but fails to replicate the\nvast diversity of the material world. This work offers a method to bridge this\ncrucial gap by implanting patterns extracted from real-world images in\nsynthetic data. Hence, patterns automatically collected from natural images are\nused to map materials into synthetic scenes. This unsupervised approach allows\nthe generated data to capture the vast complexity of the real world while\nmaintaining the precision and scale of synthetic data. We also present the\nfirst general benchmark for zero-shot material state segmentation. The\nbenchmark contains a wide range of real-world images of material states, like\nfood, rocks, construction, plants, liquids, and many others, each in various\nstates (wet/dry/stained/cooked/burned/worn/rusted/sediment/foam, etc.). The\nannotation includes both partial similarity between regions with similar but\nnot identical materials, and hard segmentation of only points in the exact same\nmaterial state. We show that net trains on MatSeg significantly outperform\nexisting state-of-the-art methods on this task. The dataset, code, and trained\nmodel are available\n","authors":["Sagi Eppel","Jolina Li","Manuel Drehwald","Alan Aspuru-Guzik"],"pdf_url":"https://arxiv.org/pdf/2403.03309v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18171v5","updated":"2024-04-09T13:42:07Z","published":"2023-05-29T16:02:09Z","title":"Improved Probabilistic Image-Text Representations","summary":" Image-Text Matching (ITM) task, a fundamental vision-language (VL) task,\nsuffers from the inherent ambiguity arising from multiplicity and imperfect\nannotations. Deterministic functions are not sufficiently powerful to capture\nambiguity, prompting the exploration of probabilistic embeddings to tackle the\nchallenge. 
However, the existing probabilistic ITM approach encounters two key\nshortcomings; the burden of heavy computations due to the Monte Carlo\napproximation, and the loss saturation issue in the face of abundant false\nnegatives. To overcome the issues, this paper presents an improved\nProbabilistic Cross-Modal Embeddings (named PCME++) by introducing a new\nprobabilistic distance with a closed-form solution. In addition, two\noptimization techniques are proposed to enhance PCME++ further: first, the\nincorporation of pseudo-positives to prevent the negative effect under massive\nfalse negatives; second, mixed sample data augmentation for probabilistic\nmatching. Experimental results on MS-COCO Caption and two extended benchmarks,\nCxC and ECCV Caption, demonstrate the effectiveness of PCME++ compared to\nstate-of-the-art ITM methods. The robustness of PCME++ is also evaluated under\nnoisy image-text correspondences. In addition, the potential applicability of\nPCME++ in automatic prompt-filtering for zero-shot classification is shown. The\ncode is available at https://github.com/naver-ai/pcmepp\n","authors":["Sanghyuk Chun"],"pdf_url":"https://arxiv.org/pdf/2305.18171v5.pdf","comment":"ICLR 2024 camera-ready; Code: https://github.com/naver-ai/pcmepp.\n Project page: https://naver-ai.github.io/pcmepp/. 30 pages, 2.2 MB"},{"id":"http://arxiv.org/abs/2404.06309v1","updated":"2024-04-09T13:39:37Z","published":"2024-04-09T13:39:37Z","title":"Audio-Visual Generalized Zero-Shot Learning using Pre-Trained Large\n Multi-Modal Models","summary":" Audio-visual zero-shot learning methods commonly build on features extracted\nfrom pre-trained models, e.g. video or audio classification models. However,\nexisting benchmarks predate the popularization of large multi-modal models,\nsuch as CLIP and CLAP. In this work, we explore such large pre-trained models\nto obtain features, i.e. CLIP for visual features, and CLAP for audio features.\nFurthermore, the CLIP and CLAP text encoders provide class label embeddings\nwhich are combined to boost the performance of the system. We propose a simple\nyet effective model that only relies on feed-forward neural networks,\nexploiting the strong generalization capabilities of the new audio, visual and\ntextual features. Our framework achieves state-of-the-art performance on\nVGGSound-GZSL, UCF-GZSL, and ActivityNet-GZSL with our new features. Code and\ndata available at: https://github.com/dkurzend/ClipClap-GZSL.\n","authors":["David Kurzendörfer","Otniel-Bogdan Mercea","A. Sophia Koepke","Zeynep Akata"],"pdf_url":"https://arxiv.org/pdf/2404.06309v1.pdf","comment":"CVPRw 2024 (L3D-IVU)"},{"id":"http://arxiv.org/abs/2309.14265v2","updated":"2024-04-09T13:33:30Z","published":"2023-09-25T16:23:49Z","title":"Industrial Application of 6D Pose Estimation for Robotic Manipulation in\n Automotive Internal Logistics","summary":" Despite the advances in robotics a large proportion of the of parts handling\ntasks in the automotive industry's internal logistics are not automated but\nstill performed by humans. A key component to competitively automate these\nprocesses is a 6D pose estimation that can handle a large number of different\nparts, is adaptable to new parts with little manual effort, and is sufficiently\naccurate and robust with respect to industry requirements. In this context, the\nquestion arises as to the current status quo with respect to these measures. 
To\naddress this we built a representative 6D pose estimation pipeline with\nstate-of-the-art components from economically scalable real to synthetic data\ngeneration to pose estimators and evaluated it on automotive parts with regards\nto a realistic sequencing process. We found that using the data generation\napproaches, the performance of the trained 6D pose estimators are promising,\nbut do not meet industry requirements. We reveal that the reason for this is\nthe inability of the estimators to provide reliable uncertainties for their\nposes, rather than the ability of to provide sufficiently accurate poses. In\nthis context we further analyzed how RGB- and RGB-D-based approaches compare\nagainst this background and show that they are differently vulnerable to the\ndomain gap induced by synthetic data.\n","authors":["Philipp Quentin","Dino Knoll","Daniel Goehring"],"pdf_url":"https://arxiv.org/pdf/2309.14265v2.pdf","comment":"Accepted for publication at IEEE International Conference on\n Automation Science and Engineering (CASE 2023)"},{"id":"http://arxiv.org/abs/2212.04227v2","updated":"2024-04-09T13:30:15Z","published":"2022-12-08T12:20:35Z","title":"Self-training via Metric Learning for Source-Free Domain Adaptation of\n Semantic Segmentation","summary":" Unsupervised source-free domain adaptation methods aim to train a model for\nthe target domain utilizing a pretrained source-domain model and unlabeled\ntarget-domain data, particularly when accessibility to source data is\nrestricted due to intellectual property or privacy concerns. Traditional\nmethods usually use self-training with pseudo-labeling, which is often\nsubjected to thresholding based on prediction confidence. However, such\nthresholding limits the effectiveness of self-training due to insufficient\nsupervision. This issue becomes more severe in a source-free setting, where\nsupervision comes solely from the predictions of the pre-trained source model.\nIn this study, we propose a novel approach by incorporating a mean-teacher\nmodel, wherein the student network is trained using all predictions from the\nteacher network. Instead of employing thresholding on predictions, we introduce\na method to weight the gradients calculated from pseudo-labels based on the\nreliability of the teacher's predictions. To assess reliability, we introduce a\nnovel approach using proxy-based metric learning. Our method is evaluated in\nsynthetic-to-real and cross-city scenarios, demonstrating superior performance\ncompared to existing state-of-the-art methods.\n","authors":["Ibrahim Batuhan Akkaya","Ugur Halici"],"pdf_url":"https://arxiv.org/pdf/2212.04227v2.pdf","comment":"This paper is under consideration at Computer Vision and Image\n Understanding"},{"id":"http://arxiv.org/abs/2404.06294v1","updated":"2024-04-09T13:19:43Z","published":"2024-04-09T13:19:43Z","title":"Fortifying Fully Convolutional Generative Adversarial Networks for Image\n Super-Resolution Using Divergence Measures","summary":" Super-Resolution (SR) is a time-hallowed image processing problem that aims\nto improve the quality of a Low-Resolution (LR) sample up to the standard of\nits High-Resolution (HR) counterpart. We aim to address this by introducing\nSuper-Resolution Generator (SuRGe), a fully-convolutional Generative\nAdversarial Network (GAN)-based architecture for SR. 
We show that distinct\nconvolutional features obtained at increasing depths of a GAN generator can be\noptimally combined by a set of learnable convex weights to improve the quality\nof generated SR samples. In the process, we employ the Jensen-Shannon and the\nGromov-Wasserstein losses respectively between the SR-HR and LR-SR pairs of\ndistributions to further aid the generator of SuRGe to better exploit the\navailable information in an attempt to improve SR. Moreover, we train the\ndiscriminator of SuRGe with the Wasserstein loss with gradient penalty, to\nprimarily prevent mode collapse. The proposed SuRGe, as an end-to-end GAN\nworkflow tailor-made for super-resolution, offers improved performance while\nmaintaining low inference time. The efficacy of SuRGe is substantiated by its\nsuperior performance compared to 18 state-of-the-art contenders on 10 benchmark\ndatasets.\n","authors":["Arkaprabha Basu","Kushal Bose","Sankha Subhra Mullick","Anish Chakrabarty","Swagatam Das"],"pdf_url":"https://arxiv.org/pdf/2404.06294v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02730v3","updated":"2024-04-09T13:18:22Z","published":"2023-07-06T02:30:56Z","title":"Fine-grained Action Analysis: A Multi-modality and Multi-task Dataset of\n Figure Skating","summary":" The fine-grained action analysis of the existing action datasets is\nchallenged by insufficient action categories, low fine granularities, limited\nmodalities, and tasks. In this paper, we propose a Multi-modality and\nMulti-task dataset of Figure Skating (MMFS) which was collected from the World\nFigure Skating Championships. MMFS, which possesses action recognition and\naction quality assessment, captures RGB, skeleton, and is collected the score\nof actions from 11671 clips with 256 categories including spatial and temporal\nlabels. The key contributions of our dataset fall into three aspects as\nfollows. (1) Independently spatial and temporal categories are first proposed\nto further explore fine-grained action recognition and quality assessment. (2)\nMMFS first introduces the skeleton modality for complex fine-grained action\nquality assessment. (3) Our multi-modality and multi-task dataset encourage\nmore action analysis models. To benchmark our dataset, we adopt RGB-based and\nskeleton-based baseline methods for action recognition and action quality\nassessment.\n","authors":["Sheng-Lan Liu","Yu-Ning Ding","Gang Yan","Si-Fan Zhang","Jin-Rong Zhang","Wen-Yue Chen","Xue-Hai Xu"],"pdf_url":"https://arxiv.org/pdf/2307.02730v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06287v1","updated":"2024-04-09T13:13:24Z","published":"2024-04-09T13:13:24Z","title":"Counterfactual Reasoning for Multi-Label Image Classification via\n Patching-Based Training","summary":" The key to multi-label image classification (MLC) is to improve model\nperformance by leveraging label correlations. Unfortunately, it has been shown\nthat overemphasizing co-occurrence relationships can cause the overfitting\nissue of the model, ultimately leading to performance degradation. In this\npaper, we provide a causal inference framework to show that the correlative\nfeatures caused by the target object and its co-occurring objects can be\nregarded as a mediator, which has both positive and negative impacts on model\npredictions. 
On the positive side, the mediator enhances the recognition\nperformance of the model by capturing co-occurrence relationships; on the\nnegative side, it has the harmful causal effect that causes the model to make\nan incorrect prediction for the target object, even when only co-occurring\nobjects are present in an image. To address this problem, we propose a\ncounterfactual reasoning method to measure the total direct effect, achieved by\nenhancing the direct effect caused only by the target object. Due to the\nunknown location of the target object, we propose patching-based training and\ninference to accomplish this goal, which divides an image into multiple patches\nand identifies the pivot patch that contains the target object. Experimental\nresults on multiple benchmark datasets with diverse configurations validate\nthat the proposed method can achieve state-of-the-art performance.\n","authors":["Ming-Kun Xie","Jia-Hao Xiao","Pei Peng","Gang Niu","Masashi Sugiyama","Sheng-Jun Huang"],"pdf_url":"https://arxiv.org/pdf/2404.06287v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06279v1","updated":"2024-04-09T13:02:33Z","published":"2024-04-09T13:02:33Z","title":"NoiseNCA: Noisy Seed Improves Spatio-Temporal Continuity of Neural\n Cellular Automata","summary":" Neural Cellular Automata (NCA) is a class of Cellular Automata where the\nupdate rule is parameterized by a neural network that can be trained using\ngradient descent. In this paper, we focus on NCA models used for texture\nsynthesis, where the update rule is inspired by partial differential equations\n(PDEs) describing reaction-diffusion systems. To train the NCA model, the\nspatio-termporal domain is discretized, and Euler integration is used to\nnumerically simulate the PDE. However, whether a trained NCA truly learns the\ncontinuous dynamic described by the corresponding PDE or merely overfits the\ndiscretization used in training remains an open question. We study NCA models\nat the limit where space-time discretization approaches continuity. We find\nthat existing NCA models tend to overfit the training discretization,\nespecially in the proximity of the initial condition, also called \"seed\". To\naddress this, we propose a solution that utilizes uniform noise as the initial\ncondition. We demonstrate the effectiveness of our approach in preserving the\nconsistency of NCA dynamics across a wide range of spatio-temporal\ngranularities. Our improved NCA model enables two new test-time interactions by\nallowing continuous control over the speed of pattern formation and the scale\nof the synthesized patterns. We demonstrate this new NCA feature in our\ninteractive online demo. 
Our work reveals that NCA models can learn continuous\ndynamics and opens new venues for NCA research from a dynamical systems'\nperspective.\n","authors":["Ehsan Pajouheshgar","Yitao Xu","Sabine Süsstrunk"],"pdf_url":"https://arxiv.org/pdf/2404.06279v1.pdf","comment":"9 pages, 12 figures"},{"id":"http://arxiv.org/abs/2404.06277v1","updated":"2024-04-09T13:01:26Z","published":"2024-04-09T13:01:26Z","title":"Learning Embeddings with Centroid Triplet Loss for Object Identification\n in Robotic Grasping","summary":" Foundation models are a strong trend in deep learning and computer vision.\nThese models serve as a base for applications as they require minor or no\nfurther fine-tuning by developers to integrate into their applications.\nFoundation models for zero-shot object segmentation such as Segment Anything\n(SAM) output segmentation masks from images without any further object\ninformation. When they are followed in a pipeline by an object identification\nmodel, they can perform object detection without training. Here, we focus on\ntraining such an object identification model. A crucial practical aspect for an\nobject identification model is to be flexible in input size. As object\nidentification is an image retrieval problem, a suitable method should handle\nmulti-query multi-gallery situations without constraining the number of input\nimages (e.g. by having fixed-size aggregation layers). The key solution to\ntrain such a model is the centroid triplet loss (CTL), which aggregates image\nfeatures to their centroids. CTL yields high accuracy, avoids misleading\ntraining signals and keeps the model input size flexible. In our experiments,\nwe establish a new state of the art on the ArmBench object identification task,\nwhich shows general applicability of our model. We furthermore demonstrate an\nintegrated unseen object detection pipeline on the challenging HOPE dataset,\nwhich requires fine-grained detection. There, our pipeline matches and\nsurpasses related methods which have been trained on dataset-specific data.\n","authors":["Anas Gouda","Max Schwarz","Christopher Reining","Sven Behnke","Alice Kirchheim"],"pdf_url":"https://arxiv.org/pdf/2404.06277v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04561v2","updated":"2024-04-09T12:50:16Z","published":"2024-04-06T09:01:19Z","title":"Co-Occ: Coupling Explicit Feature Fusion with Volume Rendering\n Regularization for Multi-Modal 3D Semantic Occupancy Prediction","summary":" 3D semantic occupancy prediction is a pivotal task in the field of autonomous\ndriving. Recent approaches have made great advances in 3D semantic occupancy\npredictions on a single modality. However, multi-modal semantic occupancy\nprediction approaches have encountered difficulties in dealing with the\nmodality heterogeneity, modality misalignment, and insufficient modality\ninteractions that arise during the fusion of different modalities data, which\nmay result in the loss of important geometric and semantic information. This\nletter presents a novel multi-modal, i.e., LiDAR-camera 3D semantic occupancy\nprediction framework, dubbed Co-Occ, which couples explicit LiDAR-camera\nfeature fusion with implicit volume rendering regularization. 
The key insight\nis that volume rendering in the feature space can proficiently bridge the gap\nbetween 3D LiDAR sweeps and 2D images while serving as a physical\nregularization to enhance LiDAR-camera fused volumetric representation.\nSpecifically, we first propose a Geometric- and Semantic-aware Fusion\n(GSFusion) module to explicitly enhance LiDAR features by incorporating\nneighboring camera features through a K-nearest neighbors (KNN) search. Then,\nwe employ volume rendering to project the fused feature back to the image\nplanes for reconstructing color and depth maps. These maps are then supervised\nby input images from the camera and depth estimations derived from LiDAR,\nrespectively. Extensive experiments on the popular nuScenes and SemanticKITTI\nbenchmarks verify the effectiveness of our Co-Occ for 3D semantic occupancy\nprediction. The project page is available at\nhttps://rorisis.github.io/Co-Occ_project-page/.\n","authors":["Jingyi Pan","Zipeng Wang","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.04561v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06273v1","updated":"2024-04-09T12:48:24Z","published":"2024-04-09T12:48:24Z","title":"Robust Confidence Intervals in Stereo Matching using Possibility Theory","summary":" We propose a method for estimating disparity confidence intervals in stereo\nmatching problems. Confidence intervals provide complementary information to\nusual confidence measures. To the best of our knowledge, this is the first\nmethod creating disparity confidence intervals based on the cost volume. This\nmethod relies on possibility distributions to interpret the epistemic\nuncertainty of the cost volume. Our method has the benefit of having a\nwhite-box nature, differing in this respect from current state-of-the-art deep\nneural networks approaches. The accuracy and size of confidence intervals are\nvalidated using the Middlebury stereo datasets as well as a dataset of\nsatellite images. This contribution is freely available on GitHub.\n","authors":["Roman Malinowski","Emmanuelle Sarrazin","Loïc Dumas","Emmanuel Dubois","Sébastien Destercke"],"pdf_url":"https://arxiv.org/pdf/2404.06273v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06270v1","updated":"2024-04-09T12:47:30Z","published":"2024-04-09T12:47:30Z","title":"3D Geometry-aware Deformable Gaussian Splatting for Dynamic View\n Synthesis","summary":" In this paper, we propose a 3D geometry-aware deformable Gaussian Splatting\nmethod for dynamic view synthesis. Existing neural radiance fields (NeRF) based\nsolutions learn the deformation in an implicit manner, which cannot incorporate\n3D scene geometry. Therefore, the learned deformation is not necessarily\ngeometrically coherent, which results in unsatisfactory dynamic view synthesis\nand 3D dynamic reconstruction. Recently, 3D Gaussian Splatting provides a new\nrepresentation of the 3D scene, building upon which the 3D geometry could be\nexploited in learning the complex 3D deformation. Specifically, the scenes are\nrepresented as a collection of 3D Gaussian, where each 3D Gaussian is optimized\nto move and rotate over time to model the deformation. To enforce the 3D scene\ngeometry constraint during deformation, we explicitly extract 3D geometry\nfeatures and integrate them in learning the 3D deformation. In this way, our\nsolution achieves 3D geometry-aware deformation modeling, which enables\nimproved dynamic view synthesis and 3D dynamic reconstruction. 
Extensive\nexperimental results on both synthetic and real datasets prove the superiority\nof our solution, which achieves new state-of-the-art performance.\n The project is available at https://npucvr.github.io/GaGS/\n","authors":["Zhicheng Lu","Xiang Guo","Le Hui","Tianrui Chen","Min Yang","Xiao Tang","Feng Zhu","Yuchao Dai"],"pdf_url":"https://arxiv.org/pdf/2404.06270v1.pdf","comment":"Accepted by CVPR 2024. Project page: https://npucvr.github.io/GaGS/"},{"id":"http://arxiv.org/abs/2404.06265v1","updated":"2024-04-09T12:44:34Z","published":"2024-04-09T12:44:34Z","title":"Spatial-Temporal Multi-level Association for Video Object Segmentation","summary":" Existing semi-supervised video object segmentation methods either focus on\ntemporal feature matching or spatial-temporal feature modeling. However, they\ndo not address the issues of sufficient target interaction and efficient\nparallel processing simultaneously, thereby constraining the learning of\ndynamic, target-aware features. To tackle these limitations, this paper\nproposes a spatial-temporal multi-level association framework, which jointly\nassociates reference frame, test frame, and object features to achieve\nsufficient interaction and parallel target ID association with a\nspatial-temporal memory bank for efficient video object segmentation.\nSpecifically, we construct a spatial-temporal multi-level feature association\nmodule to learn better target-aware features, which formulates feature\nextraction and interaction as the efficient operations of object\nself-attention, reference object enhancement, and test reference correlation.\nIn addition, we propose a spatial-temporal memory to assist feature association\nand temporal ID assignment and correlation. We evaluate the proposed method by\nconducting extensive experiments on numerous video object segmentation\ndatasets, including DAVIS 2016/2017 val, DAVIS 2017 test-dev, and YouTube-VOS\n2018/2019 val. The favorable performance against the state-of-the-art methods\ndemonstrates the effectiveness of our approach. All source code and trained\nmodels will be made publicly available.\n","authors":["Deshui Miao","Xin Li","Zhenyu He","Huchuan Lu","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2404.06265v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07166v2","updated":"2024-04-09T12:40:18Z","published":"2023-10-11T03:29:13Z","title":"Anchor-based Multi-view Subspace Clustering with Hierarchical Feature\n Descent","summary":" Multi-view clustering has attracted growing attention owing to its\ncapabilities of aggregating information from various sources and its promising\nhorizons in public affairs. Up till now, many advanced approaches have been\nproposed in recent literature. However, there are several ongoing difficulties\nto be tackled. One common dilemma occurs while attempting to align the features\nof different views. {Moreover, due to the fact that many existing multi-view\nclustering algorithms stem from spectral clustering, this results to cubic time\ncomplexity w.r.t. the number of dataset. However, we propose Anchor-based\nMulti-view Subspace Clustering with Hierarchical Feature Descent(MVSC-HFD) to\ntackle the discrepancy among views through hierarchical feature descent and\nproject to a common subspace( STAGE 1), which reveals dependency of different\nviews. 
We further reduce the computational complexity to linear time cost\nthrough a unified sampling strategy in the common subspace( STAGE 2), followed\nby anchor-based subspace clustering to learn the bipartite graph collectively(\nSTAGE 3). }Extensive experimental results on public benchmark datasets\ndemonstrate that our proposed model consistently outperforms the\nstate-of-the-art techniques.\n","authors":["Qiyuan Ou","Siwei Wang","Pei Zhang","Sihang Zhou","En Zhu"],"pdf_url":"https://arxiv.org/pdf/2310.07166v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06261v1","updated":"2024-04-09T12:34:28Z","published":"2024-04-09T12:34:28Z","title":"Playing to Vision Foundation Model's Strengths in Stereo Matching","summary":" Stereo matching has become a key technique for 3D environment perception in\nintelligent vehicles. For a considerable time, convolutional neural networks\n(CNNs) have remained the mainstream choice for feature extraction in this\ndomain. Nonetheless, there is a growing consensus that the existing paradigm\nshould evolve towards vision foundation models (VFM), particularly those\ndeveloped based on vision Transformers (ViTs) and pre-trained through\nself-supervision on extensive, unlabeled datasets. While VFMs are adept at\nextracting informative, general-purpose visual features, specifically for dense\nprediction tasks, their performance often lacks in geometric vision tasks. This\nstudy serves as the first exploration of a viable approach for adapting VFMs to\nstereo matching. Our ViT adapter, referred to as ViTAS, is constructed upon\nthree types of modules: spatial differentiation, patch attention fusion, and\ncross-attention. The first module initializes feature pyramids, while the\nlatter two aggregate stereo and multi-scale contextual information into\nfine-grained features, respectively. ViTAStereo, which combines ViTAS with cost\nvolume-based stereo matching back-end processes, achieves the top rank on the\nKITTI Stereo 2012 dataset and outperforms the second-best network StereoBase by\napproximately 7.9% in terms of the percentage of error pixels, with a tolerance\nof 3 pixels. Additional experiments across diverse scenarios further\ndemonstrate its superior generalizability compared to all other\nstate-of-the-art approaches. We believe this new paradigm will pave the way for\nthe next generation of stereo matching networks.\n","authors":["Chuang-Wei Liu","Qijun Chen","Rui Fan"],"pdf_url":"https://arxiv.org/pdf/2404.06261v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06258v1","updated":"2024-04-09T12:32:10Z","published":"2024-04-09T12:32:10Z","title":"Robust feature knowledge distillation for enhanced performance of\n lightweight crack segmentation models","summary":" Vision-based crack detection faces deployment challenges due to the size of\nrobust models and edge device limitations. These can be addressed with\nlightweight models trained with knowledge distillation (KD). However,\nstate-of-the-art (SOTA) KD methods compromise anti-noise robustness. This paper\ndevelops Robust Feature Knowledge Distillation (RFKD), a framework to improve\nrobustness while retaining the precision of light models for crack\nsegmentation. RFKD distils knowledge from a teacher model's logit layers and\nintermediate feature maps while leveraging mixed clean and noisy images to\ntransfer robust patterns to the student model, improving its precision,\ngeneralisation, and anti-noise performance. 
To validate the proposed RFKD, a\nlightweight crack segmentation model, PoolingCrack Tiny (PCT), with only 0.5 M\nparameters, is also designed and used as the student to run the framework. The\nresults show a significant enhancement in noisy images, with RFKD reaching a\n62% enhanced mean Dice score (mDS) compared to SOTA KD methods.\n","authors":["Zhaohui Chen","Elyas Asadi Shamsabadi","Sheng Jiang","Luming Shen","Daniel Dias-da-Costa"],"pdf_url":"https://arxiv.org/pdf/2404.06258v1.pdf","comment":"24 pages, 13 figures"},{"id":"http://arxiv.org/abs/2404.06256v1","updated":"2024-04-09T12:29:16Z","published":"2024-04-09T12:29:16Z","title":"Label-Efficient 3D Object Detection For Road-Side Units","summary":" Occlusion presents a significant challenge for safety-critical applications\nsuch as autonomous driving. Collaborative perception has recently attracted a\nlarge research interest thanks to the ability to enhance the perception of\nautonomous vehicles via deep information fusion with intelligent roadside units\n(RSU), thus minimizing the impact of occlusion. While significant advancement\nhas been made, the data-hungry nature of these methods creates a major hurdle\nfor their real-world deployment, particularly due to the need for annotated RSU\ndata. Manually annotating the vast amount of RSU data required for training is\nprohibitively expensive, given the sheer number of intersections and the effort\ninvolved in annotating point clouds. We address this challenge by devising a\nlabel-efficient object detection method for RSU based on unsupervised object\ndiscovery. Our paper introduces two new modules: one for object discovery based\non a spatial-temporal aggregation of point clouds, and another for refinement.\nFurthermore, we demonstrate that fine-tuning on a small portion of annotated\ndata allows our object discovery models to narrow the performance gap with, or\neven surpass, fully supervised models. Extensive experiments are carried out in\nsimulated and real-world datasets to evaluate our method.\n","authors":["Minh-Quan Dao","Holger Caesar","Julie Stephany Berrio","Mao Shan","Stewart Worrall","Vincent Frémont","Ezio Malis"],"pdf_url":"https://arxiv.org/pdf/2404.06256v1.pdf","comment":"IV 2024"},{"id":"http://arxiv.org/abs/2404.06253v1","updated":"2024-04-09T12:25:06Z","published":"2024-04-09T12:25:06Z","title":"From Barlow Twins to Triplet Training: Differentiating Dementia with\n Limited Data","summary":" Differential diagnosis of dementia is challenging due to overlapping\nsymptoms, with structural magnetic resonance imaging (MRI) being the primary\nmethod for diagnosis. Despite the clinical value of computer-aided differential\ndiagnosis, research has been limited, mainly due to the absence of public\ndatasets that contain diverse types of dementia. This leaves researchers with\nsmall in-house datasets that are insufficient for training deep neural networks\n(DNNs). Self-supervised learning shows promise for utilizing unlabeled MRI\nscans in training, but small batch sizes for volumetric brain scans make its\napplication challenging. To address these issues, we propose Triplet Training\nfor differential diagnosis with limited target data. It consists of three key\nstages: (i) self-supervised pre-training on unlabeled data with Barlow Twins,\n(ii) self-distillation on task-related data, and (iii) fine-tuning on the\ntarget dataset. Our approach significantly outperforms traditional training\nstrategies, achieving a balanced accuracy of 75.6%. 
We further provide insights\ninto the training process by visualizing changes in the latent space after each\nstep. Finally, we validate the robustness of Triplet Training in terms of its\nindividual components in a comprehensive ablation study. Our code is available\nat https://github.com/ai-med/TripletTraining.\n","authors":["Yitong Li","Tom Nuno Wolf","Sebastian Pölsterl","Igor Yakushev","Dennis M. Hedderich","Christian Wachinger"],"pdf_url":"https://arxiv.org/pdf/2404.06253v1.pdf","comment":"Accepted for presentation at MIDL 2024"},{"id":"http://arxiv.org/abs/2404.06251v1","updated":"2024-04-09T12:23:30Z","published":"2024-04-09T12:23:30Z","title":"ColorMNet: A Memory-based Deep Spatial-Temporal Feature Propagation\n Network for Video Colorization","summary":" How to effectively explore spatial-temporal features is important for video\ncolorization. Instead of stacking multiple frames along the temporal dimension\nor recurrently propagating estimated features that will accumulate errors or\ncannot explore information from far-apart frames, we develop a memory-based\nfeature propagation module that can establish reliable connections with\nfeatures from far-apart frames and alleviate the influence of inaccurately\nestimated features. To extract better features from each frame for the\nabove-mentioned feature propagation, we explore the features from\nlarge-pretrained visual models to guide the feature estimation of each frame so\nthat the estimated features can model complex scenarios. In addition, we note\nthat adjacent frames usually contain similar contents. To explore this property\nfor better spatial and temporal feature utilization, we develop a local\nattention module to aggregate the features from adjacent frames in a\nspatial-temporal neighborhood. We formulate our memory-based feature\npropagation module, large-pretrained visual model guided feature estimation\nmodule, and local attention module into an end-to-end trainable network (named\nColorMNet) and show that it performs favorably against state-of-the-art methods\non both the benchmark datasets and real-world scenarios. The source code and\npre-trained models will be available at\n\\url{https://github.com/yyang181/colormnet}.\n","authors":["Yixin Yang","Jiangxin Dong","Jinhui Tang","Jinshan Pan"],"pdf_url":"https://arxiv.org/pdf/2404.06251v1.pdf","comment":"Project website: \\url{https://github.com/yyang181/colormnet}"},{"id":"http://arxiv.org/abs/2404.06247v1","updated":"2024-04-09T12:13:40Z","published":"2024-04-09T12:13:40Z","title":"LRR: Language-Driven Resamplable Continuous Representation against\n Adversarial Tracking Attacks","summary":" Visual object tracking plays a critical role in visual-based autonomous\nsystems, as it aims to estimate the position and size of the object of interest\nwithin a live video. Despite significant progress made in this field,\nstate-of-the-art (SOTA) trackers often fail when faced with adversarial\nperturbations in the incoming frames. This can lead to significant robustness\nand security issues when these trackers are deployed in the real world. To\nachieve high accuracy on both clean and adversarial data, we propose building a\nspatial-temporal continuous representation using the semantic text guidance of\nthe object of interest. This novel continuous representation enables us to\nreconstruct incoming frames to maintain semantic and appearance consistency\nwith the object of interest and its clean counterparts. 
As a result, our\nproposed method successfully defends against different SOTA adversarial\ntracking attacks while maintaining high accuracy on clean data. In particular,\nour method significantly increases tracking accuracy under adversarial attacks\nwith around 90% relative improvement on UAV123, which is even higher than the\naccuracy on clean data.\n","authors":["Jianlang Chen","Xuhong Ren","Qing Guo","Felix Juefei-Xu","Di Lin","Wei Feng","Lei Ma","Jianjun Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.06247v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06246v1","updated":"2024-04-09T12:11:25Z","published":"2024-04-09T12:11:25Z","title":"GHNeRF: Learning Generalizable Human Features with Efficient Neural\n Radiance Fields","summary":" Recent advances in Neural Radiance Fields (NeRF) have demonstrated promising\nresults in 3D scene representations, including 3D human representations.\nHowever, these representations often lack crucial information on the underlying\nhuman pose and structure, which is crucial for AR/VR applications and games. In\nthis paper, we introduce a novel approach, termed GHNeRF, designed to address\nthese limitations by learning 2D/3D joint locations of human subjects with NeRF\nrepresentation. GHNeRF uses a pre-trained 2D encoder streamlined to extract\nessential human features from 2D images, which are then incorporated into the\nNeRF framework in order to encode human biomechanic features. This allows our\nnetwork to simultaneously learn biomechanic features, such as joint locations,\nalong with human geometry and texture. To assess the effectiveness of our\nmethod, we conduct a comprehensive comparison with state-of-the-art human NeRF\ntechniques and joint estimation algorithms. Our results show that GHNeRF can\nachieve state-of-the-art results in near real-time.\n","authors":["Arnab Dey","Di Yang","Rohith Agaram","Antitza Dantcheva","Andrew I. Comport","Srinath Sridhar","Jean Martinet"],"pdf_url":"https://arxiv.org/pdf/2404.06246v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06244v1","updated":"2024-04-09T12:10:54Z","published":"2024-04-09T12:10:54Z","title":"Anchor-based Robust Finetuning of Vision-Language Models","summary":" We aim at finetuning a vision-language model without hurting its\nout-of-distribution (OOD) generalization. We address two types of OOD\ngeneralization, i.e., i) domain shift such as natural to sketch images, and ii)\nzero-shot capability to recognize the category that was not contained in the\nfinetune data. Arguably, the diminished OOD generalization after finetuning\nstems from the excessively simplified finetuning target, which only provides\nthe class information, such as ``a photo of a [CLASS]''. This is distinct from\nthe process in that CLIP was pretrained, where there is abundant text\nsupervision with rich semantic information. Therefore, we propose to compensate\nfor the finetune process using auxiliary supervision with rich semantic\ninformation, which acts as anchors to preserve the OOD generalization.\nSpecifically, two types of anchors are elaborated in our method, including i)\ntext-compensated anchor which uses the images from the finetune set but\nenriches the text supervision from a pretrained captioner, ii) image-text-pair\nanchor which is retrieved from the dataset similar to pretraining data of CLIP\naccording to the downstream task, associating with the original CLIP text with\nrich semantics. 
Those anchors are utilized as auxiliary semantic information to\nmaintain the original feature space of CLIP, thereby preserving the OOD\ngeneralization capabilities. Comprehensive experiments demonstrate that our\nmethod achieves in-distribution performance akin to conventional finetuning\nwhile attaining new state-of-the-art results on domain shift and zero-shot\nlearning benchmarks.\n","authors":["Jinwei Han","Zhiwen Lin","Zhongyisun Sun","Yingguo Gao","Ke Yan","Shouhong Ding","Yuan Gao","Gui-Song Xia"],"pdf_url":"https://arxiv.org/pdf/2404.06244v1.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2404.06243v1","updated":"2024-04-09T12:09:56Z","published":"2024-04-09T12:09:56Z","title":"ActNetFormer: Transformer-ResNet Hybrid Method for Semi-Supervised\n Action Recognition in Videos","summary":" Human action or activity recognition in videos is a fundamental task in\ncomputer vision with applications in surveillance and monitoring, self-driving\ncars, sports analytics, human-robot interaction and many more. Traditional\nsupervised methods require large annotated datasets for training, which are\nexpensive and time-consuming to acquire. This work proposes a novel approach\nusing Cross-Architecture Pseudo-Labeling with contrastive learning for\nsemi-supervised action recognition. Our framework leverages both labeled and\nunlabelled data to robustly learn action representations in videos, combining\npseudo-labeling with contrastive learning for effective learning from both\ntypes of samples. We introduce a novel cross-architecture approach where 3D\nConvolutional Neural Networks (3D CNNs) and video transformers (VIT) are\nutilised to capture different aspects of action representations; hence we call\nit ActNetFormer. The 3D CNNs excel at capturing spatial features and local\ndependencies in the temporal domain, while VIT excels at capturing long-range\ndependencies across frames. By integrating these complementary architectures\nwithin the ActNetFormer framework, our approach can effectively capture both\nlocal and global contextual information of an action. This comprehensive\nrepresentation learning enables the model to achieve better performance in\nsemi-supervised action recognition tasks by leveraging the strengths of each of\nthese architectures. Experimental results on standard action recognition\ndatasets demonstrate that our approach performs better than the existing\nmethods, achieving state-of-the-art performance with only a fraction of labeled\ndata. The official website of this work is available at:\nhttps://github.com/rana2149/ActNetFormer.\n","authors":["Sharana Dharshikgan Suresh Dass","Hrishav Bakul Barua","Ganesh Krishnasamy","Raveendran Paramesran","Raphael C. -W. Phan"],"pdf_url":"https://arxiv.org/pdf/2404.06243v1.pdf","comment":"Submitted for peer review"},{"id":"http://arxiv.org/abs/2404.06240v1","updated":"2024-04-09T12:06:21Z","published":"2024-04-09T12:06:21Z","title":"Hyperparameter-Free Medical Image Synthesis for Sharing Data and\n Improving Site-Specific Segmentation","summary":" Sharing synthetic medical images is a promising alternative to sharing real\nimages that can improve patient privacy and data security. To get good results,\nexisting methods for medical image synthesis must be manually adjusted when\nthey are applied to unseen data. To remove this manual burden, we introduce a\nHyperparameter-Free distributed learning method for automatic medical image\nSynthesis, Sharing, and Segmentation called HyFree-S3. 
For three diverse\nsegmentation settings (pelvic MRIs, lung X-rays, polyp photos), the use of\nHyFree-S3 results in improved performance over training only with site-specific\ndata (in the majority of cases). The hyperparameter-free nature of the method\nshould make data synthesis and sharing easier, potentially leading to an\nincrease in the quantity of available data and consequently the quality of the\nmodels trained that may ultimately be applied in the clinic. Our code is\navailable at https://github.com/AwesomeLemon/HyFree-S3\n","authors":["Alexander Chebykin","Peter A. N. Bosman","Tanja Alderliesten"],"pdf_url":"https://arxiv.org/pdf/2404.06240v1.pdf","comment":"Accepted at MIDL 2024"},{"id":"http://arxiv.org/abs/2311.18649v3","updated":"2024-04-09T11:55:20Z","published":"2023-11-30T15:57:34Z","title":"Simple Semantic-Aided Few-Shot Learning","summary":" Learning from a limited amount of data, namely Few-Shot Learning, stands out\nas a challenging computer vision task. Several works exploit semantics and\ndesign complicated semantic fusion mechanisms to compensate for rare\nrepresentative features within restricted data. However, relying on naive\nsemantics such as class names introduces biases due to their brevity, while\nacquiring extensive semantics from external knowledge takes a huge time and\neffort. This limitation severely constrains the potential of semantics in\nFew-Shot Learning. In this paper, we design an automatic way called Semantic\nEvolution to generate high-quality semantics. The incorporation of high-quality\nsemantics alleviates the need for complex network structures and learning\nalgorithms used in previous works. Hence, we employ a simple two-layer network\ntermed Semantic Alignment Network to transform semantics and visual features\ninto robust class prototypes with rich discriminative features for few-shot\nclassification. The experimental results show our framework outperforms all\nprevious methods on six benchmarks, demonstrating a simple network with\nhigh-quality semantics can beat intricate multi-modal modules on few-shot\nclassification tasks. Code is available at\nhttps://github.com/zhangdoudou123/SemFew.\n","authors":["Hai Zhang","Junzhe Xu","Shanlin Jiang","Zhenan He"],"pdf_url":"https://arxiv.org/pdf/2311.18649v3.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2307.10974v3","updated":"2024-04-09T11:23:10Z","published":"2023-07-20T16:00:19Z","title":"Deep Multi-Threshold Spiking-UNet for Image Processing","summary":" U-Net, known for its simple yet efficient architecture, is widely utilized\nfor image processing tasks and is particularly suitable for deployment on\nneuromorphic chips. This paper introduces the novel concept of Spiking-UNet for\nimage processing, which combines the power of Spiking Neural Networks (SNNs)\nwith the U-Net architecture. To achieve an efficient Spiking-UNet, we face two\nprimary challenges: ensuring high-fidelity information propagation through the\nnetwork via spikes and formulating an effective training strategy. To address\nthe issue of information loss, we introduce multi-threshold spiking neurons,\nwhich improve the efficiency of information transmission within the\nSpiking-UNet. For the training strategy, we adopt a conversion and fine-tuning\npipeline that leverage pre-trained U-Net models. During the conversion process,\nsignificant variability in data distribution across different parts is observed\nwhen utilizing skip connections. 
Therefore, we propose a connection-wise\nnormalization method to prevent inaccurate firing rates. Furthermore, we adopt\na flow-based training method to fine-tune the converted models, reducing time\nsteps while preserving performance. Experimental results show that, on image\nsegmentation and denoising, our Spiking-UNet achieves comparable performance to\nits non-spiking counterpart, surpassing existing SNN methods. Compared with the\nconverted Spiking-UNet without fine-tuning, our Spiking-UNet reduces inference\ntime by approximately 90\\%. This research broadens the application scope of\nSNNs in image processing and is expected to inspire further exploration in the\nfield of neuromorphic engineering. The code for our Spiking-UNet implementation\nis available at https://github.com/SNNresearch/Spiking-UNet.\n","authors":["Hebei Li","Yueyi Zhang","Zhiwei Xiong","Zheng-jun Zha","Xiaoyan Sun"],"pdf_url":"https://arxiv.org/pdf/2307.10974v3.pdf","comment":"Accepted in NeuroComputing"},{"id":"http://arxiv.org/abs/2404.06219v1","updated":"2024-04-09T11:13:36Z","published":"2024-04-09T11:13:36Z","title":"Automatic Defect Detection in Sewer Network Using Deep Learning Based\n Object Detector","summary":" Maintaining sewer systems in large cities is important, but also time and\neffort consuming, because visual inspections are currently done manually. To\nreduce the amount of aforementioned manual work, defects within sewer pipes\nshould be located and classified automatically. In the past, multiple works\nhave attempted solving this problem using classical image processing, machine\nlearning, or a combination of those. However, each provided solution only focus\non detecting a limited set of defect/structure types, such as fissure, root,\nand/or connection. Furthermore, due to the use of hand-crafted features and\nsmall training datasets, generalization is also problematic. In order to\novercome these deficits, a sizable dataset with 14.7 km of various sewer pipes\nwere annotated by sewer maintenance experts in the scope of this work. On top\nof that, an object detector (EfficientDet-D0) was trained for automatic defect\ndetection. From the result of several expermients, peculiar natures of defects\nin the context of object detection, which greatly effect annotation and\ntraining process, are found and discussed. At the end, the final detector was\nable to detect 83% of defects in the test set; out of the missing 17%, only\n0.77% are very severe defects. This work provides an example of applying deep\nlearning-based object detection into an important but quiet engineering field.\nIt also gives some practical pointers on how to annotate peculiar \"object\",\nsuch as defects.\n","authors":["Bach Ha","Birgit Schalter","Laura White","Joachim Koehler"],"pdf_url":"https://arxiv.org/pdf/2404.06219v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06212v1","updated":"2024-04-09T11:00:19Z","published":"2024-04-09T11:00:19Z","title":"OmniFusion Technical Report","summary":" Last year, multimodal architectures served up a revolution in AI-based\napproaches and solutions, extending the capabilities of large language models\n(LLM). We propose an \\textit{OmniFusion} model based on a pretrained LLM and\nadapters for visual modality. 
We evaluated and compared several architecture\ndesign principles for better text and visual data coupling: MLP and transformer\nadapters, various CLIP ViT-based encoders (SigLIP, InternVIT, etc.), and their\nfusing approach, image encoding method (whole image or tiles encoding) and two\n7B LLMs (the proprietary one and open-source Mistral). Experiments on 8\nvisual-language benchmarks show the top score for the best OmniFusion setup in\nterms of different VQA tasks in comparison with open-source LLaVA-like\nsolutions: VizWiz, Pope, MM-Vet, ScienceQA, MMBench, TextVQA, VQAv2, MMMU. We\nalso propose a variety of situations, where OmniFusion provides highly-detailed\nanswers in different domains: housekeeping, sightseeing, culture, medicine,\nhandwritten and scanned equations recognition, etc. Mistral-based OmniFusion\nmodel is an open-source solution with weights, training and inference scripts\navailable at https://github.com/AIRI-Institute/OmniFusion.\n","authors":["Elizaveta Goncharova","Anton Razzhigaev","Matvey Mikhalchuk","Maxim Kurkin","Irina Abdullaeva","Matvey Skripkin","Ivan Oseledets","Denis Dimitrov","Andrey Kuznetsov"],"pdf_url":"https://arxiv.org/pdf/2404.06212v1.pdf","comment":"17 pages, 4 figures, 9 tables, 2 appendices"},{"id":"http://arxiv.org/abs/2404.06211v1","updated":"2024-04-09T11:00:11Z","published":"2024-04-09T11:00:11Z","title":"Unified Physical-Digital Attack Detection Challenge","summary":" Face Anti-Spoofing (FAS) is crucial to safeguard Face Recognition (FR)\nSystems. In real-world scenarios, FRs are confronted with both physical and\ndigital attacks. However, existing algorithms often address only one type of\nattack at a time, which poses significant limitations in real-world scenarios\nwhere FR systems face hybrid physical-digital threats. To facilitate the\nresearch of Unified Attack Detection (UAD) algorithms, a large-scale\nUniAttackData dataset has been collected. UniAttackData is the largest public\ndataset for Unified Attack Detection, with a total of 28,706 videos, where each\nunique identity encompasses all advanced attack types. Based on this dataset,\nwe organized a Unified Physical-Digital Face Attack Detection Challenge to\nboost the research in Unified Attack Detections. It attracted 136 teams for the\ndevelopment phase, with 13 qualifying for the final round. The results\nre-verified by the organizing team were used for the final ranking. This paper\ncomprehensively reviews the challenge, detailing the dataset introduction,\nprotocol definition, evaluation criteria, and a summary of published results.\nFinally, we focus on the detailed analysis of the highest-performing algorithms\nand offer potential directions for unified physical-digital attack detection\ninspired by this competition. Challenge Website:\nhttps://sites.google.com/view/face-anti-spoofing-challenge/welcome/challengecvpr2024.\n","authors":["Haocheng Yuan","Ajian Liu","Junze Zheng","Jun Wan","Jiankang Deng","Sergio Escalera","Hugo Jair Escalante","Isabelle Guyon","Zhen Lei"],"pdf_url":"https://arxiv.org/pdf/2404.06211v1.pdf","comment":"11 pages, 10 figures"},{"id":"http://arxiv.org/abs/2404.06207v1","updated":"2024-04-09T10:56:46Z","published":"2024-04-09T10:56:46Z","title":"Leveraging edge detection and neural networks for better UAV\n localization","summary":" We propose a novel method for geolocalizing Unmanned Aerial Vehicles (UAVs)\nin environments lacking Global Navigation Satellite Systems (GNSS). 
Current\nstate-of-the-art techniques employ an offline-trained encoder to generate a\nvector representation (embedding) of the UAV's current view, which is then\ncompared with pre-computed embeddings of geo-referenced images to determine the\nUAV's position. Here, we demonstrate that the performance of these methods can\nbe significantly enhanced by preprocessing the images to extract their edges,\nwhich exhibit robustness to seasonal and illumination variations. Furthermore,\nwe establish that utilizing edges enhances resilience to orientation and\naltitude inaccuracies. Additionally, we introduce a confidence criterion for\nlocalization. Our findings are substantiated through synthetic experiments.\n","authors":["Theo Di Piazza","Enric Meinhardt-Llopis","Gabriele Facciolo","Benedicte Bascle","Corentin Abgrall","Jean-Clement Devaux"],"pdf_url":"https://arxiv.org/pdf/2404.06207v1.pdf","comment":"Accepted for publication in IGARSS2024. 4 pages, 3 figures, 3 tables"},{"id":"http://arxiv.org/abs/2404.06202v1","updated":"2024-04-09T10:47:43Z","published":"2024-04-09T10:47:43Z","title":"Automated National Urban Map Extraction","summary":" Developing countries usually lack the proper governance means to generate and\nregularly update a national rooftop map. Using traditional photogrammetry and\nsurveying methods to produce a building map at the federal level is costly and\ntime consuming. Using earth observation and deep learning methods, we can\nbridge this gap and propose an automated pipeline to fetch such national urban\nmaps. This paper aims to exploit the power of fully convolutional neural\nnetworks for multi-class buildings' instance segmentation to leverage high\nobject-wise accuracy results. Buildings' instance segmentation from sub-meter\nhigh-resolution satellite images can be achieved with relatively high\npixel-wise metric scores. We detail all engineering steps to replicate this\nwork and ensure highly accurate results in dense and slum areas witnessed in\nregions that lack proper urban planning in the Global South. We applied a case\nstudy of the proposed pipeline to Lebanon and successfully produced the first\ncomprehensive national building footprint map with approximately 1 Million\nunits with an 84% accuracy. The proposed architecture relies on advanced\naugmentation techniques to overcome dataset scarcity, which is often the case\nin developing countries.\n","authors":["Hasan Nasrallah","Abed Ellatif Samhat","Cristiano Nattero","Ali J. Ghandour"],"pdf_url":"https://arxiv.org/pdf/2404.06202v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06194v1","updated":"2024-04-09T10:27:22Z","published":"2024-04-09T10:27:22Z","title":"Exploring the Potential of Large Foundation Models for Open-Vocabulary\n HOI Detection","summary":" Open-vocabulary human-object interaction (HOI) detection, which is concerned\nwith the problem of detecting novel HOIs guided by natural language, is crucial\nfor understanding human-centric scenes. However, prior zero-shot HOI detectors\noften employ the same levels of feature maps to model HOIs with varying\ndistances, leading to suboptimal performance in scenes containing human-object\npairs with a wide range of distances. In addition, these detectors primarily\nrely on category names and overlook the rich contextual information that\nlanguage can provide, which is essential for capturing open vocabulary concepts\nthat are typically rare and not well-represented by category names alone. 
In\nthis paper, we introduce a novel end-to-end open vocabulary HOI detection\nframework with conditional multi-level decoding and fine-grained semantic\nenhancement (CMD-SE), harnessing the potential of Visual-Language Models\n(VLMs). Specifically, we propose to model human-object pairs with different\ndistances with different levels of feature maps by incorporating a soft\nconstraint during the bipartite matching process. Furthermore, by leveraging\nlarge language models (LLMs) such as GPT models, we exploit their extensive\nworld knowledge to generate descriptions of human body part states for various\ninteractions. Then we integrate the generalizable and fine-grained semantics of\nhuman body parts to improve interaction recognition. Experimental results on\ntwo datasets, SWIG-HOI and HICO-DET, demonstrate that our proposed method\nachieves state-of-the-art results in open vocabulary HOI detection. The code\nand models are available at https://github.com/ltttpku/CMD-SE-release.\n","authors":["Ting Lei","Shaofeng Yin","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06194v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06181v1","updated":"2024-04-09T10:04:06Z","published":"2024-04-09T10:04:06Z","title":"EPL: Evidential Prototype Learning for Semi-supervised Medical Image\n Segmentation","summary":" Although current semi-supervised medical segmentation methods can achieve\ndecent performance, they are still affected by the uncertainty in unlabeled\ndata and model predictions, and there is currently a lack of effective\nstrategies that can explore the uncertain aspects of both simultaneously. To\naddress the aforementioned issues, we propose Evidential Prototype Learning\n(EPL), which utilizes an extended probabilistic framework to effectively fuse\nvoxel probability predictions from different sources and achieves prototype\nfusion utilization of labeled and unlabeled data under a generalized evidential\nframework, leveraging voxel-level dual uncertainty masking. The uncertainty not\nonly enables the model to self-correct predictions but also improves the guided\nlearning process with pseudo-labels and is able to feed back into the\nconstruction of hidden features. The method proposed in this paper has been\nexperimented on LA, Pancreas-CT and TBAD datasets, achieving the\nstate-of-the-art performance in three different labeled ratios, which strongly\ndemonstrates the effectiveness of our strategy.\n","authors":["Yuanpeng He"],"pdf_url":"https://arxiv.org/pdf/2404.06181v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06180v1","updated":"2024-04-09T10:03:44Z","published":"2024-04-09T10:03:44Z","title":"YOLC: You Only Look Clusters for Tiny Object Detection in Aerial Images","summary":" Detecting objects from aerial images poses significant challenges due to the\nfollowing factors: 1) Aerial images typically have very large sizes, generally\nwith millions or even hundreds of millions of pixels, while computational\nresources are limited. 2) Small object size leads to insufficient information\nfor effective detection. 3) Non-uniform object distribution leads to\ncomputational resource wastage. To address these issues, we propose YOLC (You\nOnly Look Clusters), an efficient and effective framework that builds on an\nanchor-free object detector, CenterNet. To overcome the challenges posed by\nlarge-scale images and non-uniform object distribution, we introduce a Local\nScale Module (LSM) that adaptively searches cluster regions for zooming in for\naccurate detection. 
Additionally, we modify the regression loss using Gaussian\nWasserstein distance (GWD) to obtain high-quality bounding boxes. Deformable\nconvolution and refinement methods are employed in the detection head to\nenhance the detection of small objects. We perform extensive experiments on two\naerial image datasets, including Visdrone2019 and UAVDT, to demonstrate the\neffectiveness and superiority of our proposed approach.\n","authors":["Chenguang Liu","Guangshuai Gao","Ziyue Huang","Zhenghui Hu","Qingjie Liu","Yunhong Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06180v1.pdf","comment":"accepted to TITS"},{"id":"http://arxiv.org/abs/2404.06177v1","updated":"2024-04-09T09:58:10Z","published":"2024-04-09T09:58:10Z","title":"Uncertainty-aware Evidential Fusion-based Learning for Semi-supervised\n Medical Image Segmentation","summary":" Although the existing uncertainty-based semi-supervised medical segmentation\nmethods have achieved excellent performance, they usually only consider a\nsingle uncertainty evaluation, which often fails to solve the problem related\nto credibility completely. Therefore, based on the framework of evidential deep\nlearning, this paper integrates the evidential predictive results in the\ncross-region of mixed and original samples to reallocate the confidence degree\nand uncertainty measure of each voxel, which is realized by emphasizing\nuncertain information of probability assignments fusion rule of traditional\nevidence theory. Furthermore, we design a voxel-level asymptotic learning\nstrategy by introducing information entropy to combine with the fused\nuncertainty measure to estimate voxel prediction more precisely. The model will\ngradually pay attention to the prediction results with high uncertainty in the\nlearning process, to learn the features that are difficult to master. The\nexperimental results on LA, Pancreas-CT, ACDC and TBAD datasets demonstrate the\nsuperior performance of our proposed method in comparison with the existing\nstate of the arts.\n","authors":["Yuanpeng He","Lijian Li"],"pdf_url":"https://arxiv.org/pdf/2404.06177v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06173v1","updated":"2024-04-09T09:54:21Z","published":"2024-04-09T09:54:21Z","title":"Improving Interpretable Embeddings for Ad-hoc Video Search with\n Generative Captions and Multi-word Concept Bank","summary":" Aligning a user query and video clips in cross-modal latent space and that\nwith semantic concepts are two mainstream approaches for ad-hoc video search\n(AVS). However, the effectiveness of existing approaches is bottlenecked by the\nsmall sizes of available video-text datasets and the low quality of concept\nbanks, which results in the failures of unseen queries and the\nout-of-vocabulary problem. This paper addresses these two problems by\nconstructing a new dataset and developing a multi-word concept bank.\nSpecifically, capitalizing on a generative model, we construct a new dataset\nconsisting of 7 million generated text and video pairs for pre-training. To\ntackle the out-of-vocabulary problem, we develop a multi-word concept bank\nbased on syntax analysis to enhance the capability of a state-of-the-art\ninterpretable AVS method in modeling relationships between query words. We also\nstudy the impact of current advanced features on the method. 
Experimental\nresults show that the integration of the above-proposed elements doubles the\nR@1 performance of the AVS method on the MSRVTT dataset and improves the xinfAP\non the TRECVid AVS query sets for 2016-2023 (eight years) by a margin from 2%\nto 77%, with an average about 20%.\n","authors":["Jiaxin Wu","Chong-Wah Ngo","Wing-Kwong Chan"],"pdf_url":"https://arxiv.org/pdf/2404.06173v1.pdf","comment":"Accepted in ICMR2024"},{"id":"http://arxiv.org/abs/2403.10376v2","updated":"2024-04-09T09:52:54Z","published":"2024-03-15T15:05:29Z","title":"PASTA: Towards Flexible and Efficient HDR Imaging Via Progressively\n Aggregated Spatio-Temporal Alignment","summary":" Leveraging Transformer attention has led to great advancements in HDR\ndeghosting. However, the intricate nature of self-attention introduces\npractical challenges, as existing state-of-the-art methods often demand\nhigh-end GPUs or exhibit slow inference speeds, especially for high-resolution\nimages like 2K. Striking an optimal balance between performance and latency\nremains a critical concern. In response, this work presents PASTA, a novel\nProgressively Aggregated Spatio-Temporal Alignment framework for HDR\ndeghosting. Our approach achieves effectiveness and efficiency by harnessing\nhierarchical representation during feature distanglement. Through the\nutilization of diverse granularities within the hierarchical structure, our\nmethod substantially boosts computational speed and optimizes the HDR imaging\nworkflow. In addition, we explore within-scale feature modeling with local and\nglobal attention, gradually merging and refining them in a coarse-to-fine\nfashion. Experimental results showcase PASTA's superiority over current SOTA\nmethods in both visual quality and performance metrics, accompanied by a\nsubstantial 3-fold (x3) increase in inference speed.\n","authors":["Xiaoning Liu","Ao Li","Zongwei Wu","Yapeng Du","Le Zhang","Yulun Zhang","Radu Timofte","Ce Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.10376v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05393v2","updated":"2024-04-09T09:52:32Z","published":"2024-04-08T10:52:29Z","title":"PAT: Pixel-wise Adaptive Training for Long-tailed Segmentation","summary":" Beyond class frequency, we recognize the impact of class-wise relationships\namong various class-specific predictions and the imbalance in label masks on\nlong-tailed segmentation learning. To address these challenges, we propose an\ninnovative Pixel-wise Adaptive Training (PAT) technique tailored for\nlong-tailed segmentation. PAT has two key features: 1) class-wise gradient\nmagnitude homogenization, and 2) pixel-wise class-specific loss adaptation\n(PCLA). First, the class-wise gradient magnitude homogenization helps alleviate\nthe imbalance among label masks by ensuring equal consideration of the\nclass-wise impact on model updates. Second, PCLA tackles the detrimental impact\nof both rare classes within the long-tailed distribution and inaccurate\npredictions from previous training stages by encouraging learning classes with\nlow prediction confidence and guarding against forgetting classes with high\nconfidence. This combined approach fosters robust learning while preventing the\nmodel from forgetting previously learned knowledge. PAT exhibits significant\nperformance improvements, surpassing the current state-of-the-art by 2.2% in\nthe NyU dataset. 
Moreover, it enhances overall pixel-wise accuracy by 2.85% and\nintersection over union value by 2.07%, with a particularly notable declination\nof 0.39% in detecting rare classes compared to Balance Logits Variation, as\ndemonstrated on the three popular datasets, i.e., OxfordPetIII, CityScape, and\nNYU.\n","authors":["Khoi Do","Duong Nguyen","Nguyen H. Tran","Viet Dung Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.05393v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06165v1","updated":"2024-04-09T09:42:18Z","published":"2024-04-09T09:42:18Z","title":"Enhanced Radar Perception via Multi-Task Learning: Towards Refined Data\n for Sensor Fusion Applications","summary":" Radar and camera fusion yields robustness in perception tasks by leveraging\nthe strength of both sensors. The typical extracted radar point cloud is 2D\nwithout height information due to insufficient antennas along the elevation\naxis, which challenges the network performance. This work introduces a\nlearning-based approach to infer the height of radar points associated with 3D\nobjects. A novel robust regression loss is introduced to address the sparse\ntarget challenge. In addition, a multi-task training strategy is employed,\nemphasizing important features. The average radar absolute height error\ndecreases from 1.69 to 0.25 meters compared to the state-of-the-art height\nextension method. The estimated target height values are used to preprocess and\nenrich radar data for downstream perception tasks. Integrating this refined\nradar information further enhances the performance of existing radar camera\nfusion models for object detection and depth estimation tasks.\n","authors":["Huawei Sun","Hao Feng","Gianfranco Mauro","Julius Ott","Georg Stettinger","Lorenzo Servadei","Robert Wille"],"pdf_url":"https://arxiv.org/pdf/2404.06165v1.pdf","comment":"Accepted by IEEE Intelligent Vehicles Symposium (IV 2024)"},{"id":"http://arxiv.org/abs/2404.06155v1","updated":"2024-04-09T09:28:05Z","published":"2024-04-09T09:28:05Z","title":"Efficient and Robust Point Cloud Registration via Heuristics-guided\n Parameter Search","summary":" Estimating the rigid transformation with 6 degrees of freedom based on a\nputative 3D correspondence set is a crucial procedure in point cloud\nregistration. Existing correspondence identification methods usually lead to\nlarge outlier ratios ($>$ 95 $\\%$ is common), underscoring the significance of\nrobust registration methods. Many researchers turn to parameter search-based\nstrategies (e.g., Branch-and-Bround) for robust registration. Although related\nmethods show high robustness, their efficiency is limited to the\nhigh-dimensional search space. This paper proposes a heuristics-guided\nparameter search strategy to accelerate the search while maintaining high\nrobustness. We first sample some correspondences (i.e., heuristics) and then\njust need to sequentially search the feasible regions that make each sample an\ninlier. Our strategy largely reduces the search space and can guarantee\naccuracy with only a few inlier samples, therefore enjoying an excellent\ntrade-off between efficiency and robustness. Since directly parameterizing the\n6-dimensional nonlinear feasible region for efficient search is intractable, we\nconstruct a three-stage decomposition pipeline to reparameterize the feasible\nregion, resulting in three lower-dimensional sub-problems that are easily\nsolvable via our strategy. 
Besides reducing the searching dimension, our\ndecomposition enables the leverage of 1-dimensional interval stabbing at all\nthree stages for searching acceleration. Moreover, we propose a valid sampling\nstrategy to guarantee our sampling effectiveness, and a compatibility\nverification setup to further accelerate our search. Extensive experiments on\nboth simulated and real-world datasets demonstrate that our approach exhibits\ncomparable robustness with state-of-the-art methods while achieving a\nsignificant efficiency boost.\n","authors":["Tianyu Huang","Haoang Li","Liangzu Peng","Yinlong Liu","Yun-Hui Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06155v1.pdf","comment":"21 pages, 16 figures. Accepted to IEEE Transactions on Pattern\n Analysis and Machine Intelligence, 2024"},{"id":"http://arxiv.org/abs/2404.06154v1","updated":"2024-04-09T09:27:54Z","published":"2024-04-09T09:27:54Z","title":"Concise Plane Arrangements for Low-Poly Surface and Volume Modelling","summary":" Plane arrangements are a useful tool for surface and volume modelling.\nHowever, their main drawback is poor scalability. We introduce two key\nnovelties that enable the construction of plane arrangements for complex\nobjects and entire scenes: an ordering scheme for the plane insertion and the\ndirect use of input points during arrangement construction. Both ingredients\nreduce the number of unwanted splits, resulting in improved scalability of the\nconstruction mechanism by up to two orders of magnitude compared to existing\nalgorithms. We further introduce a remeshing and simplification technique that\nallows us to extract low-polygon surface meshes and lightweight convex\ndecompositions of volumes from the arrangement. We show that our approach leads\nto state-of-the-art results for the aforementioned tasks by comparing it to\nlearning-based and traditional approaches on various different datasets. Our\nimplementation is available at https://github.com/raphaelsulzer/compod .\n","authors":["Raphael Sulzer","Florent Lafarge"],"pdf_url":"https://arxiv.org/pdf/2404.06154v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06152v1","updated":"2024-04-09T09:23:04Z","published":"2024-04-09T09:23:04Z","title":"HFNeRF: Learning Human Biomechanic Features with Neural Radiance Fields","summary":" In recent advancements in novel view synthesis, generalizable Neural Radiance\nFields (NeRF) based methods applied to human subjects have shown remarkable\nresults in generating novel views from few images. However, this generalization\nability cannot capture the underlying structural features of the skeleton\nshared across all instances. Building upon this, we introduce HFNeRF: a novel\ngeneralizable human feature NeRF aimed at generating human biomechanic features\nusing a pre-trained image encoder. While previous human NeRF methods have shown\npromising results in the generation of photorealistic virtual avatars, such\nmethods lack underlying human structure or biomechanic features such as\nskeleton or joint information that are crucial for downstream applications\nincluding Augmented Reality (AR)/Virtual Reality (VR). HFNeRF leverages 2D\npre-trained foundation models toward learning human features in 3D using neural\nrendering, and then volume rendering towards generating 2D feature maps. We\nevaluate HFNeRF in the skeleton estimation task by predicting heatmaps as\nfeatures. The proposed method is fully differentiable, allowing to successfully\nlearn color, geometry, and human skeleton in a simultaneous manner. 
This paper\npresents preliminary results of HFNeRF, illustrating its potential in\ngenerating realistic virtual avatars with biomechanic features using NeRF.\n","authors":["Arnab Dey","Di Yang","Antitza Dantcheva","Jean Martinet"],"pdf_url":"https://arxiv.org/pdf/2404.06152v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10634v2","updated":"2024-04-09T09:18:26Z","published":"2023-12-17T07:33:06Z","title":"Anomaly Score: Evaluating Generative Models and Individual Generated\n Images based on Complexity and Vulnerability","summary":" With the advancement of generative models, the assessment of generated images\nbecomes more and more important. Previous methods measure distances between\nfeatures of reference and generated images from trained vision models. In this\npaper, we conduct an extensive investigation into the relationship between the\nrepresentation space and input space around generated images. We first propose\ntwo measures related to the presence of unnatural elements within images:\ncomplexity, which indicates how non-linear the representation space is, and\nvulnerability, which is related to how easily the extracted feature changes by\nadversarial input changes. Based on these, we introduce a new metric to\nevaluating image-generative models called anomaly score (AS). Moreover, we\npropose AS-i (anomaly score for individual images) that can effectively\nevaluate generated images individually. Experimental results demonstrate the\nvalidity of the proposed approach.\n","authors":["Jaehui Hwang","Junghyuk Lee","Jong-Seok Lee"],"pdf_url":"https://arxiv.org/pdf/2312.10634v2.pdf","comment":"Accepted in CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00915v2","updated":"2024-04-09T09:16:29Z","published":"2024-04-01T04:43:39Z","title":"Scalable 3D Registration via Truncated Entry-wise Absolute Residuals","summary":" Given an input set of $3$D point pairs, the goal of outlier-robust $3$D\nregistration is to compute some rotation and translation that align as many\npoint pairs as possible. This is an important problem in computer vision, for\nwhich many highly accurate approaches have been recently proposed. Despite\ntheir impressive performance, these approaches lack scalability, often\noverflowing the $16$GB of memory of a standard laptop to handle roughly\n$30,000$ point pairs. In this paper, we propose a $3$D registration approach\nthat can process more than ten million ($10^7$) point pairs with over $99\\%$\nrandom outliers. Moreover, our method is efficient, entails low memory costs,\nand maintains high accuracy at the same time. We call our method TEAR, as it\ninvolves minimizing an outlier-robust loss that computes Truncated Entry-wise\nAbsolute Residuals. To minimize this loss, we decompose the original\n$6$-dimensional problem into two subproblems of dimensions $3$ and $2$,\nrespectively, solved in succession to global optimality via a customized\nbranch-and-bound method. While branch-and-bound is often slow and unscalable,\nthis does not apply to TEAR as we propose novel bounding functions that are\ntight and computationally efficient. Experiments on various datasets are\nconducted to validate the scalability and efficiency of our method.\n","authors":["Tianyu Huang","Liangzu Peng","René Vidal","Yun-Hui Liu"],"pdf_url":"https://arxiv.org/pdf/2404.00915v2.pdf","comment":"24 pages, 12 figures. 
Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.08801v2","updated":"2024-04-09T09:13:01Z","published":"2024-02-05T12:33:37Z","title":"CoBra: Complementary Branch Fusing Class and Semantic Knowledge for\n Robust Weakly Supervised Semantic Segmentation","summary":" Leveraging semantically precise pseudo masks derived from image-level class\nknowledge for segmentation, namely image-level Weakly Supervised Semantic\nSegmentation (WSSS), still remains challenging. While Class Activation Maps\n(CAMs) using CNNs have steadily been contributing to the success of WSSS, the\nresulting activation maps often narrowly focus on class-specific parts (e.g.,\nonly face of human). On the other hand, recent works based on vision\ntransformers (ViT) have shown promising results based on their self-attention\nmechanism to capture the semantic parts but fail in capturing complete\nclass-specific details (e.g., entire body parts of human but also with a dog\nnearby). In this work, we propose Complementary Branch (CoBra), a novel dual\nbranch framework consisting of two distinct architectures which provide\nvaluable complementary knowledge of class (from CNN) and semantic (from ViT) to\neach branch. In particular, we learn Class-Aware Projection (CAP) for the CNN\nbranch and Semantic-Aware Projection (SAP) for the ViT branch to explicitly\nfuse their complementary knowledge and facilitate a new type of extra\npatch-level supervision. Our model, through CoBra, fuses CNN and ViT's\ncomplementary outputs to create robust pseudo masks that integrate both class\nand semantic information effectively. Extensive experiments qualitatively and\nquantitatively investigate how CNN and ViT complement each other on the PASCAL\nVOC 2012 dataset, showing a state-of-the-art WSSS result. This includes not\nonly the masks generated by our model, but also the segmentation results\nderived from utilizing these masks as pseudo labels.\n","authors":["Woojung Han","Seil Kang","Kyobin Choo","Seong Jae Hwang"],"pdf_url":"https://arxiv.org/pdf/2403.08801v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02813v2","updated":"2024-04-09T09:12:58Z","published":"2023-12-05T14:56:55Z","title":"BIVDiff: A Training-Free Framework for General-Purpose Video Synthesis\n via Bridging Image and Video Diffusion Models","summary":" Diffusion models have made tremendous progress in text-driven image and video\ngeneration. Now text-to-image foundation models are widely applied to various\ndownstream image synthesis tasks, such as controllable image generation and\nimage editing, while downstream video synthesis tasks are less explored for\nseveral reasons. First, it requires huge memory and computation overhead to\ntrain a video generation foundation model. Even with video foundation models,\nadditional costly training is still required for downstream video synthesis\ntasks. Second, although some works extend image diffusion models into videos in\na training-free manner, temporal consistency cannot be well preserved. Finally,\nthese adaption methods are specifically designed for one task and fail to\ngeneralize to different tasks. To mitigate these issues, we propose a\ntraining-free general-purpose video synthesis framework, coined as {\\bf\nBIVDiff}, via bridging specific image diffusion models and general\ntext-to-video foundation diffusion models. 
Specifically, we first use a\nspecific image diffusion model (e.g., ControlNet and Instruct Pix2Pix) for\nframe-wise video generation, then perform Mixed Inversion on the generated\nvideo, and finally input the inverted latents into the video diffusion models\n(e.g., VidRD and ZeroScope) for temporal smoothing. This decoupled framework\nenables flexible image model selection for different purposes with strong task\ngeneralization and high efficiency. To validate the effectiveness and general\nuse of BIVDiff, we perform a wide range of video synthesis tasks, including\ncontrollable video generation, video editing, video inpainting, and\noutpainting.\n","authors":["Fengyuan Shi","Jiaxi Gu","Hang Xu","Songcen Xu","Wei Zhang","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2312.02813v2.pdf","comment":"Accepted by CVPR 2024. Project page: https://bivdiff.github.io;\n GitHub repository: https://github.com/MCG-NJU/BIVDiff"},{"id":"http://arxiv.org/abs/2404.06139v1","updated":"2024-04-09T09:05:23Z","published":"2024-04-09T09:05:23Z","title":"DiffHarmony: Latent Diffusion Model Meets Image Harmonization","summary":" Image harmonization, which involves adjusting the foreground of a composite\nimage to attain a unified visual consistency with the background, can be\nconceptualized as an image-to-image translation task. Diffusion models have\nrecently promoted the rapid development of image-to-image translation tasks .\nHowever, training diffusion models from scratch is computationally intensive.\nFine-tuning pre-trained latent diffusion models entails dealing with the\nreconstruction error induced by the image compression autoencoder, making it\nunsuitable for image generation tasks that involve pixel-level evaluation\nmetrics. To deal with these issues, in this paper, we first adapt a pre-trained\nlatent diffusion model to the image harmonization task to generate the\nharmonious but potentially blurry initial images. Then we implement two\nstrategies: utilizing higher-resolution images during inference and\nincorporating an additional refinement stage, to further enhance the clarity of\nthe initially harmonized images. Extensive experiments on iHarmony4 datasets\ndemonstrate the superiority of our proposed method. The code and model will be\nmade publicly available at https://github.com/nicecv/DiffHarmony .\n","authors":["Pengfei Zhou","Fangxiang Feng","Xiaojie Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06139v1.pdf","comment":"Accepted by ICMR 2024"},{"id":"http://arxiv.org/abs/2404.06135v1","updated":"2024-04-09T09:02:21Z","published":"2024-04-09T09:02:21Z","title":"Mansformer: Efficient Transformer of Mixed Attention for Image\n Deblurring and Beyond","summary":" Transformer has made an enormous success in natural language processing and\nhigh-level vision over the past few years. However, the complexity of\nself-attention is quadratic to the image size, which makes it infeasible for\nhigh-resolution vision tasks. In this paper, we propose the Mansformer, a\nTransformer of mixed attention that combines multiple self-attentions, gate,\nand multi-layer perceptions (MLPs), to explore and employ more possibilities of\nself-attention. Taking efficiency into account, we design four kinds of\nself-attention, whose complexities are all linear. By elaborate adjustment of\nthe tensor shapes and dimensions for the dot product, we split the typical\nself-attention of quadratic complexity into four operations of linear\ncomplexity. 
To adaptively merge these different kinds of self-attention, we\ntake advantage of an architecture similar to Squeeze-and-Excitation Networks.\nFurthermore, we make it to merge the two-staged Transformer design into one\nstage by the proposed gated-dconv MLP. Image deblurring is our main target,\nwhile extensive quantitative and qualitative evaluations show that this method\nperforms favorably against the state-of-the-art methods far more than simply\ndeblurring. The source codes and trained models will be made available to the\npublic.\n","authors":["Pin-Hung Kuo","Jinshan Pan","Shao-Yi Chien","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2404.06135v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06128v1","updated":"2024-04-09T08:51:44Z","published":"2024-04-09T08:51:44Z","title":"Gaussian Pancakes: Geometrically-Regularized 3D Gaussian Splatting for\n Realistic Endoscopic Reconstruction","summary":" Within colorectal cancer diagnostics, conventional colonoscopy techniques\nface critical limitations, including a limited field of view and a lack of\ndepth information, which can impede the detection of precancerous lesions.\nCurrent methods struggle to provide comprehensive and accurate 3D\nreconstructions of the colonic surface which can help minimize the missing\nregions and reinspection for pre-cancerous polyps. Addressing this, we\nintroduce 'Gaussian Pancakes', a method that leverages 3D Gaussian Splatting\n(3D GS) combined with a Recurrent Neural Network-based Simultaneous\nLocalization and Mapping (RNNSLAM) system. By introducing geometric and depth\nregularization into the 3D GS framework, our approach ensures more accurate\nalignment of Gaussians with the colon surface, resulting in smoother 3D\nreconstructions with novel viewing of detailed textures and structures.\nEvaluations across three diverse datasets show that Gaussian Pancakes enhances\nnovel view synthesis quality, surpassing current leading methods with a 18%\nboost in PSNR and a 16% improvement in SSIM. It also delivers over 100X faster\nrendering and more than 10X shorter training times, making it a practical tool\nfor real-time applications. Hence, this holds promise for achieving clinical\ntranslation for better detection and diagnosis of colorectal cancer.\n","authors":["Sierra Bonilla","Shuai Zhang","Dimitrios Psychogyios","Danail Stoyanov","Francisco Vasconcelos","Sophia Bano"],"pdf_url":"https://arxiv.org/pdf/2404.06128v1.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.06124v1","updated":"2024-04-09T08:49:01Z","published":"2024-04-09T08:49:01Z","title":"Hierarchical Insights: Exploiting Structural Similarities for Reliable\n 3D Semantic Segmentation","summary":" Safety-critical applications like autonomous driving call for robust 3D\nenvironment perception algorithms which can withstand highly diverse and\nambiguous surroundings. The predictive performance of any classification model\nstrongly depends on the underlying dataset and the prior knowledge conveyed by\nthe annotated labels. While the labels provide a basis for the learning\nprocess, they usually fail to represent inherent relations between the classes\n- representations, which are a natural element of the human perception system.\nWe propose a training strategy which enables a 3D LiDAR semantic segmentation\nmodel to learn structural relationships between the different classes through\nabstraction. 
We achieve this by implicitly modeling those relationships through\na learning rule for hierarchical multi-label classification (HMC). With a\ndetailed analysis we show, how this training strategy not only improves the\nmodel's confidence calibration, but also preserves additional information for\ndownstream tasks like fusion, prediction and planning.\n","authors":["Mariella Dreissig","Florian Piewak","Joschka Boedecker"],"pdf_url":"https://arxiv.org/pdf/2404.06124v1.pdf","comment":"submitted to IROS 2024"},{"id":"http://arxiv.org/abs/2404.06119v1","updated":"2024-04-09T08:41:13Z","published":"2024-04-09T08:41:13Z","title":"DreamView: Injecting View-specific Text Guidance into Text-to-3D\n Generation","summary":" Text-to-3D generation, which synthesizes 3D assets according to an overall\ntext description, has significantly progressed. However, a challenge arises\nwhen the specific appearances need customizing at designated viewpoints but\nreferring solely to the overall description for generating 3D objects. For\ninstance, ambiguity easily occurs when producing a T-shirt with distinct\npatterns on its front and back using a single overall text guidance. In this\nwork, we propose DreamView, a text-to-image approach enabling multi-view\ncustomization while maintaining overall consistency by adaptively injecting the\nview-specific and overall text guidance through a collaborative text guidance\ninjection module, which can also be lifted to 3D generation via score\ndistillation sampling. DreamView is trained with large-scale rendered\nmulti-view images and their corresponding view-specific texts to learn to\nbalance the separate content manipulation in each view and the global\nconsistency of the overall object, resulting in a dual achievement of\ncustomization and consistency. Consequently, DreamView empowers artists to\ndesign 3D objects creatively, fostering the creation of more innovative and\ndiverse 3D assets. Code and model will be released at\nhttps://github.com/iSEE-Laboratory/DreamView.\n","authors":["Junkai Yan","Yipeng Gao","Qize Yang","Xihan Wei","Xuansong Xie","Ancong Wu","Wei-Shi Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.06119v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06109v1","updated":"2024-04-09T08:20:37Z","published":"2024-04-09T08:20:37Z","title":"Revising Densification in Gaussian Splatting","summary":" In this paper, we address the limitations of Adaptive Density Control (ADC)\nin 3D Gaussian Splatting (3DGS), a scene representation method achieving\nhigh-quality, photorealistic results for novel view synthesis. ADC has been\nintroduced for automatic 3D point primitive management, controlling\ndensification and pruning, however, with certain limitations in the\ndensification logic. Our main contribution is a more principled, pixel-error\ndriven formulation for density control in 3DGS, leveraging an auxiliary,\nper-pixel error function as the criterion for densification. We further\nintroduce a mechanism to control the total number of primitives generated per\nscene and correct a bias in the current opacity handling strategy of ADC during\ncloning operations. 
Our approach leads to consistent quality improvements\nacross a variety of benchmark scenes, without sacrificing the method's\nefficiency.\n","authors":["Samuel Rota Bulò","Lorenzo Porzi","Peter Kontschieder"],"pdf_url":"https://arxiv.org/pdf/2404.06109v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04617v2","updated":"2024-04-09T08:20:08Z","published":"2024-04-06T12:50:08Z","title":"Empowering Image Recovery_ A Multi-Attention Approach","summary":" We propose Diverse Restormer (DART), a novel image restoration method that\neffectively integrates information from various sources (long sequences, local\nand global regions, feature dimensions, and positional dimensions) to address\nrestoration challenges. While Transformer models have demonstrated excellent\nperformance in image restoration due to their self-attention mechanism, they\nface limitations in complex scenarios. Leveraging recent advancements in\nTransformers and various attention mechanisms, our method utilizes customized\nattention mechanisms to enhance overall performance. DART, our novel network\narchitecture, employs windowed attention to mimic the selective focusing\nmechanism of human eyes. By dynamically adjusting receptive fields, it\noptimally captures the fundamental features crucial for image resolution\nreconstruction. Efficiency and performance balance are achieved through the\nLongIR attention mechanism for long sequence image restoration. Integration of\nattention mechanisms across feature and positional dimensions further enhances\nthe recovery of fine details. Evaluation across five restoration tasks\nconsistently positions DART at the forefront. Upon acceptance, we commit to\nproviding publicly accessible code and models to ensure reproducibility and\nfacilitate further research.\n","authors":["Juan Wen","Yawei Li","Chao Zhang","Weiyan Hou","Radu Timofte","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2404.04617v2.pdf","comment":"12 pages, 10 figures, 12 tables"},{"id":"http://arxiv.org/abs/2401.13961v2","updated":"2024-04-09T08:07:48Z","published":"2024-01-25T05:50:48Z","title":"TriSAM: Tri-Plane SAM for zero-shot cortical blood vessel segmentation\n in VEM images","summary":" While imaging techniques at macro and mesoscales have garnered substantial\nattention and resources, microscale VEM imaging, capable of revealing intricate\nvascular details, has lacked the necessary benchmarking infrastructure. In this\npaper, we address a significant gap in the field of neuroimaging by introducing\nthe largest-to-date public benchmark, \\textbf{BvEM}, designed specifically for\ncortical blood vessel segmentation in volume electron microscopy (VEM) images.\nOur BvEM benchmark is based on VEM image volumes from three mammal species:\nadult mouse, macaque, and human. We standardized the resolution, addressed\nimaging variations, and meticulously annotated blood vessels through\nsemi-automatic, manual, and quality control processes, ensuring high-quality 3D\nsegmentation. Furthermore, we developed a zero-shot cortical blood vessel\nsegmentation method named TriSAM, which leverages the powerful segmentation\nmodel SAM for 3D segmentation. To extend SAM from 2D to 3D volume segmentation,\nTriSAM employs a multi-seed tracking framework, leveraging the reliability of\ncertain image planes for tracking while using others to identify potential\nturning points. This approach effectively achieves long-term 3D blood vessel\nsegmentation without model training or fine-tuning. 
Experimental results show\nthat TriSAM achieved superior performances on the BvEM benchmark across three\nspecies.\n","authors":["Jia Wan","Wanhua Li","Jason Ken Adhinarta","Atmadeep Banerjee","Evelina Sjostedt","Jingpeng Wu","Jeff Lichtman","Hanspeter Pfister","Donglai Wei"],"pdf_url":"https://arxiv.org/pdf/2401.13961v2.pdf","comment":"BvEM-Mouse can be visualized at: https://tinyurl.com/yc2s38x9"},{"id":"http://arxiv.org/abs/2403.13358v2","updated":"2024-04-09T07:55:41Z","published":"2024-03-20T07:36:43Z","title":"GeRM: A Generalist Robotic Model with Mixture-of-experts for Quadruped\n Robot","summary":" Multi-task robot learning holds significant importance in tackling diverse\nand complex scenarios. However, current approaches are hindered by performance\nissues and difficulties in collecting training datasets. In this paper, we\npropose GeRM (Generalist Robotic Model). We utilize offline reinforcement\nlearning to optimize data utilization strategies to learn from both\ndemonstrations and sub-optimal data, thus surpassing the limitations of human\ndemonstrations. Thereafter, we employ a transformer-based VLA network to\nprocess multi-modal inputs and output actions. By introducing the\nMixture-of-Experts structure, GeRM allows faster inference speed with higher\nwhole model capacity, and thus resolves the issue of limited RL parameters,\nenhancing model performance in multi-task learning while controlling\ncomputational costs. Through a series of experiments, we demonstrate that GeRM\noutperforms other methods across all tasks, while also validating its\nefficiency in both training and inference processes. Additionally, we uncover\nits potential to acquire emergent skills. Additionally, we contribute the\nQUARD-Auto dataset, collected automatically to support our training approach\nand foster advancements in multi-task quadruped robot learning. This work\npresents a new paradigm for reducing the cost of collecting robot data and\ndriving progress in the multi-task learning community. You can reach our\nproject and video through the link: https://songwxuan.github.io/GeRM/ .\n","authors":["Wenxuan Song","Han Zhao","Pengxiang Ding","Can Cui","Shangke Lyu","Yaning Fan","Donglin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.13358v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.05970v3","updated":"2024-04-09T07:49:55Z","published":"2023-03-10T15:01:51Z","title":"Exploring Recurrent Long-term Temporal Fusion for Multi-view 3D\n Perception","summary":" Long-term temporal fusion is a crucial but often overlooked technique in\ncamera-based Bird's-Eye-View (BEV) 3D perception. Existing methods are mostly\nin a parallel manner. While parallel fusion can benefit from long-term\ninformation, it suffers from increasing computational and memory overheads as\nthe fusion window size grows. Alternatively, BEVFormer adopts a recurrent\nfusion pipeline so that history information can be efficiently integrated, yet\nit fails to benefit from longer temporal frames. In this paper, we explore an\nembarrassingly simple long-term recurrent fusion strategy built upon the\nLSS-based methods and find it already able to enjoy the merits from both sides,\ni.e., rich long-term information and efficient fusion pipeline. A temporal\nembedding module is further proposed to improve the model's robustness against\noccasionally missed frames in practical scenarios. We name this simple but\neffective fusing pipeline VideoBEV. 
Experimental results on the nuScenes\nbenchmark show that VideoBEV obtains strong performance on various camera-based\n3D perception tasks, including object detection (55.4\\% mAP and 62.9\\% NDS),\nsegmentation (48.6\\% vehicle mIoU), tracking (54.8\\% AMOTA), and motion\nprediction (0.80m minADE and 0.463 EPA).\n","authors":["Chunrui Han","Jinrong Yang","Jianjian Sun","Zheng Ge","Runpei Dong","Hongyu Zhou","Weixin Mao","Yuang Peng","Xiangyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2303.05970v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06091v1","updated":"2024-04-09T07:49:30Z","published":"2024-04-09T07:49:30Z","title":"Hash3D: Training-free Acceleration for 3D Generation","summary":" The evolution of 3D generative modeling has been notably propelled by the\nadoption of 2D diffusion models. Despite this progress, the cumbersome\noptimization process per se presents a critical hurdle to efficiency. In this\npaper, we introduce Hash3D, a universal acceleration for 3D generation without\nmodel training. Central to Hash3D is the insight that feature-map redundancy is\nprevalent in images rendered from camera positions and diffusion time-steps in\nclose proximity. By effectively hashing and reusing these feature maps across\nneighboring timesteps and camera angles, Hash3D substantially prevents\nredundant calculations, thus accelerating the diffusion model's inference in 3D\ngeneration tasks. We achieve this through an adaptive grid-based hashing.\nSurprisingly, this feature-sharing mechanism not only speed up the generation\nbut also enhances the smoothness and view consistency of the synthesized 3D\nobjects. Our experiments covering 5 text-to-3D and 3 image-to-3D models,\ndemonstrate Hash3D's versatility to speed up optimization, enhancing efficiency\nby 1.3 to 4 times. Additionally, Hash3D's integration with 3D Gaussian\nsplatting largely speeds up 3D model creation, reducing text-to-3D processing\nto about 10 minutes and image-to-3D conversion to roughly 30 seconds. The\nproject page is at https://adamdad.github.io/hash3D/.\n","authors":["Xingyi Yang","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06091v1.pdf","comment":"https://adamdad.github.io/hash3D/"},{"id":"http://arxiv.org/abs/2311.17002v3","updated":"2024-04-09T07:46:43Z","published":"2023-11-28T17:57:44Z","title":"Ranni: Taming Text-to-Image Diffusion for Accurate Instruction Following","summary":" Existing text-to-image (T2I) diffusion models usually struggle in\ninterpreting complex prompts, especially those with quantity, object-attribute\nbinding, and multi-subject descriptions. In this work, we introduce a semantic\npanel as the middleware in decoding texts to images, supporting the generator\nto better follow instructions. The panel is obtained through arranging the\nvisual concepts parsed from the input text by the aid of large language models,\nand then injected into the denoising network as a detailed control signal to\ncomplement the text condition. To facilitate text-to-panel learning, we come up\nwith a carefully designed semantic formatting protocol, accompanied by a\nfully-automatic data preparation pipeline. Thanks to such a design, our\napproach, which we call Ranni, manages to enhance a pre-trained T2I generator\nregarding its textual controllability. 
More importantly, the introduction of\nthe generative middleware brings a more convenient form of interaction (i.e.,\ndirectly adjusting the elements in the panel or using language instructions)\nand further allows users to finely customize their generation, based on which\nwe develop a practical system and showcase its potential in continuous\ngeneration and chatting-based editing. Our project page is at\nhttps://ranni-t2i.github.io/Ranni.\n","authors":["Yutong Feng","Biao Gong","Di Chen","Yujun Shen","Yu Liu","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.17002v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05559v2","updated":"2024-04-09T07:43:29Z","published":"2024-04-08T14:30:42Z","title":"TIM: A Time Interval Machine for Audio-Visual Action Recognition","summary":" Diverse actions give rise to rich audio-visual signals in long videos. Recent\nworks showcase that the two modalities of audio and video exhibit different\ntemporal extents of events and distinct labels. We address the interplay\nbetween the two modalities in long videos by explicitly modelling the temporal\nextents of audio and visual events. We propose the Time Interval Machine (TIM)\nwhere a modality-specific time interval poses as a query to a transformer\nencoder that ingests a long video input. The encoder then attends to the\nspecified interval, as well as the surrounding context in both modalities, in\norder to recognise the ongoing action.\n We test TIM on three long audio-visual video datasets: EPIC-KITCHENS,\nPerception Test, and AVE, reporting state-of-the-art (SOTA) for recognition. On\nEPIC-KITCHENS, we beat previous SOTA that utilises LLMs and significantly\nlarger pre-training by 2.9% top-1 action recognition accuracy. Additionally, we\nshow that TIM can be adapted for action detection, using dense multi-scale\ninterval queries, outperforming SOTA on EPIC-KITCHENS-100 for most metrics, and\nshowing strong performance on the Perception Test. Our ablations show the\ncritical role of integrating the two modalities and modelling their time\nintervals in achieving this performance. Code and models at:\nhttps://github.com/JacobChalk/TIM\n","authors":["Jacob Chalk","Jaesung Huh","Evangelos Kazakos","Andrew Zisserman","Dima Damen"],"pdf_url":"https://arxiv.org/pdf/2404.05559v2.pdf","comment":"Accepted to CVPR 2024. Project Webpage:\n https://jacobchalk.github.io/TIM-Project"},{"id":"http://arxiv.org/abs/2404.06080v1","updated":"2024-04-09T07:39:21Z","published":"2024-04-09T07:39:21Z","title":"Using Few-Shot Learning to Classify Primary Lung Cancer and Other\n Malignancy with Lung Metastasis in Cytological Imaging via Endobronchial\n Ultrasound Procedures","summary":" This study aims to establish a computer-aided diagnosis system for\nendobronchial ultrasound (EBUS) surgery to assist physicians in the preliminary\ndiagnosis of metastatic cancer. This involves arranging immediate examinations\nfor other sites of metastatic cancer after EBUS surgery, eliminating the need\nto wait for reports, thereby shortening the waiting time by more than half and\nenabling patients to detect other cancers earlier, allowing for early planning\nand implementation of treatment plans. Unlike previous studies on cell image\nclassification, which have abundant datasets for training, this study must also\nbe able to make effective classifications despite the limited amount of case\ndata for lung metastatic cancer. 
In the realm of small data set classification\nmethods, Few-shot learning (FSL) has become mainstream in recent years. Through\nits ability to train on small datasets and its strong generalization\ncapabilities, FSL shows potential in this task of lung metastatic cell image\nclassification. This study will adopt the approach of Few-shot learning,\nreferencing existing proposed models, and designing a model architecture for\nclassifying lung metastases cell images. Batch Spectral Regularization (BSR)\nwill be incorporated as a loss update parameter, and the Finetune method of PMF\nwill be modified. In terms of test results, the addition of BSR and the\nmodified Finetune method further increases the accuracy by 8.89% to 65.60%,\noutperforming other FSL methods. This study confirms that FSL is superior to\nsupervised and transfer learning in classifying metastatic cancer and\ndemonstrates that using BSR as a loss function and modifying Finetune can\nenhance the model's capabilities.\n","authors":["Ching-Kai Lin","Di-Chun Wei","Yun-Chien Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.06080v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07937v4","updated":"2024-04-09T07:31:25Z","published":"2023-12-13T07:30:19Z","title":"BOTH2Hands: Inferring 3D Hands from Both Text Prompts and Body Dynamics","summary":" The recently emerging text-to-motion advances have spired numerous attempts\nfor convenient and interactive human motion generation. Yet, existing methods\nare largely limited to generating body motions only without considering the\nrich two-hand motions, let alone handling various conditions like body dynamics\nor texts. To break the data bottleneck, we propose BOTH57M, a novel multi-modal\ndataset for two-hand motion generation. Our dataset includes accurate motion\ntracking for the human body and hands and provides pair-wised finger-level hand\nannotations and body descriptions. We further provide a strong baseline method,\nBOTH2Hands, for the novel task: generating vivid two-hand motions from both\nimplicit body dynamics and explicit text prompts. We first warm up two parallel\nbody-to-hand and text-to-hand diffusion models and then utilize the\ncross-attention transformer for motion blending. Extensive experiments and\ncross-validations demonstrate the effectiveness of our approach and dataset for\ngenerating convincing two-hand motions from the hybrid body-and-textual\nconditions. Our dataset and code will be disseminated to the community for\nfuture research.\n","authors":["Wenqian Zhang","Molin Huang","Yuxuan Zhou","Juze Zhang","Jingyi Yu","Jingya Wang","Lan Xu"],"pdf_url":"https://arxiv.org/pdf/2312.07937v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06075v1","updated":"2024-04-09T07:25:30Z","published":"2024-04-09T07:25:30Z","title":"LIPT: Latency-aware Image Processing Transformer","summary":" Transformer is leading a trend in the field of image processing. Despite the\ngreat success that existing lightweight image processing transformers have\nachieved, they are tailored to FLOPs or parameters reduction, rather than\npractical inference acceleration. In this paper, we present a latency-aware\nimage processing transformer, termed LIPT. We devise the low-latency proportion\nLIPT block that substitutes memory-intensive operators with the combination of\nself-attention and convolutions to achieve practical speedup. 
Specifically, we\npropose a novel non-volatile sparse masking self-attention (NVSM-SA) that\nutilizes a pre-computing sparse mask to capture contextual information from a\nlarger window with no extra computation overload. Besides, a high-frequency\nreparameterization module (HRM) is proposed to make LIPT block\nreparameterization friendly, which improves the model's detail reconstruction\ncapability. Extensive experiments on multiple image processing tasks (e.g.,\nimage super-resolution (SR), JPEG artifact reduction, and image denoising)\ndemonstrate the superiority of LIPT on both latency and PSNR. LIPT achieves\nreal-time GPU inference with state-of-the-art performance on multiple image SR\nbenchmarks.\n","authors":["Junbo Qiao","Wei Li","Haizhen Xie","Hanting Chen","Yunshuai Zhou","Zhijun Tu","Jie Hu","Shaohui Lin"],"pdf_url":"https://arxiv.org/pdf/2404.06075v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03892v2","updated":"2024-04-09T07:21:32Z","published":"2024-04-05T05:00:21Z","title":"Enhancing Breast Cancer Diagnosis in Mammography: Evaluation and\n Integration of Convolutional Neural Networks and Explainable AI","summary":" The study introduces an integrated framework combining Convolutional Neural\nNetworks (CNNs) and Explainable Artificial Intelligence (XAI) for the enhanced\ndiagnosis of breast cancer using the CBIS-DDSM dataset. Utilizing a fine-tuned\nResNet50 architecture, our investigation not only provides effective\ndifferentiation of mammographic images into benign and malignant categories but\nalso addresses the opaque \"black-box\" nature of deep learning models by\nemploying XAI methodologies, namely Grad-CAM, LIME, and SHAP, to interpret CNN\ndecision-making processes for healthcare professionals. Our methodology\nencompasses an elaborate data preprocessing pipeline and advanced data\naugmentation techniques to counteract dataset limitations, and transfer\nlearning using pre-trained networks, such as VGG-16, DenseNet and ResNet was\nemployed. A focal point of our study is the evaluation of XAI's effectiveness\nin interpreting model predictions, highlighted by utilising the Hausdorff\nmeasure to assess the alignment between AI-generated explanations and expert\nannotations quantitatively. This approach plays a critical role for XAI in\npromoting trustworthiness and ethical fairness in AI-assisted diagnostics. The\nfindings from our research illustrate the effective collaboration between CNNs\nand XAI in advancing diagnostic methods for breast cancer, thereby facilitating\na more seamless integration of advanced AI technologies within clinical\nsettings. By enhancing the interpretability of AI-driven decisions, this work\nlays the groundwork for improved collaboration between AI systems and medical\npractitioners, ultimately enriching patient care. Furthermore, the implications\nof our research extend well beyond the current methodologies, advocating for\nsubsequent inquiries into the integration of multimodal data and the refinement\nof AI explanations to satisfy the needs of clinical practice.\n","authors":["Maryam Ahmed","Tooba Bibi","Rizwan Ahmed Khan","Sidra Nasir"],"pdf_url":"https://arxiv.org/pdf/2404.03892v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.18201v2","updated":"2024-04-09T07:18:41Z","published":"2024-02-28T09:46:56Z","title":"Learning Invariant Inter-pixel Correlations for Superpixel Generation","summary":" Deep superpixel algorithms have made remarkable strides by substituting\nhand-crafted features with learnable ones. 
Nevertheless, we observe that\nexisting deep superpixel methods, serving as mid-level representation\noperations, remain sensitive to the statistical properties (e.g., color\ndistribution, high-level semantics) embedded within the training dataset.\nConsequently, learnable features exhibit constrained discriminative capability,\nresulting in unsatisfactory pixel grouping performance, particularly in\nuntrainable application scenarios. To address this issue, we propose the\nContent Disentangle Superpixel (CDS) algorithm to selectively separate the\ninvariant inter-pixel correlations and statistical properties, i.e., style\nnoise. Specifically, We first construct auxiliary modalities that are\nhomologous to the original RGB image but have substantial stylistic variations.\nThen, driven by mutual information, we propose the local-grid correlation\nalignment across modalities to reduce the distribution discrepancy of\nadaptively selected features and learn invariant inter-pixel correlations.\nAfterwards, we perform global-style mutual information minimization to enforce\nthe separation of invariant content and train data styles. The experimental\nresults on four benchmark datasets demonstrate the superiority of our approach\nto existing state-of-the-art methods, regarding boundary adherence,\ngeneralization, and efficiency. Code and pre-trained model are available at\nhttps://github.com/rookiie/CDSpixel.\n","authors":["Sen Xu","Shikui Wei","Tao Ruan","Lixin Liao"],"pdf_url":"https://arxiv.org/pdf/2402.18201v2.pdf","comment":"Accepted by AAAI24"},{"id":"http://arxiv.org/abs/2404.06065v1","updated":"2024-04-09T07:08:00Z","published":"2024-04-09T07:08:00Z","title":"Unified Entropy Optimization for Open-Set Test-Time Adaptation","summary":" Test-time adaptation (TTA) aims at adapting a model pre-trained on the\nlabeled source domain to the unlabeled target domain. Existing methods usually\nfocus on improving TTA performance under covariate shifts, while neglecting\nsemantic shifts. In this paper, we delve into a realistic open-set TTA setting\nwhere the target domain may contain samples from unknown classes. Many\nstate-of-the-art closed-set TTA methods perform poorly when applied to open-set\nscenarios, which can be attributed to the inaccurate estimation of data\ndistribution and model confidence. To address these issues, we propose a simple\nbut effective framework called unified entropy optimization (UniEnt), which is\ncapable of simultaneously adapting to covariate-shifted in-distribution (csID)\ndata and detecting covariate-shifted out-of-distribution (csOOD) data.\nSpecifically, UniEnt first mines pseudo-csID and pseudo-csOOD samples from test\ndata, followed by entropy minimization on the pseudo-csID data and entropy\nmaximization on the pseudo-csOOD data. Furthermore, we introduce UniEnt+ to\nalleviate the noise caused by hard data partition leveraging sample-level\nconfidence. Extensive experiments on CIFAR benchmarks and Tiny-ImageNet-C show\nthe superiority of our framework. 
The code is available at\nhttps://github.com/gaozhengqing/UniEnt\n","authors":["Zhengqing Gao","Xu-Yao Zhang","Cheng-Lin Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06065v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.04580v2","updated":"2024-04-09T06:56:02Z","published":"2024-04-06T10:30:31Z","title":"SDFR: Synthetic Data for Face Recognition Competition","summary":" Large-scale face recognition datasets are collected by crawling the Internet\nand without individuals' consent, raising legal, ethical, and privacy concerns.\nWith the recent advances in generative models, recently several works proposed\ngenerating synthetic face recognition datasets to mitigate concerns in\nweb-crawled face recognition datasets. This paper presents the summary of the\nSynthetic Data for Face Recognition (SDFR) Competition held in conjunction with\nthe 18th IEEE International Conference on Automatic Face and Gesture\nRecognition (FG 2024) and established to investigate the use of synthetic data\nfor training face recognition models. The SDFR competition was split into two\ntasks, allowing participants to train face recognition systems using new\nsynthetic datasets and/or existing ones. In the first task, the face\nrecognition backbone was fixed and the dataset size was limited, while the\nsecond task provided almost complete freedom on the model backbone, the\ndataset, and the training pipeline. The submitted models were trained on\nexisting and also new synthetic datasets and used clever methods to improve\ntraining with synthetic data. The submissions were evaluated and ranked on a\ndiverse set of seven benchmarking datasets. The paper gives an overview of the\nsubmitted face recognition models and reports achieved performance compared to\nbaseline models trained on real and synthetic datasets. Furthermore, the\nevaluation of submissions is extended to bias assessment across different\ndemography groups. Lastly, an outlook on the current state of the research in\ntraining face recognition models using synthetic data is presented, and\nexisting problems as well as potential future directions are also discussed.\n","authors":["Hatef Otroshi Shahreza","Christophe Ecabert","Anjith George","Alexander Unnervik","Sébastien Marcel","Nicolò Di Domenico","Guido Borghi","Davide Maltoni","Fadi Boutros","Julia Vogel","Naser Damer","Ángela Sánchez-Pérez"," EnriqueMas-Candela","Jorge Calvo-Zaragoza","Bernardo Biesseck","Pedro Vidal","Roger Granada","David Menotti","Ivan DeAndres-Tame","Simone Maurizio La Cava","Sara Concas","Pietro Melzi","Ruben Tolosana","Ruben Vera-Rodriguez","Gianpaolo Perelli","Giulia Orrù","Gian Luca Marcialis","Julian Fierrez"],"pdf_url":"https://arxiv.org/pdf/2404.04580v2.pdf","comment":"The 18th IEEE International Conference on Automatic Face and Gesture\n Recognition (FG 2024)"},{"id":"http://arxiv.org/abs/2404.06057v1","updated":"2024-04-09T06:47:44Z","published":"2024-04-09T06:47:44Z","title":"Unified Multi-modal Diagnostic Framework with Reconstruction\n Pre-training and Heterogeneity-combat Tuning","summary":" Medical multi-modal pre-training has revealed promise in computer-aided\ndiagnosis by leveraging large-scale unlabeled datasets. However, existing\nmethods based on masked autoencoders mainly rely on data-level reconstruction\ntasks, but lack high-level semantic information. 
Furthermore, two significant\nheterogeneity challenges hinder the transfer of pre-trained knowledge to\ndownstream tasks, \\textit{i.e.}, the distribution heterogeneity between\npre-training data and downstream data, and the modality heterogeneity within\ndownstream data. To address these challenges, we propose a Unified Medical\nMulti-modal Diagnostic (UMD) framework with tailored pre-training and\ndownstream tuning strategies. Specifically, to enhance the representation\nabilities of vision and language encoders, we propose the Multi-level\nReconstruction Pre-training (MR-Pretrain) strategy, including a feature-level\nand data-level reconstruction, which guides models to capture the semantic\ninformation from masked inputs of different modalities. Moreover, to tackle two\nkinds of heterogeneities during the downstream tuning, we present the\nheterogeneity-combat downstream tuning strategy, which consists of a\nTask-oriented Distribution Calibration (TD-Calib) and a Gradient-guided\nModality Coordination (GM-Coord). In particular, TD-Calib fine-tunes the\npre-trained model regarding the distribution of downstream datasets, and\nGM-Coord adjusts the gradient weights according to the dynamic optimization\nstatus of different modalities. Extensive experiments on five public medical\ndatasets demonstrate the effectiveness of our UMD framework, which remarkably\noutperforms existing approaches on three kinds of downstream tasks.\n","authors":["Yupei Zhang","Li Pan","Qiushi Yang","Tan Li","Zhen Chen"],"pdf_url":"https://arxiv.org/pdf/2404.06057v1.pdf","comment":"to be published in IEEE JBHI; Code available at\n https://github.com/helenypzhang/UMD"},{"id":"http://arxiv.org/abs/2404.06050v1","updated":"2024-04-09T06:27:35Z","published":"2024-04-09T06:27:35Z","title":"Incremental Joint Learning of Depth, Pose and Implicit Scene\n Representation on Monocular Camera in Large-scale Scenes","summary":" Dense scene reconstruction for photo-realistic view synthesis has various\napplications, such as VR/AR, autonomous vehicles. However, most existing\nmethods have difficulties in large-scale scenes due to three core challenges:\n\\textit{(a) inaccurate depth input.} Accurate depth input is impossible to get\nin real-world large-scale scenes. \\textit{(b) inaccurate pose estimation.} Most\nexisting approaches rely on accurate pre-estimated camera poses. \\textit{(c)\ninsufficient scene representation capability.} A single global radiance field\nlacks the capacity to effectively scale to large-scale scenes. To this end, we\npropose an incremental joint learning framework, which can achieve accurate\ndepth, pose estimation, and large-scale scene reconstruction. A vision\ntransformer-based network is adopted as the backbone to enhance performance in\nscale information estimation. For pose estimation, a feature-metric bundle\nadjustment (FBA) method is designed for accurate and robust camera tracking in\nlarge-scale scenes. In terms of implicit scene representation, we propose an\nincremental scene representation method to construct the entire large-scale\nscene as multiple local radiance fields to enhance the scalability of 3D scene\nrepresentation. 
Extended experiments have been conducted to demonstrate the\neffectiveness and accuracy of our method in depth estimation, pose estimation,\nand large-scale scene reconstruction.\n","authors":["Tianchen Deng","Nailin Wang","Chongdi Wang","Shenghai Yuan","Jingchuan Wang","Danwei Wang","Weidong Chen"],"pdf_url":"https://arxiv.org/pdf/2404.06050v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04421v2","updated":"2024-04-09T06:23:35Z","published":"2024-04-05T21:44:57Z","title":"PhysAvatar: Learning the Physics of Dressed 3D Avatars from Visual\n Observations","summary":" Modeling and rendering photorealistic avatars is of crucial importance in\nmany applications. Existing methods that build a 3D avatar from visual\nobservations, however, struggle to reconstruct clothed humans. We introduce\nPhysAvatar, a novel framework that combines inverse rendering with inverse\nphysics to automatically estimate the shape and appearance of a human from\nmulti-view video data along with the physical parameters of the fabric of their\nclothes. For this purpose, we adopt a mesh-aligned 4D Gaussian technique for\nspatio-temporal mesh tracking as well as a physically based inverse renderer to\nestimate the intrinsic material properties. PhysAvatar integrates a physics\nsimulator to estimate the physical parameters of the garments using\ngradient-based optimization in a principled manner. These novel capabilities\nenable PhysAvatar to create high-quality novel-view renderings of avatars\ndressed in loose-fitting clothes under motions and lighting conditions not seen\nin the training data. This marks a significant advancement towards modeling\nphotorealistic digital humans using physically based inverse rendering with\nphysics in the loop. Our project website is at:\nhttps://qingqing-zhao.github.io/PhysAvatar\n","authors":["Yang Zheng","Qingqing Zhao","Guandao Yang","Wang Yifan","Donglai Xiang","Florian Dubost","Dmitry Lagun","Thabo Beeler","Federico Tombari","Leonidas Guibas","Gordon Wetzstein"],"pdf_url":"https://arxiv.org/pdf/2404.04421v2.pdf","comment":"Project Page: https://qingqing-zhao.github.io/PhysAvatar"},{"id":"http://arxiv.org/abs/2404.06044v1","updated":"2024-04-09T06:10:15Z","published":"2024-04-09T06:10:15Z","title":"Object Dynamics Modeling with Hierarchical Point Cloud-based\n Representations","summary":" Modeling object dynamics with a neural network is an important problem with\nnumerous applications. Most recent work has been based on graph neural\nnetworks. However, physics happens in 3D space, where geometric information\npotentially plays an important role in modeling physical phenomena. In this\nwork, we propose a novel U-net architecture based on continuous point\nconvolution which naturally embeds information from 3D coordinates and allows\nfor multi-scale feature representations with established downsampling and\nupsampling procedures. Bottleneck layers in the downsampled point clouds lead\nto better long-range interaction modeling. Besides, the flexibility of point\nconvolutions allows our approach to generalize to sparsely sampled points from\nmesh vertices and dynamically generate features on important interaction points\non mesh faces. 
Experimental results demonstrate that our approach significantly\nimproves the state-of-the-art, especially in scenarios that require accurate\ngravity or collision reasoning.\n","authors":["Chanho Kim","Li Fuxin"],"pdf_url":"https://arxiv.org/pdf/2404.06044v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2306.11729v2","updated":"2024-04-09T05:57:18Z","published":"2023-06-20T17:57:23Z","title":"Dense Video Object Captioning from Disjoint Supervision","summary":" We propose a new task and model for dense video object captioning --\ndetecting, tracking and captioning trajectories of objects in a video. This\ntask unifies spatial and temporal localization in video, whilst also requiring\nfine-grained visual understanding that is best described by natural language.\nWe propose a unified model, and demonstrate how our end-to-end approach is more\naccurate and temporally coherent than a multi-stage pipeline combining\nstate-of-the-art detection, tracking, and captioning models. Moreover, we\npropose a training strategy based on a mixture of disjoint tasks, which allows\nus to leverage diverse, large-scale datasets which supervise different parts of\nour model. Although each pretraining task only provides weak supervision, they\nare complementary and, when combined, result in noteworthy zero-shot ability\nand serve as strong initialization for additional finetuning to further improve\naccuracy. We carefully design new metrics capturing all components of our task,\nand show how we can repurpose existing video grounding datasets (e.g. VidSTG\nand VLN) for our new task. We show that our model improves upon a number of\nstrong baselines for this new task. Furthermore, we can apply our model to the\ntask of spatial grounding, outperforming prior state-of-the-art on VidSTG and\nVLN, without explicitly training for it. Code is available at\nhttps://github.com/google-research/scenic/tree/main/scenic/projects/densevoc.\n","authors":["Xingyi Zhou","Anurag Arnab","Chen Sun","Cordelia Schmid"],"pdf_url":"https://arxiv.org/pdf/2306.11729v2.pdf","comment":"Code is available at\n https://github.com/google-research/scenic/tree/main/scenic/projects/densevoc"},{"id":"http://arxiv.org/abs/2404.06036v1","updated":"2024-04-09T05:49:04Z","published":"2024-04-09T05:49:04Z","title":"Space-Time Video Super-resolution with Neural Operator","summary":" This paper addresses the task of space-time video super-resolution (ST-VSR).\nExisting methods generally suffer from inaccurate motion estimation and motion\ncompensation (MEMC) problems for large motions. Inspired by recent progress in\nphysics-informed neural networks, we model the challenges of MEMC in ST-VSR as\na mapping between two continuous function spaces. Specifically, our approach\ntransforms independent low-resolution representations in the coarse-grained\ncontinuous function space into refined representations with enriched\nspatiotemporal details in the fine-grained continuous function space. To\nachieve efficient and accurate MEMC, we design a Galerkin-type attention\nfunction to perform frame alignment and temporal interpolation. Due to the\nlinear complexity of the Galerkin-type attention mechanism, our model avoids\npatch partitioning and offers global receptive fields, enabling precise\nestimation of large motions. 
The experimental results show that the proposed\nmethod surpasses state-of-the-art techniques in both fixed-size and continuous\nspace-time video super-resolution tasks.\n","authors":["Yuantong Zhang","Hanyou Zheng","Daiqin Yang","Zhenzhong Chen","Haichuan Ma","Wenpeng Ding"],"pdf_url":"https://arxiv.org/pdf/2404.06036v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.10473v4","updated":"2024-04-09T05:47:57Z","published":"2023-02-21T06:31:53Z","title":"Oriented Object Detection in Optical Remote Sensing Images using Deep\n Learning: A Survey","summary":" Oriented object detection is one of the most fundamental and challenging\ntasks in remote sensing, aiming to locate and classify objects with arbitrary\norientations. Recent years have witnessed remarkable progress in oriented\nobject detection using deep learning techniques. Given the rapid development of\nthis field, this paper aims to provide a comprehensive survey of recent\nadvances in oriented object detection. To be specific, we first review the\ntechnical evolution from horizontal object detection to oriented object\ndetection and summarize the specific challenges, including feature\nmisalignment, spatial misalignment, and periodicity of angle. Subsequently, we\nfurther categorize existing methods into detection framework, oriented bounding\nbox (OBB) regression, and feature representations, and discuss how these\nmethods address the above challenges in detail. In addition, we cover several\npublicly available datasets and performance evaluation protocols. Furthermore,\nwe provide a comprehensive comparison and analysis of state-of-the-art oriented\nobject detection methods. Toward the end of this paper, we discuss several\nfuture directions for oriented object detection.\n","authors":["Kun Wang","Zi Wang","Zhang Li","Ang Su","Xichao Teng","Minhao Liu","Qifeng Yu"],"pdf_url":"https://arxiv.org/pdf/2302.10473v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06668v2","updated":"2024-04-09T05:47:39Z","published":"2024-03-11T12:36:14Z","title":"PeerAiD: Improving Adversarial Distillation from a Specialized Peer\n Tutor","summary":" Adversarial robustness of the neural network is a significant concern when it\nis applied to security-critical domains. In this situation, adversarial\ndistillation is a promising option which aims to distill the robustness of the\nteacher network to improve the robustness of a small student network. Previous\nworks pretrain the teacher network to make it robust to the adversarial\nexamples aimed at itself. However, the adversarial examples are dependent on\nthe parameters of the target network. The fixed teacher network inevitably\ndegrades its robustness against the unseen transferred adversarial examples\nwhich targets the parameters of the student network in the adversarial\ndistillation process. We propose PeerAiD to make a peer network learn the\nadversarial examples of the student network instead of adversarial examples\naimed at itself. PeerAiD is an adversarial distillation that trains the peer\nnetwork and the student network simultaneously in order to make the peer\nnetwork specialized for defending the student network. We observe that such\npeer networks surpass the robustness of pretrained robust teacher network\nagainst student-attacked adversarial samples. 
With this peer network and\nadversarial distillation, PeerAiD achieves significantly higher robustness of\nthe student network with AutoAttack (AA) accuracy up to 1.66%p and improves the\nnatural accuracy of the student network up to 4.72%p with ResNet-18 and\nTinyImageNet dataset.\n","authors":["Jaewon Jung","Hongsun Jang","Jaeyong Song","Jinho Lee"],"pdf_url":"https://arxiv.org/pdf/2403.06668v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.06033v1","updated":"2024-04-09T05:44:00Z","published":"2024-04-09T05:44:00Z","title":"Little Strokes Fell Great Oaks: Boosting the Hierarchical Features for\n Multi-exposure Image Fusion","summary":" In recent years, deep learning networks have made remarkable strides in the\ndomain of multi-exposure image fusion. Nonetheless, prevailing approaches often\ninvolve directly feeding over-exposed and under-exposed images into the\nnetwork, which leads to the under-utilization of inherent information present\nin the source images. Additionally, unsupervised techniques predominantly\nemploy rudimentary weighted summation for color channel processing, culminating\nin an overall desaturated final image tone. To partially mitigate these issues,\nthis study proposes a gamma correction module specifically designed to fully\nleverage latent information embedded within source images. Furthermore, a\nmodified transformer block, embracing with self-attention mechanisms, is\nintroduced to optimize the fusion process. Ultimately, a novel color\nenhancement algorithm is presented to augment image saturation while preserving\nintricate details. The source code is available at this https://github.com/ZhiyingDu/BHFMEF url.\n","authors":["Pan Mu","Zhiying Du","Jinyuan Liu","Cong Bai"],"pdf_url":"https://arxiv.org/pdf/2404.06033v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06029v1","updated":"2024-04-09T05:30:58Z","published":"2024-04-09T05:30:58Z","title":"Improving Facial Landmark Detection Accuracy and Efficiency with\n Knowledge Distillation","summary":" The domain of computer vision has experienced significant advancements in\nfacial-landmark detection, becoming increasingly essential across various\napplications such as augmented reality, facial recognition, and emotion\nanalysis. Unlike object detection or semantic segmentation, which focus on\nidentifying objects and outlining boundaries, faciallandmark detection aims to\nprecisely locate and track critical facial features. However, deploying deep\nlearning-based facial-landmark detection models on embedded systems with\nlimited computational resources poses challenges due to the complexity of\nfacial features, especially in dynamic settings. Additionally, ensuring\nrobustness across diverse ethnicities and expressions presents further\nobstacles. Existing datasets often lack comprehensive representation of facial\nnuances, particularly within populations like those in Taiwan. This paper\nintroduces a novel approach to address these challenges through the development\nof a knowledge distillation method. By transferring knowledge from larger\nmodels to smaller ones, we aim to create lightweight yet powerful deep learning\nmodels tailored specifically for facial-landmark detection tasks. Our goal is\nto design models capable of accurately locating facial landmarks under varying\nconditions, including diverse expressions, orientations, and lighting\nenvironments. 
The ultimate objective is to achieve high accuracy and real-time\nperformance suitable for deployment on embedded systems. This method was\nsuccessfully implemented and achieved a top 6th place finish out of 165\nparticipants in the IEEE ICME 2024 PAIR competition.\n","authors":["Zong-Wei Hong","Yu-Chen Lin"],"pdf_url":"https://arxiv.org/pdf/2404.06029v1.pdf","comment":"technical report. 6th/165 in IEEE ICME 2024 PAIR competition"},{"id":"http://arxiv.org/abs/2404.06025v1","updated":"2024-04-09T05:21:32Z","published":"2024-04-09T05:21:32Z","title":"Greedy-DiM: Greedy Algorithms for Unreasonably Effective Face Morphs","summary":" Morphing attacks are an emerging threat to state-of-the-art Face Recognition\n(FR) systems, which aim to create a single image that contains the biometric\ninformation of multiple identities. Diffusion Morphs (DiM) are a recently\nproposed morphing attack that has achieved state-of-the-art performance for\nrepresentation-based morphing attacks. However, none of the existing research\non DiMs have leveraged the iterative nature of DiMs and left the DiM model as a\nblack box, treating it no differently than one would a Generative Adversarial\nNetwork (GAN) or Varational AutoEncoder (VAE). We propose a greedy strategy on\nthe iterative sampling process of DiM models which searches for an optimal step\nguided by an identity-based heuristic function. We compare our proposed\nalgorithm against ten other state-of-the-art morphing algorithms using the\nopen-source SYN-MAD 2022 competition dataset. We find that our proposed\nalgorithm is unreasonably effective, fooling all of the tested FR systems with\nan MMPMR of 100%, outperforming all other morphing algorithms compared.\n","authors":["Zander W. Blasingame","Chen Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06025v1.pdf","comment":"Initial preprint. Under review"},{"id":"http://arxiv.org/abs/2404.06022v1","updated":"2024-04-09T05:11:28Z","published":"2024-04-09T05:11:28Z","title":"Band-Attention Modulated RetNet for Face Forgery Detection","summary":" The transformer networks are extensively utilized in face forgery detection\ndue to their scalability across large datasets.Despite their success,\ntransformers face challenges in balancing the capture of global context, which\nis crucial for unveiling forgery clues, with computational complexity.To\nmitigate this issue, we introduce Band-Attention modulated RetNet (BAR-Net), a\nlightweight network designed to efficiently process extensive visual contexts\nwhile avoiding catastrophic forgetting.Our approach empowers the target token\nto perceive global information by assigning differential attention levels to\ntokens at varying distances. 
We implement self-attention along both spatial\naxes, thereby maintaining spatial priors and easing the computational\nburden.Moreover, we present the adaptive frequency Band-Attention Modulation\nmechanism, which treats the entire Discrete Cosine Transform spectrogram as a\nseries of frequency bands with learnable weights.Together, BAR-Net achieves\nfavorable performance on several face forgery datasets, outperforming current\nstate-of-the-art methods.\n","authors":["Zhida Zhang","Jie Cao","Wenkui Yang","Qihang Fan","Kai Zhou","Ran He"],"pdf_url":"https://arxiv.org/pdf/2404.06022v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16271v4","updated":"2024-04-09T05:09:56Z","published":"2024-03-24T19:32:39Z","title":"Object Detectors in the Open Environment: Challenges, Solutions, and\n Outlook","summary":" With the emergence of foundation models, deep learning-based object detectors\nhave shown practical usability in closed set scenarios. However, for real-world\ntasks, object detectors often operate in open environments, where crucial\nfactors (e.g., data distribution, objective) that influence model learning are\noften changing. The dynamic and intricate nature of the open environment poses\nnovel and formidable challenges to object detectors. Unfortunately, current\nresearch on object detectors in open environments lacks a comprehensive\nanalysis of their distinctive characteristics, challenges, and corresponding\nsolutions, which hinders their secure deployment in critical real-world\nscenarios. This paper aims to bridge this gap by conducting a comprehensive\nreview and analysis of object detectors in open environments. We initially\nidentified limitations of key structural components within the existing\ndetection pipeline and propose the open environment object detector challenge\nframework that includes four quadrants (i.e., out-of-domain, out-of-category,\nrobust learning, and incremental learning) based on the dimensions of the data\n/ target changes. For each quadrant of challenges in the proposed framework, we\npresent a detailed description and systematic analysis of the overarching goals\nand core difficulties, systematically review the corresponding solutions, and\nbenchmark their performance over multiple widely adopted datasets. In addition,\nwe engage in a discussion of open problems and potential avenues for future\nresearch. This paper aims to provide a fresh, comprehensive, and systematic\nunderstanding of the challenges and solutions associated with open-environment\nobject detectors, thus catalyzing the development of more solid applications in\nreal-world scenarios. A project related to this survey can be found at\nhttps://github.com/LiangSiyuan21/OEOD_Survey.\n","authors":["Siyuan Liang","Wei Wang","Ruoyu Chen","Aishan Liu","Boxi Wu","Ee-Chien Chang","Xiaochun Cao","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2403.16271v4.pdf","comment":"37 pages, 17 figures"},{"id":"http://arxiv.org/abs/2312.13980v2","updated":"2024-04-09T04:41:53Z","published":"2023-12-21T16:10:33Z","title":"Carve3D: Improving Multi-view Reconstruction Consistency for Diffusion\n Models with RL Finetuning","summary":" Multi-view diffusion models, obtained by applying Supervised Finetuning (SFT)\nto text-to-image diffusion models, have driven recent breakthroughs in\ntext-to-3D research. However, due to the limited size and quality of existing\n3D datasets, they still suffer from multi-view inconsistencies and Neural\nRadiance Field (NeRF) reconstruction artifacts. 
We argue that multi-view\ndiffusion models can benefit from further Reinforcement Learning Finetuning\n(RLFT), which allows models to learn from the data generated by themselves and\nimprove beyond their dataset limitations during SFT. To this end, we introduce\nCarve3D, an improved RLFT algorithm coupled with a novel Multi-view\nReconstruction Consistency (MRC) metric, to enhance the consistency of\nmulti-view diffusion models. To measure the MRC metric on a set of multi-view\nimages, we compare them with their corresponding NeRF renderings at the same\ncamera viewpoints. The resulting model, which we denote as Carve3DM,\ndemonstrates superior multi-view consistency and NeRF reconstruction quality\nthan existing models. Our results suggest that pairing SFT with Carve3D's RLFT\nis essential for developing multi-view-consistent diffusion models, mirroring\nthe standard Large Language Model (LLM) alignment pipeline. Our code, training\nand testing data, and video results are available at:\nhttps://desaixie.github.io/carve-3d.\n","authors":["Desai Xie","Jiahao Li","Hao Tan","Xin Sun","Zhixin Shu","Yi Zhou","Sai Bi","Sören Pirk","Arie E. Kaufman"],"pdf_url":"https://arxiv.org/pdf/2312.13980v2.pdf","comment":"22 pages, 16 figures. Our code, training and testing data, and video\n results are available at: https://desaixie.github.io/carve-3d. This paper has\n been accepted to CVPR 2024. v2: incorporated changes from the CVPR 2024\n camera-ready version"},{"id":"http://arxiv.org/abs/2404.06012v1","updated":"2024-04-09T04:41:05Z","published":"2024-04-09T04:41:05Z","title":"Diffusion-Based Point Cloud Super-Resolution for mmWave Radar Data","summary":" The millimeter-wave radar sensor maintains stable performance under adverse\nenvironmental conditions, making it a promising solution for all-weather\nperception tasks, such as outdoor mobile robotics. However, the radar point\nclouds are relatively sparse and contain massive ghost points, which greatly\nlimits the development of mmWave radar technology. In this paper, we propose a\nnovel point cloud super-resolution approach for 3D mmWave radar data, named\nRadar-diffusion. Our approach employs the diffusion model defined by\nmean-reverting stochastic differential equations(SDE). Using our proposed new\nobjective function with supervision from corresponding LiDAR point clouds, our\napproach efficiently handles radar ghost points and enhances the sparse mmWave\nradar point clouds to dense LiDAR-like point clouds. We evaluate our approach\non two different datasets, and the experimental results show that our method\noutperforms the state-of-the-art baseline methods in 3D radar super-resolution\ntasks. 
Furthermore, we demonstrate that our enhanced radar point cloud is\ncapable of downstream radar point-based registration tasks.\n","authors":["Kai Luan","Chenghao Shi","Neng Wang","Yuwei Cheng","Huimin Lu","Xieyuanli Chen"],"pdf_url":"https://arxiv.org/pdf/2404.06012v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05997v1","updated":"2024-04-09T04:04:50Z","published":"2024-04-09T04:04:50Z","title":"Concept-Attention Whitening for Interpretable Skin Lesion Diagnosis","summary":" The black-box nature of deep learning models has raised concerns about their\ninterpretability for successful deployment in real-world clinical applications.\nTo address the concerns, eXplainable Artificial Intelligence (XAI) aims to\nprovide clear and understandable explanations of the decision-making process.\nIn the medical domain, concepts such as attributes of lesions or abnormalities\nserve as key evidence for deriving diagnostic results. However, existing\nconcept-based models mainly depend on concepts that appear independently and\nrequire fine-grained concept annotations such as bounding boxes. A medical\nimage usually contains multiple concepts and the fine-grained concept\nannotations are difficult to acquire. In this paper, we propose a novel\nConcept-Attention Whitening (CAW) framework for interpretable skin lesion\ndiagnosis. CAW is comprised of a disease diagnosis branch and a concept\nalignment branch. In the former branch, we train the CNN with a CAW layer\ninserted to perform skin lesion diagnosis. The CAW layer decorrelates features\nand aligns image features to conceptual meanings via an orthogonal matrix. In\nthe latter branch, we calculate the orthogonal matrix under the guidance of the\nconcept attention mask. We particularly introduce a weakly-supervised concept\nmask generator that only leverages coarse concept labels for filtering local\nregions that are relevant to certain concepts, improving the optimization of\nthe orthogonal matrix. Extensive experiments on two public skin lesion\ndiagnosis datasets demonstrated that CAW not only enhanced interpretability but\nalso maintained a state-of-the-art diagnostic performance.\n","authors":["Junlin Hou","Jilan Xu","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2404.05997v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05981v1","updated":"2024-04-09T03:27:09Z","published":"2024-04-09T03:27:09Z","title":"A Lightweight Measure of Classification Difficulty from Application\n Dataset Characteristics","summary":" Despite accuracy and computation benchmarks being widely available to help\nchoose among neural network models, these are usually trained on datasets with\nmany classes, and do not give a precise idea of performance for applications of\nfew (< 10) classes. The conventional procedure to predict performance is to\ntrain and test repeatedly on the different models and dataset variations of\ninterest. However, this is computationally expensive. We propose an efficient\nclassification difficulty measure that is calculated from the number of classes\nand intra- and inter-class similarity metrics of the dataset. After a single\nstage of training and testing per model family, relative performance for\ndifferent datasets and models of the same family can be predicted by comparing\ndifficulty measures - without further training and testing. We show how this\nmeasure can help a practitioner select a computationally efficient model for a\nsmall dataset 6 to 29x faster than through repeated training and testing. 
We\ngive an example of use of the measure for an industrial application in which\noptions are identified to select a model 42% smaller than the baseline\nYOLOv5-nano model, and if class merging from 3 to 2 classes meets requirements,\n85% smaller.\n","authors":["Bryan Bo Cao","Abhinav Sharma","Lawrence O'Gorman","Michael Coss","Shubham Jain"],"pdf_url":"https://arxiv.org/pdf/2404.05981v1.pdf","comment":"13 pages, 3 figures"},{"id":"http://arxiv.org/abs/2404.05980v1","updated":"2024-04-09T03:24:10Z","published":"2024-04-09T03:24:10Z","title":"Tackling Structural Hallucination in Image Translation with Local\n Diffusion","summary":" Recent developments in diffusion models have advanced conditioned image\ngeneration, yet they struggle with reconstructing out-of-distribution (OOD)\nimages, such as unseen tumors in medical images, causing ``image\nhallucination'' and risking misdiagnosis. We hypothesize such hallucinations\nresult from local OOD regions in the conditional images. We verify that\npartitioning the OOD region and conducting separate image generations\nalleviates hallucinations in several applications. From this, we propose a\ntraining-free diffusion framework that reduces hallucination with multiple\nLocal Diffusion processes. Our approach involves OOD estimation followed by two\nmodules: a ``branching'' module generates locally both within and outside OOD\nregions, and a ``fusion'' module integrates these predictions into one. Our\nevaluation shows our method mitigates hallucination over baseline models\nquantitatively and qualitatively, reducing misdiagnosis by 40% and 25% in the\nreal-world medical and natural image datasets, respectively. It also\ndemonstrates compatibility with various pre-trained diffusion models.\n","authors":["Seunghoi Kim","Chen Jin","Tom Diethe","Matteo Figini","Henry F. J. Tregidgo","Asher Mullokandov","Philip Teare","Daniel C. Alexander"],"pdf_url":"https://arxiv.org/pdf/2404.05980v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05979v1","updated":"2024-04-09T03:22:36Z","published":"2024-04-09T03:22:36Z","title":"StoryImager: A Unified and Efficient Framework for Coherent Story\n Visualization and Completion","summary":" Story visualization aims to generate a series of realistic and coherent\nimages based on a storyline. Current models adopt a frame-by-frame architecture\nby transforming the pre-trained text-to-image model into an auto-regressive\nmanner. Although these models have shown notable progress, there are still\nthree flaws. 1) The unidirectional generation of auto-regressive manner\nrestricts the usability in many scenarios. 2) The additional introduced story\nhistory encoders bring an extremely high computational cost. 3) The story\nvisualization and continuation models are trained and inferred independently,\nwhich is not user-friendly. To these ends, we propose a bidirectional, unified,\nand efficient framework, namely StoryImager. The StoryImager enhances the\nstoryboard generative ability inherited from the pre-trained text-to-image\nmodel for a bidirectional generation. Specifically, we introduce a Target Frame\nMasking Strategy to extend and unify different story image generation tasks.\nFurthermore, we propose a Frame-Story Cross Attention Module that decomposes\nthe cross attention for local fidelity and global coherence. Moreover, we\ndesign a Contextual Feature Extractor to extract contextual information from\nthe whole storyline. The extensive experimental results demonstrate the\nexcellent performance of our StoryImager. 
The code is available at\nhttps://github.com/tobran/StoryImager.\n","authors":["Ming Tao","Bing-Kun Bao","Hao Tang","Yaowei Wang","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2404.05979v1.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2403.14085v2","updated":"2024-04-09T02:59:41Z","published":"2024-03-21T02:31:17Z","title":"Surface Reconstruction from Point Clouds via Grid-based Intersection\n Prediction","summary":" Surface reconstruction from point clouds is a crucial task in the fields of\ncomputer vision and computer graphics. SDF-based methods excel at\nreconstructing smooth meshes with minimal error and artefacts but struggle with\nrepresenting open surfaces. On the other hand, UDF-based methods can\neffectively represent open surfaces but often introduce noise, leading to\nartefacts in the mesh. In this work, we propose a novel approach that directly\npredicts the intersection points between line segment of point pairs and\nimplicit surfaces. To achieve it, we propose two modules named Relative\nIntersection Module and Sign Module respectively with the feature of point pair\nas input. To preserve the continuity of the surface, we also integrate symmetry\ninto the two modules, which means the position of predicted intersection will\nnot change even if the input order of the point pair changes. This method not\nonly preserves the ability to represent open surfaces but also eliminates most\nartefacts on the mesh. Our approach demonstrates state-of-the-art performance\non three datasets: ShapeNet, MGN, and ScanNet. The code will be made available\nupon acceptance.\n","authors":["Hui Tian","Kai Xu"],"pdf_url":"https://arxiv.org/pdf/2403.14085v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03394v2","updated":"2024-04-09T02:56:27Z","published":"2024-04-04T11:53:37Z","title":"Background Noise Reduction of Attention Map for Weakly Supervised\n Semantic Segmentation","summary":" In weakly-supervised semantic segmentation (WSSS) using only image-level\nclass labels, a problem with CNN-based Class Activation Maps (CAM) is that they\ntend to activate the most discriminative local regions of objects. On the other\nhand, methods based on Transformers learn global features but suffer from the\nissue of background noise contamination. This paper focuses on addressing the\nissue of background noise in attention weights within the existing WSSS method\nbased on Conformer, known as TransCAM. The proposed method successfully reduces\nbackground noise, leading to improved accuracy of pseudo labels. Experimental\nresults demonstrate that our model achieves segmentation performance of 70.5%\non the PASCAL VOC 2012 validation data, 71.1% on the test data, and 45.9% on MS\nCOCO 2014 data, outperforming TransCAM in terms of segmentation performance.\n","authors":["Izumi Fujimori","Masaki Oono","Masami Shishibori"],"pdf_url":"https://arxiv.org/pdf/2404.03394v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05967v1","updated":"2024-04-09T02:55:12Z","published":"2024-04-09T02:55:12Z","title":"JSTR: Judgment Improves Scene Text Recognition","summary":" In this paper, we present a method for enhancing the accuracy of scene text\nrecognition tasks by judging whether the image and text match each other. 
While\nprevious studies focused on generating the recognition results from input\nimages, our approach also considers the model's misrecognition results to\nunderstand its error tendencies, thus improving the text recognition pipeline.\nThis method boosts text recognition accuracy by providing explicit feedback on\nthe data that the model is likely to misrecognize by predicting correct or\nincorrect between the image and text. The experimental results on publicly\navailable datasets demonstrate that our proposed method outperforms the\nbaseline and state-of-the-art methods in scene text recognition.\n","authors":["Masato Fujitake"],"pdf_url":"https://arxiv.org/pdf/2404.05967v1.pdf","comment":"IntelliSys 2024"},{"id":"http://arxiv.org/abs/2404.05960v1","updated":"2024-04-09T02:47:52Z","published":"2024-04-09T02:47:52Z","title":"EasyTrack: Efficient and Compact One-stream 3D Point Clouds Tracker","summary":" Most of 3D single object trackers (SOT) in point clouds follow the two-stream\nmulti-stage 3D Siamese or motion tracking paradigms, which process the template\nand search area point clouds with two parallel branches, built on supervised\npoint cloud backbones. In this work, beyond typical 3D Siamese or motion\ntracking, we propose a neat and compact one-stream transformer 3D SOT paradigm\nfrom the novel perspective, termed as \\textbf{EasyTrack}, which consists of\nthree special designs: 1) A 3D point clouds tracking feature pre-training\nmodule is developed to exploit the masked autoencoding for learning 3D point\nclouds tracking representations. 2) A unified 3D tracking feature learning and\nfusion network is proposed to simultaneously learns target-aware 3D features,\nand extensively captures mutual correlation through the flexible self-attention\nmechanism. 3) A target location network in the dense bird's eye view (BEV)\nfeature space is constructed for target classification and regression.\nMoreover, we develop an enhanced version named EasyTrack++, which designs the\ncenter points interaction (CPI) strategy to reduce the ambiguous targets caused\nby the noise point cloud background information. The proposed EasyTrack and\nEasyTrack++ set a new state-of-the-art performance ($\\textbf{18\\%}$,\n$\\textbf{40\\%}$ and $\\textbf{3\\%}$ success gains) in KITTI, NuScenes, and Waymo\nwhile runing at \\textbf{52.6fps} with few parameters (\\textbf{1.3M}). The code\nwill be available at https://github.com/KnightApple427/Easytrack.\n","authors":["Baojie Fan","Wuyang Zhou","Kai Wang","Shijun Zhou","Fengyu Xu","Jiandong Tian"],"pdf_url":"https://arxiv.org/pdf/2404.05960v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.12554v4","updated":"2024-04-09T02:42:28Z","published":"2023-01-29T22:05:28Z","title":"Improving the Accuracy-Robustness Trade-Off of Classifiers via Adaptive\n Smoothing","summary":" While prior research has proposed a plethora of methods that build neural\nclassifiers robust against adversarial robustness, practitioners are still\nreluctant to adopt them due to their unacceptably severe clean accuracy\npenalties. This paper significantly alleviates this accuracy-robustness\ntrade-off by mixing the output probabilities of a standard classifier and a\nrobust classifier, where the standard network is optimized for clean accuracy\nand is not robust in general. We show that the robust base classifier's\nconfidence difference for correct and incorrect examples is the key to this\nimprovement. 
In addition to providing intuitions and empirical evidence, we\ntheoretically certify the robustness of the mixed classifier under realistic\nassumptions. Furthermore, we adapt an adversarial input detector into a mixing\nnetwork that adaptively adjusts the mixture of the two base models, further\nreducing the accuracy penalty of achieving robustness. The proposed flexible\nmethod, termed \"adaptive smoothing\", can work in conjunction with existing or\neven future methods that improve clean accuracy, robustness, or adversary\ndetection. Our empirical evaluation considers strong attack methods, including\nAutoAttack and adaptive attack. On the CIFAR-100 dataset, our method achieves\nan 85.21% clean accuracy while maintaining a 38.72% $\\ell_\\infty$-AutoAttacked\n($\\epsilon = 8/255$) accuracy, becoming the second most robust method on the\nRobustBench CIFAR-100 benchmark as of submission, while improving the clean\naccuracy by ten percentage points compared with all listed models. The code\nthat implements our method is available at\nhttps://github.com/Bai-YT/AdaptiveSmoothing.\n","authors":["Yatong Bai","Brendon G. Anderson","Aerin Kim","Somayeh Sojoudi"],"pdf_url":"https://arxiv.org/pdf/2301.12554v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.06136v3","updated":"2024-04-09T02:38:16Z","published":"2024-02-09T01:48:44Z","title":"SIR: Multi-view Inverse Rendering with Decomposable Shadow for Indoor\n Scenes","summary":" We propose SIR, an efficient method to decompose differentiable shadows for\ninverse rendering on indoor scenes using multi-view data, addressing the\nchallenges in accurately decomposing the materials and lighting conditions.\nUnlike previous methods that struggle with shadow fidelity in complex lighting\nenvironments, our approach explicitly learns shadows for enhanced realism in\nmaterial estimation under unknown light positions. Utilizing posed HDR images\nas input, SIR employs an SDF-based neural radiance field for comprehensive\nscene representation. Then, SIR integrates a shadow term with a three-stage\nmaterial estimation approach to improve SVBRDF quality. Specifically, SIR is\ndesigned to learn a differentiable shadow, complemented by BRDF regularization,\nto optimize inverse rendering accuracy. Extensive experiments on both synthetic\nand real-world indoor scenes demonstrate the superior performance of SIR over\nexisting methods in both quantitative metrics and qualitative analysis. The\nsignificant decomposing ability of SIR enables sophisticated editing\ncapabilities like free-view relighting, object insertion, and material\nreplacement. The code and data are available at\nhttps://xiaokangwei.github.io/SIR/.\n","authors":["Xiaokang Wei","Zhuoman Liu","Yan Luximon"],"pdf_url":"https://arxiv.org/pdf/2402.06136v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15033v2","updated":"2024-04-09T02:29:32Z","published":"2024-03-22T08:32:30Z","title":"Toward Tiny and High-quality Facial Makeup with Data Amplify Learning","summary":" Contemporary makeup approaches primarily hinge on unpaired learning\nparadigms, yet they grapple with the challenges of inaccurate supervision\n(e.g., face misalignment) and sophisticated facial prompts (including face\nparsing, and landmark detection). These challenges prohibit low-cost deployment\nof facial makeup models, especially on mobile devices. 
To solve above problems,\nwe propose a brand-new learning paradigm, termed \"Data Amplify Learning (DAL),\"\nalongside a compact makeup model named \"TinyBeauty.\" The core idea of DAL lies\nin employing a Diffusion-based Data Amplifier (DDA) to \"amplify\" limited images\nfor the model training, thereby enabling accurate pixel-to-pixel supervision\nwith merely a handful of annotations. Two pivotal innovations in DDA facilitate\nthe above training approach: (1) A Residual Diffusion Model (RDM) is designed\nto generate high-fidelity detail and circumvent the detail vanishing problem in\nthe vanilla diffusion models; (2) A Fine-Grained Makeup Module (FGMM) is\nproposed to achieve precise makeup control and combination while retaining face\nidentity. Coupled with DAL, TinyBeauty necessitates merely 80K parameters to\nachieve a state-of-the-art performance without intricate face prompts.\nMeanwhile, TinyBeauty achieves a remarkable inference speed of up to 460 fps on\nthe iPhone 13. Extensive experiments show that DAL can produce highly\ncompetitive makeup models using only 5 image pairs.\n","authors":["Qiaoqiao Jin","Xuanhong Chen","Meiguang Jin","Ying Chen","Rui Shi","Yucheng Zheng","Yupeng Zhu","Bingbing Ni"],"pdf_url":"https://arxiv.org/pdf/2403.15033v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03662v2","updated":"2024-04-09T01:43:11Z","published":"2024-03-06T12:31:02Z","title":"Harnessing Meta-Learning for Improving Full-Frame Video Stabilization","summary":" Video stabilization is a longstanding computer vision problem, particularly\npixel-level synthesis solutions for video stabilization which synthesize full\nframes add to the complexity of this task. These techniques aim to stabilize\nvideos by synthesizing full frames while enhancing the stability of the\nconsidered video. This intensifies the complexity of the task due to the\ndistinct mix of unique motion profiles and visual content present in each video\nsequence, making robust generalization with fixed parameters difficult. In our\nstudy, we introduce a novel approach to enhance the performance of pixel-level\nsynthesis solutions for video stabilization by adapting these models to\nindividual input video sequences. The proposed adaptation exploits low-level\nvisual cues accessible during test-time to improve both the stability and\nquality of resulting videos. We highlight the efficacy of our methodology of\n\"test-time adaptation\" through simple fine-tuning of one of these models,\nfollowed by significant stability gain via the integration of meta-learning\ntechniques. Notably, significant improvement is achieved with only a single\nadaptation step. The versatility of the proposed algorithm is demonstrated by\nconsistently improving the performance of various pixel-level synthesis models\nfor video stabilization in real-world scenarios.\n","authors":["Muhammad Kashif Ali","Eun Woo Im","Dongjin Kim","Tae Hyun Kim"],"pdf_url":"https://arxiv.org/pdf/2403.03662v2.pdf","comment":"CVPR 2024, Code will be made availble on:\n http://github.com/MKashifAli/MetaVideoStab"},{"id":"http://arxiv.org/abs/2309.13475v3","updated":"2024-04-09T01:26:58Z","published":"2023-09-23T20:33:38Z","title":"Detecting and Mitigating System-Level Anomalies of Vision-Based\n Controllers","summary":" Autonomous systems, such as self-driving cars and drones, have made\nsignificant strides in recent years by leveraging visual inputs and machine\nlearning for decision-making and control. 
Despite their impressive performance,\nthese vision-based controllers can make erroneous predictions when faced with\nnovel or out-of-distribution inputs. Such errors can cascade to catastrophic\nsystem failures and compromise system safety. In this work, we introduce a\nrun-time anomaly monitor to detect and mitigate such closed-loop, system-level\nfailures. Specifically, we leverage a reachability-based framework to\nstress-test the vision-based controller offline and mine its system-level\nfailures. This data is then used to train a classifier that is leveraged online\nto flag inputs that might cause system breakdowns. The anomaly detector\nhighlights issues that transcend individual modules and pertain to the safety\nof the overall system. We also design a fallback controller that robustly\nhandles these detected anomalies to preserve system safety. We validate the\nproposed approach on an autonomous aircraft taxiing system that uses a\nvision-based controller for taxiing. Our results show the efficacy of the\nproposed approach in identifying and handling system-level anomalies,\noutperforming methods such as prediction error-based detection, and ensembling,\nthereby enhancing the overall safety and robustness of autonomous systems.\n","authors":["Aryaman Gupta","Kaustav Chakraborty","Somil Bansal"],"pdf_url":"https://arxiv.org/pdf/2309.13475v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10240v2","updated":"2024-04-09T01:16:07Z","published":"2023-12-15T22:18:38Z","title":"Rich Human Feedback for Text-to-Image Generation","summary":" Recent Text-to-Image (T2I) generation models such as Stable Diffusion and\nImagen have made significant progress in generating high-resolution images\nbased on text descriptions. However, many generated images still suffer from\nissues such as artifacts/implausibility, misalignment with text descriptions,\nand low aesthetic quality. Inspired by the success of Reinforcement Learning\nwith Human Feedback (RLHF) for large language models, prior works collected\nhuman-provided scores as feedback on generated images and trained a reward\nmodel to improve the T2I generation. In this paper, we enrich the feedback\nsignal by (i) marking image regions that are implausible or misaligned with the\ntext, and (ii) annotating which words in the text prompt are misrepresented or\nmissing on the image. We collect such rich human feedback on 18K generated\nimages (RichHF-18K) and train a multimodal transformer to predict the rich\nfeedback automatically. We show that the predicted rich human feedback can be\nleveraged to improve image generation, for example, by selecting high-quality\ntraining data to finetune and improve the generative models, or by creating\nmasks with predicted heatmaps to inpaint the problematic regions. 
Notably, the\nimprovements generalize to models (Muse) beyond those used to generate the\nimages on which human feedback data were collected (Stable Diffusion variants).\nThe RichHF-18K data set will be released in our GitHub repository:\nhttps://github.com/google-research/google-research/tree/master/richhf_18k.\n","authors":["Youwei Liang","Junfeng He","Gang Li","Peizhao Li","Arseniy Klimovskiy","Nicholas Carolan","Jiao Sun","Jordi Pont-Tuset","Sarah Young","Feng Yang","Junjie Ke","Krishnamurthy Dj Dvijotham","Katie Collins","Yiwen Luo","Yang Li","Kai J Kohlhoff","Deepak Ramachandran","Vidhya Navalpakkam"],"pdf_url":"https://arxiv.org/pdf/2312.10240v2.pdf","comment":"CVPR'24"},{"id":"http://arxiv.org/abs/2402.17228v3","updated":"2024-04-09T01:10:15Z","published":"2024-02-27T05:42:38Z","title":"Feature Re-Embedding: Towards Foundation Model-Level Performance in\n Computational Pathology","summary":" Multiple instance learning (MIL) is the most widely used framework in\ncomputational pathology, encompassing sub-typing, diagnosis, prognosis, and\nmore. However, the existing MIL paradigm typically requires an offline instance\nfeature extractor, such as a pre-trained ResNet or a foundation model. This\napproach lacks the capability for feature fine-tuning within the specific\ndownstream tasks, limiting its adaptability and performance. To address this\nissue, we propose a Re-embedded Regional Transformer (R$^2$T) for re-embedding\nthe instance features online, which captures fine-grained local features and\nestablishes connections across different regions. Unlike existing works that\nfocus on pre-training powerful feature extractor or designing sophisticated\ninstance aggregator, R$^2$T is tailored to re-embed instance features online.\nIt serves as a portable module that can seamlessly integrate into mainstream\nMIL models. Extensive experimental results on common computational pathology\ntasks validate that: 1) feature re-embedding improves the performance of MIL\nmodels based on ResNet-50 features to the level of foundation model features,\nand further enhances the performance of foundation model features; 2) the\nR$^2$T can introduce more significant performance improvements to various MIL\nmodels; 3) R$^2$T-MIL, as an R$^2$T-enhanced AB-MIL, outperforms other latest\nmethods by a large margin.The code is available at:\nhttps://github.com/DearCaat/RRT-MIL.\n","authors":["Wenhao Tang","Fengtao Zhou","Sheng Huang","Xiang Zhu","Yi Zhang","Bo Liu"],"pdf_url":"https://arxiv.org/pdf/2402.17228v3.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2308.13072v2","updated":"2024-04-09T01:09:41Z","published":"2023-08-24T20:29:09Z","title":"Full-dose Whole-body PET Synthesis from Low-dose PET Using\n High-efficiency Denoising Diffusion Probabilistic Model: PET Consistency\n Model","summary":" Objective: Positron Emission Tomography (PET) has been a commonly used\nimaging modality in broad clinical applications. One of the most important\ntradeoffs in PET imaging is between image quality and radiation dose: high\nimage quality comes with high radiation exposure. Improving image quality is\ndesirable for all clinical applications while minimizing radiation exposure is\nneeded to reduce risk to patients. Approach: We introduce PET Consistency Model\n(PET-CM), an efficient diffusion-based method for generating high-quality\nfull-dose PET images from low-dose PET images. 
It employs a two-step process,\nadding Gaussian noise to full-dose PET images in the forward diffusion, and\nthen denoising them using a PET Shifted-window Vision Transformer (PET-VIT)\nnetwork in the reverse diffusion. The PET-VIT network learns a consistency\nfunction that enables direct denoising of Gaussian noise into clean full-dose\nPET images. PET-CM achieves state-of-the-art image quality while requiring\nsignificantly less computation time than other methods. Results: In experiments\ncomparing eighth-dose to full-dose images, PET-CM demonstrated impressive\nperformance with NMAE of 1.278+/-0.122%, PSNR of 33.783+/-0.824dB, SSIM of\n0.964+/-0.009, NCC of 0.968+/-0.011, HRS of 4.543, and SUV Error of\n0.255+/-0.318%, with an average generation time of 62 seconds per patient. This\nis a significant improvement compared to the state-of-the-art diffusion-based\nmodel with PET-CM reaching this result 12x faster. Similarly, in the\nquarter-dose to full-dose image experiments, PET-CM delivered competitive\noutcomes, achieving an NMAE of 0.973+/-0.066%, PSNR of 36.172+/-0.801dB, SSIM\nof 0.984+/-0.004, NCC of 0.990+/-0.005, HRS of 4.428, and SUV Error of\n0.151+/-0.192% using the same generation process, which underlining its high\nquantitative and clinical precision in both denoising scenario.\n","authors":["Shaoyan Pan","Elham Abouei","Junbo Peng","Joshua Qian","Jacob F Wynne","Tonghe Wang","Chih-Wei Chang","Justin Roper","Jonathon A Nye","Hui Mao","Xiaofeng Yang"],"pdf_url":"https://arxiv.org/pdf/2308.13072v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05916v1","updated":"2024-04-09T00:30:16Z","published":"2024-04-09T00:30:16Z","title":"Prompt-driven Universal Model for View-Agnostic Echocardiography\n Analysis","summary":" Echocardiography segmentation for cardiac analysis is time-consuming and\nresource-intensive due to the variability in image quality and the necessity to\nprocess scans from various standard views. While current automated segmentation\nmethods in echocardiography show promising performance, they are trained on\nspecific scan views to analyze corresponding data. However, this solution has a\nlimitation as the number of required models increases with the number of\nstandard views. To address this, in this paper, we present a prompt-driven\nuniversal method for view-agnostic echocardiography analysis. Considering the\ndomain shift between standard views, we first introduce a method called prompt\nmatching, aimed at learning prompts specific to different views by matching\nprompts and querying input embeddings using a pre-trained vision model. Then,\nwe utilized a pre-trained medical language model to align textual information\nwith pixel data for accurate segmentation. 
Extensive experiments on three\nstandard views showed that our approach significantly outperforms the\nstate-of-the-art universal methods and achieves comparable or even better\nperformances over the segmentation model trained and tested on same views.\n","authors":["Sekeun Kim","Hui Ren","Peng Guo","Abder-Rahman Ali","Patrick Zhang","Kyungsang Kim","Xiang Li","Quanzheng Li"],"pdf_url":"https://arxiv.org/pdf/2404.05916v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05911v1","updated":"2024-04-09T00:05:45Z","published":"2024-04-09T00:05:45Z","title":"LATUP-Net: A Lightweight 3D Attention U-Net with Parallel Convolutions\n for Brain Tumor Segmentation","summary":" Early-stage 3D brain tumor segmentation from magnetic resonance imaging (MRI)\nscans is crucial for prompt and effective treatment. However, this process\nfaces the challenge of precise delineation due to the tumors' complex\nheterogeneity. Moreover, energy sustainability targets and resource\nlimitations, especially in developing countries, require efficient and\naccessible medical imaging solutions. The proposed architecture, a Lightweight\n3D ATtention U-Net with Parallel convolutions, LATUP-Net, addresses these\nissues. It is specifically designed to reduce computational requirements\nsignificantly while maintaining high segmentation performance. By incorporating\nparallel convolutions, it enhances feature representation by capturing\nmulti-scale information. It further integrates an attention mechanism to refine\nsegmentation through selective feature recalibration. LATUP-Net achieves\npromising segmentation performance: the average Dice scores for the whole\ntumor, tumor core, and enhancing tumor on the BraTS2020 dataset are 88.41%,\n83.82%, and 73.67%, and on the BraTS2021 dataset, they are 90.29%, 89.54%, and\n83.92%, respectively. Hausdorff distance metrics further indicate its improved\nability to delineate tumor boundaries. With its significantly reduced\ncomputational demand using only 3.07 M parameters, about 59 times fewer than\nother state-of-the-art models, and running on a single V100 GPU, LATUP-Net\nstands out as a promising solution for real-world clinical applications,\nparticularly in settings with limited resources. Investigations into the\nmodel's interpretability, utilizing gradient-weighted class activation mapping\nand confusion matrices, reveal that while attention mechanisms enhance the\nsegmentation of small regions, their impact is nuanced. Achieving the most\naccurate tumor delineation requires carefully balancing local and global\nfeatures.\n","authors":["Ebtihal J. Alwadee","Xianfang Sun","Yipeng Qin","Frank C. Langbein"],"pdf_url":"https://arxiv.org/pdf/2404.05911v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06657v1","updated":"2024-04-09T23:47:53Z","published":"2024-04-09T23:47:53Z","title":"Res-U2Net: Untrained Deep Learning for Phase Retrieval and Image\n Reconstruction","summary":" Conventional deep learning-based image reconstruction methods require a large\namount of training data which can be hard to obtain in practice. Untrained deep\nlearning methods overcome this limitation by training a network to invert a\nphysical model of the image formation process. Here we present a novel\nuntrained Res-U2Net model for phase retrieval. We use the extracted phase\ninformation to determine changes in an object's surface and generate a mesh\nrepresentation of its 3D structure. 
We compare the performance of Res-U2Net\nphase retrieval against UNet and U2Net using images from the GDXRAY dataset.\n","authors":["Carlos Osorio Quero","Daniel Leykam","Irving Rondon Ojeda"],"pdf_url":"https://arxiv.org/pdf/2404.06657v1.pdf","comment":"16 pages, 8 figures, 4 Tables"},{"id":"http://arxiv.org/abs/2312.00825v2","updated":"2024-04-09T23:28:49Z","published":"2023-11-30T18:32:14Z","title":"SocialCounterfactuals: Probing and Mitigating Intersectional Social\n Biases in Vision-Language Models with Counterfactual Examples","summary":" While vision-language models (VLMs) have achieved remarkable performance\nimprovements recently, there is growing evidence that these models also posses\nharmful biases with respect to social attributes such as gender and race. Prior\nstudies have primarily focused on probing such bias attributes individually\nwhile ignoring biases associated with intersections between social attributes.\nThis could be due to the difficulty of collecting an exhaustive set of\nimage-text pairs for various combinations of social attributes. To address this\nchallenge, we employ text-to-image diffusion models to produce counterfactual\nexamples for probing intersectional social biases at scale. Our approach\nutilizes Stable Diffusion with cross attention control to produce sets of\ncounterfactual image-text pairs that are highly similar in their depiction of a\nsubject (e.g., a given occupation) while differing only in their depiction of\nintersectional social attributes (e.g., race & gender). Through our\nover-generate-then-filter methodology, we produce SocialCounterfactuals, a\nhigh-quality dataset containing 171k image-text pairs for probing\nintersectional biases related to gender, race, and physical characteristics. We\nconduct extensive experiments to demonstrate the usefulness of our generated\ndataset for probing and mitigating intersectional social biases in\nstate-of-the-art VLMs.\n","authors":["Phillip Howard","Avinash Madasu","Tiep Le","Gustavo Lujan Moreno","Anahita Bhiwandiwalla","Vasudev Lal"],"pdf_url":"https://arxiv.org/pdf/2312.00825v2.pdf","comment":"Accepted to CVPR 2024. arXiv admin note: text overlap with\n arXiv:2310.02988"},{"id":"http://arxiv.org/abs/2404.06653v1","updated":"2024-04-09T23:24:19Z","published":"2024-04-09T23:24:19Z","title":"FlameFinder: Illuminating Obscured Fire through Smoke with Attentive\n Deep Metric Learning","summary":" FlameFinder is a deep metric learning (DML) framework designed to accurately\ndetect flames, even when obscured by smoke, using thermal images from\nfirefighter drones during wildfire monitoring. Traditional RGB cameras struggle\nin such conditions, but thermal cameras can capture smoke-obscured flame\nfeatures. However, they lack absolute thermal reference points, leading to\nfalse positives.To address this issue, FlameFinder utilizes paired thermal-RGB\nimages for training. By learning latent flame features from smoke-free samples,\nthe model becomes less biased towards relative thermal gradients. In testing,\nit identifies flames in smoky patches by analyzing their equivalent\nthermal-domain distribution. This method improves performance using both\nsupervised and distance-based clustering metrics.The framework incorporates a\nflame segmentation method and a DML-aided detection framework. This includes\nutilizing center loss (CL), triplet center loss (TCL), and triplet cosine\ncenter loss (TCCL) to identify optimal cluster representatives for\nclassification. 
However, the dominance of center loss over the other losses\nleads to the model missing features sensitive to them. To address this\nlimitation, an attention mechanism is proposed. This mechanism allows for\nnon-uniform feature contribution, amplifying the critical role of cosine and\ntriplet loss in the DML framework. Additionally, it improves interpretability,\nclass discrimination, and decreases intra-class variance. As a result, the\nproposed model surpasses the baseline by 4.4% in the FLAME2 dataset and 7% in\nthe FLAME3 dataset for unobscured flame detection accuracy. Moreover, it\ndemonstrates enhanced class separation in obscured scenarios compared to VGG19,\nResNet18, and three backbone models tailored for flame detection.\n","authors":["Hossein Rajoli","Sahand Khoshdel","Fatemeh Afghah","Xiaolong Ma"],"pdf_url":"https://arxiv.org/pdf/2404.06653v1.pdf","comment":"Submitted as a Journal Paper to IEEE Transactions on Geoscience and\n Remote Sensing"},{"id":"http://arxiv.org/abs/2404.05139v2","updated":"2024-04-09T23:17:07Z","published":"2024-04-08T01:38:43Z","title":"Better Monocular 3D Detectors with LiDAR from the Past","summary":" Accurate 3D object detection is crucial to autonomous driving. Though\nLiDAR-based detectors have achieved impressive performance, the high cost of\nLiDAR sensors precludes their widespread adoption in affordable vehicles.\nCamera-based detectors are cheaper alternatives but often suffer inferior\nperformance compared to their LiDAR-based counterparts due to inherent depth\nambiguities in images. In this work, we seek to improve monocular 3D detectors\nby leveraging unlabeled historical LiDAR data. Specifically, at inference time,\nwe assume that the camera-based detectors have access to multiple unlabeled\nLiDAR scans from past traversals at locations of interest (potentially from\nother high-end vehicles equipped with LiDAR sensors). Under this setup, we\nproposed a novel, simple, and end-to-end trainable framework, termed\nAsyncDepth, to effectively extract relevant features from asynchronous LiDAR\ntraversals of the same location for monocular 3D detectors. We show consistent\nand significant performance gain (up to 9 AP) across multiple state-of-the-art\nmodels and datasets with a negligible additional latency of 9.66 ms and a small\nstorage cost.\n","authors":["Yurong You","Cheng Perng Phoo","Carlos Andres Diaz-Ruiz","Katie Z Luo","Wei-Lun Chao","Mark Campbell","Bharath Hariharan","Kilian Q Weinberger"],"pdf_url":"https://arxiv.org/pdf/2404.05139v2.pdf","comment":"Accepted by ICRA 2024. The code can be found at\n https://github.com/YurongYou/AsyncDepth"},{"id":"http://arxiv.org/abs/2404.06638v1","updated":"2024-04-09T22:17:20Z","published":"2024-04-09T22:17:20Z","title":"SAM-I-Am: Semantic Boosting for Zero-shot Atomic-Scale Electron\n Micrograph Segmentation","summary":" Image segmentation is a critical enabler for tasks ranging from medical\ndiagnostics to autonomous driving. However, the correct segmentation semantics\n- where are boundaries located? what segments are logically similar? - change\ndepending on the domain, such that state-of-the-art foundation models can\ngenerate meaningless and incorrect results. Moreover, in certain domains,\nfine-tuning and retraining techniques are infeasible: obtaining labels is\ncostly and time-consuming; domain images (micrographs) can be exponentially\ndiverse; and data sharing (for third-party retraining) is restricted. 
To enable\nrapid adaptation of the best segmentation technology, we propose the concept of\nsemantic boosting: given a zero-shot foundation model, guide its segmentation\nand adjust results to match domain expectations. We apply semantic boosting to\nthe Segment Anything Model (SAM) to obtain microstructure segmentation for\ntransmission electron microscopy. Our booster, SAM-I-Am, extracts geometric and\ntextural features of various intermediate masks to perform mask removal and\nmask merging operations. We demonstrate a zero-shot performance increase of\n(absolute) +21.35%, +12.6%, +5.27% in mean IoU, and a -9.91%, -18.42%, -4.06%\ndrop in mean false positive masks across images of three difficulty classes\nover vanilla SAM (ViT-L).\n","authors":["Waqwoya Abebe","Jan Strube","Luanzheng Guo","Nathan R. Tallent","Oceane Bel","Steven Spurgeon","Christina Doty","Ali Jannesari"],"pdf_url":"https://arxiv.org/pdf/2404.06638v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06637v1","updated":"2024-04-09T22:16:34Z","published":"2024-04-09T22:16:34Z","title":"GeoSynth: Contextually-Aware High-Resolution Satellite Image Synthesis","summary":" We present GeoSynth, a model for synthesizing satellite images with global\nstyle and image-driven layout control. The global style control is via textual\nprompts or geographic location. These enable the specification of scene\nsemantics or regional appearance respectively, and can be used together. We\ntrain our model on a large dataset of paired satellite imagery, with\nautomatically generated captions, and OpenStreetMap data. We evaluate various\ncombinations of control inputs, including different types of layout controls.\nResults demonstrate that our model can generate diverse, high-quality images\nand exhibits excellent zero-shot generalization. The code and model checkpoints\nare available at https://github.com/mvrl/GeoSynth.\n","authors":["Srikumar Sastry","Subash Khanal","Aayush Dhakal","Nathan Jacobs"],"pdf_url":"https://arxiv.org/pdf/2404.06637v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05195v2","updated":"2024-04-09T22:14:37Z","published":"2024-02-07T19:07:10Z","title":"$λ$-ECLIPSE: Multi-Concept Personalized Text-to-Image Diffusion\n Models by Leveraging CLIP Latent Space","summary":" Despite the recent advances in personalized text-to-image (P-T2I) generative\nmodels, it remains challenging to perform finetuning-free multi-subject-driven\nT2I in a resource-efficient manner. Predominantly, contemporary approaches,\ninvolving the training of Hypernetworks and Multimodal Large Language Models\n(MLLMs), require heavy computing resources that range from 600 to 12300 GPU\nhours of training. These subject-driven T2I methods hinge on Latent Diffusion\nModels (LDMs), which facilitate T2I mapping through cross-attention layers.\nWhile LDMs offer distinct advantages, P-T2I methods' reliance on the latent\nspace of these diffusion models significantly escalates resource demands,\nleading to inconsistent results and necessitating numerous iterations for a\nsingle desired image. In this paper, we present $\\lambda$-ECLIPSE, an\nalternative prior-training strategy that works in the latent space of a\npre-trained CLIP model without relying on the diffusion UNet models.\n$\\lambda$-ECLIPSE leverages the image-text interleaved pre-training for fast\nand effective multi-subject-driven P-T2I. 
Through extensive experiments, we\nestablish that $\\lambda$-ECLIPSE surpasses existing baselines in composition\nalignment while preserving concept alignment performance, even with\nsignificantly lower resource utilization. $\\lambda$-ECLIPSE performs\nmulti-subject driven P-T2I with just 34M parameters and is trained on a mere 74\nGPU hours. Additionally, $\\lambda$-ECLIPSE demonstrates the unique ability to\nperform multi-concept interpolations.\n","authors":["Maitreya Patel","Sangmin Jung","Chitta Baral","Yezhou Yang"],"pdf_url":"https://arxiv.org/pdf/2402.05195v2.pdf","comment":"Project page: https://eclipse-t2i.github.io/Lambda-ECLIPSE/"},{"id":"http://arxiv.org/abs/2312.04746v2","updated":"2024-04-09T21:48:42Z","published":"2023-12-07T23:16:37Z","title":"Quilt-LLaVA: Visual Instruction Tuning by Extracting Localized\n Narratives from Open-Source Histopathology Videos","summary":" Diagnosis in histopathology requires a global whole slide images (WSIs)\nanalysis, requiring pathologists to compound evidence from different WSI\npatches. The gigapixel scale of WSIs poses a challenge for histopathology\nmulti-modal models. Training multi-model models for histopathology requires\ninstruction tuning datasets, which currently contain information for individual\nimage patches, without a spatial grounding of the concepts within each patch\nand without a wider view of the WSI. Therefore, they lack sufficient diagnostic\ncapacity for histopathology. To bridge this gap, we introduce Quilt-Instruct, a\nlarge-scale dataset of 107,131 histopathology-specific instruction\nquestion/answer pairs, grounded within diagnostically relevant image patches\nthat make up the WSI. Our dataset is collected by leveraging educational\nhistopathology videos from YouTube, which provides spatial localization of\nnarrations by automatically extracting the narrators' cursor positions.\nQuilt-Instruct supports contextual reasoning by extracting diagnosis and\nsupporting facts from the entire WSI. Using Quilt-Instruct, we train\nQuilt-LLaVA, which can reason beyond the given single image patch, enabling\ndiagnostic reasoning across patches. To evaluate Quilt-LLaVA, we propose a\ncomprehensive evaluation dataset created from 985 images and 1283\nhuman-generated question-answers. We also thoroughly evaluate Quilt-LLaVA using\npublic histopathology datasets, where Quilt-LLaVA significantly outperforms\nSOTA by over 10% on relative GPT-4 score and 4% and 9% on open and closed set\nVQA. Our code, data, and model are publicly accessible at\nquilt-llava.github.io.\n","authors":["Mehmet Saygin Seyfioglu","Wisdom O. Ikezogwo","Fatemeh Ghezloo","Ranjay Krishna","Linda Shapiro"],"pdf_url":"https://arxiv.org/pdf/2312.04746v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06622v1","updated":"2024-04-09T21:12:31Z","published":"2024-04-09T21:12:31Z","title":"Calibrating Higher-Order Statistics for Few-Shot Class-Incremental\n Learning with Pre-trained Vision Transformers","summary":" Few-shot class-incremental learning (FSCIL) aims to adapt the model to new\nclasses from very few data (5 samples) without forgetting the previously\nlearned classes. Recent works in many-shot CIL (MSCIL) (using all available\ntraining data) exploited pre-trained models to reduce forgetting and achieve\nbetter plasticity. In a similar fashion, we use ViT models pre-trained on\nlarge-scale datasets for few-shot settings, which face the critical issue of\nlow plasticity. 
FSCIL methods start with a many-shot first task to learn a very\ngood feature extractor and then move to the few-shot setting from the second\ntask onwards. While the focus of most recent studies is on how to learn the\nmany-shot first task so that the model generalizes to all future few-shot\ntasks, we explore in this work how to better model the few-shot data using\npre-trained models, irrespective of how the first task is trained. Inspired by\nrecent works in MSCIL, we explore how using higher-order feature statistics can\ninfluence the classification of few-shot classes. We identify the main\nchallenge of obtaining a good covariance matrix from few-shot data and propose\nto calibrate the covariance matrix for new classes based on semantic similarity\nto the many-shot base classes. Using the calibrated feature statistics in\ncombination with existing methods significantly improves few-shot continual\nclassification on several FSCIL benchmarks. Code is available at\nhttps://github.com/dipamgoswami/FSCIL-Calibration.\n","authors":["Dipam Goswami","Bartłomiej Twardowski","Joost van de Weijer"],"pdf_url":"https://arxiv.org/pdf/2404.06622v1.pdf","comment":"Accepted at CLVision workshop (CVPR 2024)"},{"id":"http://arxiv.org/abs/2403.08092v2","updated":"2024-04-09T20:55:01Z","published":"2024-03-12T22:03:19Z","title":"Mitigating the Impact of Attribute Editing on Face Recognition","summary":" Through a large-scale study over diverse face images, we show that facial\nattribute editing using modern generative AI models can severely degrade\nautomated face recognition systems. This degradation persists even with\nidentity-preserving generative models. To mitigate this issue, we propose two\nnovel techniques for local and global attribute editing. We empirically ablate\ntwenty-six facial semantic, demographic and expression-based attributes that\nhave been edited using state-of-the-art generative models, and evaluate them\nusing ArcFace and AdaFace matchers on CelebA, CelebAMaskHQ and LFW datasets.\nFinally, we use LLaVA, an emerging visual question-answering framework for\nattribute prediction to validate our editing techniques. Our methods outperform\nthe current state-of-the-art at facial editing (BLIP, InstantID) while\nimproving identity retention by a significant extent.\n","authors":["Sudipta Banerjee","Sai Pranaswi Mullangi","Shruti Wagle","Chinmay Hegde","Nasir Memon"],"pdf_url":"https://arxiv.org/pdf/2403.08092v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2404.06605v1","updated":"2024-04-09T20:24:29Z","published":"2024-04-09T20:24:29Z","title":"RoadBEV: Road Surface Reconstruction in Bird's Eye View","summary":" Road surface conditions, especially geometry profiles, enormously affect\ndriving performance of autonomous vehicles. Vision-based online road\nreconstruction promisingly captures road information in advance. Existing\nsolutions like monocular depth estimation and stereo matching suffer from\nmodest performance. The recent technique of Bird's-Eye-View (BEV) perception\nprovides immense potential to more reliable and accurate reconstruction. This\npaper uniformly proposes two simple yet effective models for road elevation\nreconstruction in BEV named RoadBEV-mono and RoadBEV-stereo, which estimate\nroad elevation with monocular and stereo images, respectively. 
The former\ndirectly fits elevation values based on voxel features queried from image view,\nwhile the latter efficiently recognizes road elevation patterns based on BEV\nvolume representing discrepancy between left and right voxel features.\nInsightful analyses reveal their consistence and difference with perspective\nview. Experiments on real-world dataset verify the models' effectiveness and\nsuperiority. Elevation errors of RoadBEV-mono and RoadBEV-stereo achieve 1.83cm\nand 0.56cm, respectively. The estimation performance improves by 50\\% in BEV\nbased on monocular image. Our models are promising for practical applications,\nproviding valuable references for vision-based BEV perception in autonomous\ndriving. The code is released at https://github.com/ztsrxh/RoadBEV.\n","authors":["Tong Zhao","Lei Yang","Yichen Xie","Mingyu Ding","Masayoshi Tomizuka","Yintao Wei"],"pdf_url":"https://arxiv.org/pdf/2404.06605v1.pdf","comment":"Dataset page: https://thu-rsxd.com/rsrd Code:\n https://github.com/ztsrxh/RoadBEV"},{"id":"http://arxiv.org/abs/2404.06593v1","updated":"2024-04-09T19:49:01Z","published":"2024-04-09T19:49:01Z","title":"Spatially Optimized Compact Deep Metric Learning Model for Similarity\n Search","summary":" Spatial optimization is often overlooked in many computer vision tasks.\nFilters should be able to recognize the features of an object regardless of\nwhere it is in the image. Similarity search is a crucial task where spatial\nfeatures decide an important output. The capacity of convolution to capture\nvisual patterns across various locations is limited. In contrast to\nconvolution, the involution kernel is dynamically created at each pixel based\non the pixel value and parameters that have been learned. This study\ndemonstrates that utilizing a single layer of involution feature extractor\nalongside a compact convolution model significantly enhances the performance of\nsimilarity search. Additionally, we improve predictions by using the GELU\nactivation function rather than the ReLU. The negligible amount of weight\nparameters in involution with a compact model with better performance makes the\nmodel very useful in real-world implementations. Our proposed model is below 1\nmegabyte in size. We have experimented with our proposed methodology and other\nmodels on CIFAR-10, FashionMNIST, and MNIST datasets. Our proposed method\noutperforms across all three datasets.\n","authors":["Md. Farhadul Islam","Md. Tanzim Reza","Meem Arafat Manab","Mohammad Rakibul Hasan Mahin","Sarah Zabeen","Jannatun Noor"],"pdf_url":"https://arxiv.org/pdf/2404.06593v1.pdf","comment":"5 pages, 3 figures,"},{"id":"http://arxiv.org/abs/2404.06589v1","updated":"2024-04-09T19:33:05Z","published":"2024-04-09T19:33:05Z","title":"Leveraging Latents for Efficient Thermography Classification and\n Segmentation","summary":" Breast cancer is a prominent health concern worldwide, currently being the\nsecondmost common and second-deadliest type of cancer in women. While current\nbreast cancer diagnosis mainly relies on mammography imaging, in recent years\nthe use of thermography for breast cancer imaging has been garnering growing\npopularity. Thermographic imaging relies on infrared cameras to capture\nbody-emitted heat distributions. 
While these heat signatures have proven useful\nfor computer-vision systems for accurate breast cancer segmentation and\nclassification, prior work often relies on handcrafted feature engineering or\ncomplex architectures, potentially limiting the comparability and applicability\nof these methods. In this work, we present a novel algorithm for both breast\ncancer classification and segmentation. Rather than focusing efforts on manual\nfeature and architecture engineering, our algorithm focuses on leveraging an\ninformative, learned feature space, thus making our solution simpler to use and\nextend to other frameworks and downstream tasks, as well as more applicable to\ndata-scarce settings. Our classification produces SOTA results, while we are\nthe first work to produce segmentation regions studied in this paper.\n","authors":["Tamir Shor","Chaim Baskin","Alex Bronstein"],"pdf_url":"https://arxiv.org/pdf/2404.06589v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01102v2","updated":"2024-04-09T19:26:36Z","published":"2024-04-01T13:23:04Z","title":"Diffusion based Zero-shot Medical Image-to-Image Translation for Cross\n Modality Segmentation","summary":" Cross-modality image segmentation aims to segment the target modalities using\na method designed in the source modality. Deep generative models can translate\nthe target modality images into the source modality, thus enabling\ncross-modality segmentation. However, a vast body of existing cross-modality\nimage translation methods relies on supervised learning. In this work, we aim\nto address the challenge of zero-shot learning-based image translation tasks\n(extreme scenarios in the target modality is unseen in the training phase). To\nleverage generative learning for zero-shot cross-modality image segmentation,\nwe propose a novel unsupervised image translation method. The framework learns\nto translate the unseen source image to the target modality for image\nsegmentation by leveraging the inherent statistical consistency between\ndifferent modalities for diffusion guidance. Our framework captures identical\ncross-modality features in the statistical domain, offering diffusion guidance\nwithout relying on direct mappings between the source and target domains. This\nadvantage allows our method to adapt to changing source domains without the\nneed for retraining, making it highly practical when sufficient labeled source\ndomain data is not available. The proposed framework is validated in zero-shot\ncross-modality image segmentation tasks through empirical comparisons with\ninfluential generative models, including adversarial-based and diffusion-based\nmodels.\n","authors":["Zihao Wang","Yingyu Yang","Yuzhou Chen","Tingting Yuan","Maxime Sermesant","Herve Delingette","Ona Wu"],"pdf_url":"https://arxiv.org/pdf/2404.01102v2.pdf","comment":"Neurips 2023 Diffusion Workshop"},{"id":"http://arxiv.org/abs/2212.05140v2","updated":"2024-04-09T19:17:07Z","published":"2022-12-09T22:53:40Z","title":"Local Neighborhood Features for 3D Classification","summary":" With advances in deep learning model training strategies, the training of\nPoint cloud classification methods is significantly improving. For example,\nPointNeXt, which adopts prominent training techniques and InvResNet layers into\nPointNet++, achieves over 7% improvement on the real-world ScanObjectNN\ndataset. 
However, most of these models use point coordinates features of\nneighborhood points mapped to higher dimensional space while ignoring the\nneighborhood point features computed before feeding to the network layers. In\nthis paper, we revisit the PointNeXt model to study the usage and benefit of\nsuch neighborhood point features. We train and evaluate PointNeXt on ModelNet40\n(synthetic), ScanObjectNN (real-world), and a recent large-scale, real-world\ngrocery dataset, i.e., 3DGrocery100. In addition, we provide an additional\ninference strategy of weight averaging the top two checkpoints of PointNeXt to\nimprove classification accuracy. Together with the abovementioned ideas, we\ngain 0.5%, 1%, 4.8%, 3.4%, and 1.6% overall accuracy on the PointNeXt model\nwith real-world datasets, ScanObjectNN (hardest variant), 3DGrocery100's\nApple10, Fruits, Vegetables, and Packages subsets, respectively. We also\nachieve a comparable 0.2% accuracy gain on ModelNet40.\n","authors":["Shivanand Venkanna Sheshappanavar","Chandra Kambhamettu"],"pdf_url":"https://arxiv.org/pdf/2212.05140v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05490v2","updated":"2024-04-09T18:55:43Z","published":"2024-04-08T13:11:57Z","title":"Two-Person Interaction Augmentation with Skeleton Priors","summary":" Close and continuous interaction with rich contacts is a crucial aspect of\nhuman activities (e.g. hugging, dancing) and of interest in many domains like\nactivity recognition, motion prediction, character animation, etc. However,\nacquiring such skeletal motion is challenging. While direct motion capture is\nexpensive and slow, motion editing/generation is also non-trivial, as complex\ncontact patterns with topological and geometric constraints have to be\nretained. To this end, we propose a new deep learning method for two-body\nskeletal interaction motion augmentation, which can generate variations of\ncontact-rich interactions with varying body sizes and proportions while\nretaining the key geometric/topological relations between two bodies. Our\nsystem can learn effectively from a relatively small amount of data and\ngeneralize to drastically different skeleton sizes. Through exhaustive\nevaluation and comparison, we show it can generate high-quality motions, has\nstrong generalizability and outperforms traditional optimization-based methods\nand alternative deep learning solutions.\n","authors":["Baiyi Li","Edmond S. L. Ho","Hubert P. H. Shum","He Wang"],"pdf_url":"https://arxiv.org/pdf/2404.05490v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06564v1","updated":"2024-04-09T18:28:55Z","published":"2024-04-09T18:28:55Z","title":"MambaAD: Exploring State Space Models for Multi-class Unsupervised\n Anomaly Detection","summary":" Recent advancements in anomaly detection have seen the efficacy of CNN- and\ntransformer-based approaches. However, CNNs struggle with long-range\ndependencies, while transformers are burdened by quadratic computational\ncomplexity. Mamba-based models, with their superior long-range modeling and\nlinear efficiency, have garnered substantial attention. This study pioneers the\napplication of Mamba to multi-class unsupervised anomaly detection, presenting\nMambaAD, which consists of a pre-trained encoder and a Mamba decoder featuring\nLocality-Enhanced State Space (LSS) modules at multi-scales. The proposed LSS\nmodule, integrating parallel cascaded (Hybrid State Space) HSS blocks and\nmulti-kernel convolutions operations, effectively captures both long-range and\nlocal information. 
The HSS block, utilizing (Hybrid Scanning) HS encoders,\nencodes feature maps into five scanning methods and eight directions, thereby\nstrengthening global connections through the (State Space Model) SSM. The use\nof Hilbert scanning and eight directions significantly improves feature\nsequence modeling. Comprehensive experiments on six diverse anomaly detection\ndatasets and seven metrics demonstrate SoTA performance, substantiating the\nmethod's effectiveness.\n","authors":["Haoyang He","Yuhu Bai","Jiangning Zhang","Qingdong He","Hongxu Chen","Zhenye Gan","Chengjie Wang","Xiangtai Li","Guanzhong Tian","Lei Xie"],"pdf_url":"https://arxiv.org/pdf/2404.06564v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02527v3","updated":"2024-04-09T18:26:27Z","published":"2024-03-04T22:42:17Z","title":"A dataset of over one thousand computed tomography scans of battery\n cells","summary":" Battery technology is increasingly important for global electrification\nefforts. However, batteries are highly sensitive to small manufacturing\nvariations that can induce reliability or safety issues. An important\ntechnology for battery quality control is computed tomography (CT) scanning,\nwhich is widely used for non-destructive 3D inspection across a variety of\nclinical and industrial applications. Historically, however, the utility of CT\nscanning for high-volume manufacturing has been limited by its low throughput\nas well as the difficulty of handling its large file sizes. In this work, we\npresent a dataset of over one thousand CT scans of as-produced commercially\navailable batteries. The dataset spans various chemistries (lithium-ion and\nsodium-ion) as well as various battery form factors (cylindrical, pouch, and\nprismatic). We evaluate seven different battery types in total. The\nmanufacturing variability and the presence of battery defects can be observed\nvia this dataset. This dataset may be of interest to scientists and engineers\nworking on battery technology, computer vision, or both.\n","authors":["Amariah Condon","Bailey Buscarino","Eric Moch","William J. Sehnert","Owen Miles","Patrick K. Herring","Peter M. Attia"],"pdf_url":"https://arxiv.org/pdf/2403.02527v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08514v2","updated":"2024-04-09T18:23:39Z","published":"2023-12-13T21:02:03Z","title":"TAM-VT: Transformation-Aware Multi-scale Video Transformer for\n Segmentation and Tracking","summary":" Video Object Segmentation (VOS) has emerged as an increasingly important\nproblem with availability of larger datasets and more complex and realistic\nsettings, which involve long videos with global motion (e.g, in egocentric\nsettings), depicting small objects undergoing both rigid and non-rigid\n(including state) deformations. While a number of recent approaches have been\nexplored for this task, these data characteristics still present challenges. In\nthis work we propose a novel, clip-based DETR-style encoder-decoder\narchitecture, which focuses on systematically analyzing and addressing\naforementioned challenges. Specifically, we propose a novel\ntransformation-aware loss that focuses learning on portions of the video where\nan object undergoes significant deformations -- a form of \"soft\" hard examples\nmining. 
Further, we propose a multiplicative time-coded memory, beyond vanilla\nadditive positional encoding, which helps propagate context across long videos.\nFinally, we incorporate these in our proposed holistic multi-scale video\ntransformer for tracking via multi-scale memory matching and decoding to ensure\nsensitivity and accuracy for long videos and small objects. Our model enables\non-line inference with long videos in a windowed fashion, by breaking the video\ninto clips and propagating context among them. We illustrate that short clip\nlength and longer memory with learned time-coding are important design choices\nfor improved performance. Collectively, these technical contributions enable\nour model to achieve new state-of-the-art (SoTA) performance on two complex\negocentric datasets -- VISOR and VOST, while achieving comparable to SoTA\nresults on the conventional VOS benchmark, DAVIS'17. A series of detailed\nablations validate our design choices as well as provide insights into the\nimportance of parameter choices and their impact on performance.\n","authors":["Raghav Goyal","Wan-Cyuan Fan","Mennatullah Siam","Leonid Sigal"],"pdf_url":"https://arxiv.org/pdf/2312.08514v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06559v1","updated":"2024-04-09T18:23:34Z","published":"2024-04-09T18:23:34Z","title":"The Impact of Print-and-Scan in Heterogeneous Morph Evaluation Scenarios","summary":" Face morphing attacks present an emerging threat to the face recognition\nsystem. On top of that, printing and scanning the morphed images could obscure\nthe artifacts generated during the morphing process, which makes morphed image\ndetection even harder. In this work, we investigate the impact that printing\nand scanning has on morphing attacks through a series of heterogeneous tests.\nOur experiments show that we can increase the possibility of a false match by\nup to 5.64% for DiM and 16.00% for StyleGAN2 when providing an image that has\nbeen printed and scanned, regardless it is morphed or bona fide, to a Face\nRecognition (FR) system. Likewise, using Frechet Inception Distance (FID)\nmetric, strictly print-scanned morph attacks performed on average 9.185%\nstronger than non-print-scanned digital morphs.\n","authors":["Richard E. Neddo","Zander W. Blasingame","Chen Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06559v1.pdf","comment":"Initial preprint. Under review"},{"id":"http://arxiv.org/abs/2404.06542v1","updated":"2024-04-09T18:00:25Z","published":"2024-04-09T18:00:25Z","title":"Training-Free Open-Vocabulary Segmentation with Offline\n Diffusion-Augmented Prototype Generation","summary":" Open-vocabulary semantic segmentation aims at segmenting arbitrary categories\nexpressed in textual form. Previous works have trained over large amounts of\nimage-caption pairs to enforce pixel-level multimodal alignments. However,\ncaptions provide global information about the semantics of a given image but\nlack direct localization of individual concepts. Further, training on\nlarge-scale datasets inevitably brings significant computational costs. In this\npaper, we propose FreeDA, a training-free diffusion-augmented method for\nopen-vocabulary semantic segmentation, which leverages the ability of diffusion\nmodels to visually localize generated concepts and local-global similarities to\nmatch class-agnostic regions with semantic classes. 
Our approach involves an\noffline stage in which textual-visual reference embeddings are collected,\nstarting from a large set of captions and leveraging visual and semantic\ncontexts. At test time, these are queried to support the visual matching\nprocess, which is carried out by jointly considering class-agnostic regions and\nglobal semantic similarities. Extensive analyses demonstrate that FreeDA\nachieves state-of-the-art performance on five datasets, surpassing previous\nmethods by more than 7.0 average points in terms of mIoU and without requiring\nany training.\n","authors":["Luca Barsellotti","Roberto Amoroso","Marcella Cornia","Lorenzo Baraldi","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2404.06542v1.pdf","comment":"CVPR 2024. Project page: https://aimagelab.github.io/freeda/"},{"id":"http://arxiv.org/abs/2208.11650v3","updated":"2024-04-09T17:59:34Z","published":"2022-08-24T16:40:27Z","title":"Lane Change Classification and Prediction with Action Recognition\n Networks","summary":" Anticipating lane change intentions of surrounding vehicles is crucial for\nefficient and safe driving decision making in an autonomous driving system.\nPrevious works often adopt physical variables such as driving speed,\nacceleration and so forth for lane change classification. However, physical\nvariables do not contain semantic information. Although 3D CNNs have been\ndeveloping rapidly, the number of methods utilising action recognition models\nand appearance feature for lane change recognition is low, and they all require\nadditional information to pre-process data. In this work, we propose an\nend-to-end framework including two action recognition methods for lane change\nrecognition, using video data collected by cameras. Our method achieves the\nbest lane change classification results using only the RGB video data of the\nPREVENTION dataset. Class activation maps demonstrate that action recognition\nmodels can efficiently extract lane change motions. A method to better extract\nmotion clues is also proposed in this paper.\n","authors":["Kai Liang","Jun Wang","Abhir Bhalerao"],"pdf_url":"https://arxiv.org/pdf/2208.11650v3.pdf","comment":"Accepted to ECCV2022 AVVISION"}]},"2024-04-10T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2404.07185v1","updated":"2024-04-10T17:40:27Z","published":"2024-04-10T17:40:27Z","title":"Reward Learning from Suboptimal Demonstrations with Applications in\n Surgical Electrocautery","summary":" Automating robotic surgery via learning from demonstration (LfD) techniques\nis extremely challenging. This is because surgical tasks often involve\nsequential decision-making processes with complex interactions of physical\nobjects and have low tolerance for mistakes. Prior works assume that all\ndemonstrations are fully observable and optimal, which might not be practical\nin the real world. This paper introduces a sample-efficient method that learns\na robust reward function from a limited amount of ranked suboptimal\ndemonstrations consisting of partial-view point cloud observations. The method\nthen learns a policy by optimizing the learned reward function using\nreinforcement learning (RL). We show that using a learned reward function to\nobtain a policy is more robust than pure imitation learning. 
We apply our\napproach on a physical surgical electrocautery task and demonstrate that our\nmethod can perform well even when the provided demonstrations are suboptimal\nand the observations are high-dimensional point clouds.\n","authors":["Zohre Karimi","Shing-Hei Ho","Bao Thach","Alan Kuntz","Daniel S. Brown"],"pdf_url":"https://arxiv.org/pdf/2404.07185v1.pdf","comment":"In proceedings of the International Symposium on Medical Robotics\n (ISMR) 2024. Equal contribution from two first authors"},{"id":"http://arxiv.org/abs/2212.11120v2","updated":"2024-04-10T17:15:23Z","published":"2022-12-10T07:50:29Z","title":"Deep Learning for Inertial Sensor Alignment","summary":" Accurate alignment of a fixed mobile device equipped with inertial sensors\ninside a moving vehicle is important for navigation, activity recognition, and\nother applications. Accurate estimation of the device mounting angle is\nrequired to rotate the inertial measurement from the sensor frame to the moving\nplatform frame to standardize measurements and improve the performance of the\ntarget task. In this work, a data-driven approach using deep neural networks\n(DNNs) is proposed to learn the yaw mounting angle of a smartphone equipped\nwith an inertial measurement unit (IMU) and strapped to a car. The proposed\nmodel uses only the accelerometer and gyroscope readings from an IMU as input\nand, in contrast to existing solutions, does not require global position inputs\nfrom global navigation satellite systems (GNSS). To train the model in a\nsupervised manner, IMU data is collected for training and validation with the\nsensor mounted at a known yaw mounting angle, and a range of ground truth\nlabels is generated by applying a random rotation in a bounded range to the\nmeasurements. The trained model is tested on data with real rotations showing\nsimilar performance as with synthetic rotations. The trained model is deployed\non an Android device and evaluated in real-time to test the accuracy of the\nestimated yaw mounting angle. The model is shown to find the mounting angle at\nan accuracy of 8 degrees within 5 seconds, and 4 degrees within 27 seconds. An\nexperiment is conducted to compare the proposed model with an existing\noff-the-shelf solution.\n","authors":["Maxim Freydin","Niv Sfaradi","Nimrod Segol","Areej Eweida","Barak Or"],"pdf_url":"https://arxiv.org/pdf/2212.11120v2.pdf","comment":"9 Pages, Preprint. Accepted IEEE"},{"id":"http://arxiv.org/abs/2404.07168v1","updated":"2024-04-10T17:04:06Z","published":"2024-04-10T17:04:06Z","title":"Using Neural Networks to Model Hysteretic Kinematics in Tendon-Actuated\n Continuum Robots","summary":" The ability to accurately model mechanical hysteretic behavior in\ntendon-actuated continuum robots using deep learning approaches is a growing\narea of interest. In this paper, we investigate the hysteretic response of two\ntypes of tendon-actuated continuum robots and, ultimately, compare three types\nof neural network modeling approaches with both forward and inverse kinematic\nmappings: feedforward neural network (FNN), FNN with a history input buffer,\nand long short-term memory (LSTM) network. We seek to determine which model\nbest captures temporal dependent behavior. We find that, depending on the\nrobot's design, choosing different kinematic inputs can alter whether\nhysteresis is exhibited by the system. 
Furthermore, we present the results of\nthe model fittings, revealing that, in contrast to the standard FNN, both FNN\nwith a history input buffer and the LSTM model exhibit the capacity to model\nhistorical dependence with comparable performance in capturing rate-dependent\nhysteresis.\n","authors":["Yuan Wang","Max McCandless","Abdulhamit Donder","Giovanni Pittiglio","Behnam Moradkhani","Yash Chitalia","Pierre E. Dupont"],"pdf_url":"https://arxiv.org/pdf/2404.07168v1.pdf","comment":"7 pages, 8 figures, conference"},{"id":"http://arxiv.org/abs/2404.07158v1","updated":"2024-04-10T16:49:39Z","published":"2024-04-10T16:49:39Z","title":"CBFKIT: A Control Barrier Function Toolbox for Robotics Applications","summary":" This paper introduces CBFKit, a Python/ROS toolbox for safe robotics planning\nand control under uncertainty. The toolbox provides a general framework for\ndesigning control barrier functions for mobility systems within both\ndeterministic and stochastic environments. It can be connected to the ROS\nopen-source robotics middleware, allowing for the setup of multi-robot\napplications, encoding of environments and maps, and integrations with\npredictive motion planning algorithms. Additionally, it offers multiple CBF\nvariations and algorithms for robot control. The CBFKit is demonstrated on the\nToyota Human Support Robot (HSR) in both simulation and in physical\nexperiments.\n","authors":["Mitchell Black","Georgios Fainekos","Bardh Hoxha","Hideki Okamoto","Danil Prokhorov"],"pdf_url":"https://arxiv.org/pdf/2404.07158v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2310.16077v3","updated":"2024-04-10T16:47:07Z","published":"2023-10-24T17:37:52Z","title":"Femtosecond laser fabricated nitinol living hinges for millimeter-sized\n robots","summary":" Nitinol is a smart material that can be used as an actuator, a sensor, or a\nstructural element, and has the potential to significantly enhance the\ncapabilities of microrobots. Femtosecond laser technology can be used to\nprocess nitinol while avoiding heat-affected zones (HAZ), thus retaining\nsuperelastic properties. In this work, we manufacture living hinges of\narbitrary cross-sections from nitinol using a femtosecond laser micromachining\nprocess. We first determined the laser cutting parameters, 4.1 Jcm^-2 fluence\nwith 5 passes for 5 um ablation, by varying laser power level and number of\npasses. Next, we modeled the hinges using an analytical model as well as\ncreating an Abaqus finite element method, and showed the accuracy of the models\nby comparing them to the torque produced by eight different hinges, four with a\nrectangular cross-section and four with an arc cross-section. Finally, we\nmanufactured three prototype miniature devices to illustrate the usefulness of\nthese nitinol hinges: a sample spherical 5-bar mechanism, a sarrus linkage, and\na piezoelectric actuated robotic wing mechanism.\n","authors":["Alexander Hedrick","Heiko Kabutz","Lawrence Smith","Robert MacCurdy","Kaushik Jayaram"],"pdf_url":"https://arxiv.org/pdf/2310.16077v3.pdf","comment":"7 pages, 4 figures, submitted to IEEE RA-L"},{"id":"http://arxiv.org/abs/2402.16101v2","updated":"2024-04-10T16:20:45Z","published":"2024-02-25T15:00:06Z","title":"Optimizing Base Placement of Surgical Robot: Kinematics Data-Driven\n Approach by Analyzing Working Pattern","summary":" In robot-assisted minimally invasive surgery (RAMIS), optimal placement of\nthe surgical robot base is crucial for successful surgery. 
Improper placement\ncan hinder performance because of manipulator limitations and inaccessible\nworkspaces. Conventional base placement relies on the experience of trained\nmedical staff. This study proposes a novel method for determining the optimal\nbase pose based on the surgeon's working pattern. The proposed method analyzes\nrecorded end-effector poses using a machine learning-based clustering technique\nto identify key positions and orientations preferred by the surgeon. We\nintroduce two scoring metrics to address the joint limit and singularity\nissues: joint margin and manipulability scores. We then train a multi-layer\nperceptron regressor to predict the optimal base pose based on these scores.\nEvaluation in a simulated environment using the da Vinci Research Kit shows\nunique base pose score maps for four volunteers, highlighting the individuality\nof the working patterns. Results comparing with 20,000 randomly selected base\nposes suggest that the score obtained using the proposed method is 28.2% higher\nthan that obtained by random base placement. These results emphasize the need\nfor operator-specific optimization during base placement in RAMIS.\n","authors":["Jeonghyeon Yoon","Junhyun Park","Hyojae Park","Hakyoon Lee","Sangwon Lee","Minho Hwang"],"pdf_url":"https://arxiv.org/pdf/2402.16101v2.pdf","comment":"8 pages, 7 figures, 2 tables"},{"id":"http://arxiv.org/abs/2312.00068v2","updated":"2024-04-10T16:04:48Z","published":"2023-11-29T20:59:00Z","title":"GLiDR: Topologically Regularized Graph Generative Network for Sparse\n LiDAR Point Clouds","summary":" Sparse LiDAR point clouds cause severe loss of detail of static structures\nand reduce the density of static points available for navigation. Reduced\ndensity can be detrimental to navigation under several scenarios. We observe\nthat despite high sparsity, in most cases, the global topology of LiDAR\noutlining the static structures can be inferred. We utilize this property to\nobtain a backbone skeleton of a LiDAR scan in the form of a single connected\ncomponent that is a proxy to its global topology. We utilize the backbone to\naugment new points along static structures to overcome sparsity. Newly\nintroduced points could correspond to existing static structures or to static\npoints that were earlier obstructed by dynamic objects. To the best of our\nknowledge, we are the first to use such a strategy for sparse LiDAR point\nclouds. Existing solutions close to our approach fail to identify and preserve\nthe global static LiDAR topology and generate sub-optimal points. We propose\nGLiDR, a Graph Generative network that is topologically regularized using\n0-dimensional Persistent Homology ($\\mathcal{PH}$) constraints. This enables\nGLiDR to introduce newer static points along a topologically consistent global\nstatic LiDAR backbone. GLiDR generates precise static points using $32\\times$\nsparser dynamic scans and performs better than the baselines across three\ndatasets. GLiDR generates a valuable byproduct - an accurate binary\nsegmentation mask of static and dynamic objects that are helpful for navigation\nplanning and safety in constrained environments. The newly introduced static\npoints allow GLiDR to outperform LiDAR-based navigation using SLAM in several\nsettings. 
Source code is available at\n$\\texttt{https://github.com/GLiDR-CVPR2024/GLiDR}$.\n","authors":["Prashant Kumar","Kshitij Madhav Bhat","Vedang Bhupesh Shenvi Nadkarni","Prem Kalra"],"pdf_url":"https://arxiv.org/pdf/2312.00068v2.pdf","comment":"IEEE / CVF Computer Vision and Pattern Recognition Conference (CVPR)"},{"id":"http://arxiv.org/abs/2404.07110v1","updated":"2024-04-10T15:47:35Z","published":"2024-04-10T15:47:35Z","title":"Wild Visual Navigation: Fast Traversability Learning via Pre-Trained\n Models and Online Self-Supervision","summary":" Natural environments such as forests and grasslands are challenging for\nrobotic navigation because of the false perception of rigid obstacles from high\ngrass, twigs, or bushes. In this work, we present Wild Visual Navigation (WVN),\nan online self-supervised learning system for visual traversability estimation.\nThe system is able to continuously adapt from a short human demonstration in\nthe field, only using onboard sensing and computing. One of the key ideas to\nachieve this is the use of high-dimensional features from pre-trained\nself-supervised models, which implicitly encode semantic information that\nmassively simplifies the learning task. Further, the development of an online\nscheme for supervision generator enables concurrent training and inference of\nthe learned model in the wild. We demonstrate our approach through diverse\nreal-world deployments in forests, parks, and grasslands. Our system is able to\nbootstrap the traversable terrain segmentation in less than 5 min of in-field\ntraining time, enabling the robot to navigate in complex, previously unseen\noutdoor terrains. Code: https://bit.ly/498b0CV - Project\npage:https://bit.ly/3M6nMHH\n","authors":["Matías Mattamala","Jonas Frey","Piotr Libera","Nived Chebrolu","Georg Martius","Cesar Cadena","Marco Hutter","Maurice Fallon"],"pdf_url":"https://arxiv.org/pdf/2404.07110v1.pdf","comment":"Extended version of arXiv:2305.08510"},{"id":"http://arxiv.org/abs/2401.10831v3","updated":"2024-04-10T15:19:07Z","published":"2024-01-19T17:27:21Z","title":"Understanding Video Transformers via Universal Concept Discovery","summary":" This paper studies the problem of concept-based interpretability of\ntransformer representations for videos. Concretely, we seek to explain the\ndecision-making process of video transformers based on high-level,\nspatiotemporal concepts that are automatically discovered. Prior research on\nconcept-based interpretability has concentrated solely on image-level tasks.\nComparatively, video models deal with the added temporal dimension, increasing\ncomplexity and posing challenges in identifying dynamic concepts over time. In\nthis work, we systematically address these challenges by introducing the first\nVideo Transformer Concept Discovery (VTCD) algorithm. To this end, we propose\nan efficient approach for unsupervised identification of units of video\ntransformer representations - concepts, and ranking their importance to the\noutput of a model. The resulting concepts are highly interpretable, revealing\nspatio-temporal reasoning mechanisms and object-centric representations in\nunstructured video models. Performing this analysis jointly over a diverse set\nof supervised and self-supervised representations, we discover that some of\nthese mechanism are universal in video transformers. 
Finally, we show that VTCD\ncan be used for fine-grained action recognition and video object segmentation.\n","authors":["Matthew Kowal","Achal Dave","Rares Ambrus","Adrien Gaidon","Konstantinos G. Derpanis","Pavel Tokmakov"],"pdf_url":"https://arxiv.org/pdf/2401.10831v3.pdf","comment":"CVPR 2024 (Highlight)"},{"id":"http://arxiv.org/abs/2404.07063v1","updated":"2024-04-10T14:52:35Z","published":"2024-04-10T14:52:35Z","title":"LaPlaSS: Latent Space Planning for Stochastic Systems","summary":" Autonomous mobile agents often operate in hazardous environments,\nnecessitating an awareness of safety. These agents can have non-linear,\nstochastic dynamics that must be considered during planning to guarantee\nbounded risk. Most state of the art methods require closed-form dynamics to\nverify plan correctness and safety however modern robotic systems often have\ndynamics that are learned from data. Thus, there is a need to perform efficient\ntrajectory planning with guarantees on risk for agents without known dynamics\nmodels. We propose a \"generate-and-test\" approach to risk-bounded planning in\nwhich a planner generates a candidate trajectory using an approximate linear\ndynamics model and a validator assesses the risk of the trajectory, computing\nadditional safety constraints for the planner if the candidate does not satisfy\nthe desired risk bound. To acquire the approximate model, we use a variational\nautoencoder to learn a latent linear dynamics model and encode the planning\nproblem into the latent space to generate the candidate trajectory. The VAE\nalso serves to sample trajectories around the candidate to use in the\nvalidator. We demonstrate that our algorithm, LaPlaSS, is able to generate\ntrajectory plans with bounded risk for a real-world agent with learned dynamics\nand is an order of magnitude more efficient than the state of the art.\n","authors":["Marlyse Reeves","Brian C. Williams"],"pdf_url":"https://arxiv.org/pdf/2404.07063v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.14432v3","updated":"2024-04-10T14:17:36Z","published":"2024-02-22T10:22:27Z","title":"Exploring the Influence of Driving Context on Lateral Driving Style\n Preferences: A Simulator-Based Study","summary":" Technological advancements focus on developing comfortable and acceptable\ndriving characteristics in autonomous vehicles. Present driving functions\npredominantly possess predefined parameters, and there is no universally\naccepted driving style for autonomous vehicles. While driving may be\ntechnically safe and the likelihood of road accidents is reduced, passengers\nmay still feel insecure due to a mismatch in driving styles between the human\nand the autonomous system. Incorporating driving style preferences into\nautomated vehicles enhances acceptance, reduces uncertainty, and poses the\nopportunity to expedite their adoption. Despite the increased research focus on\ndriving styles, there remains a need for comprehensive studies investigating\nhow variations in the driving context impact the assessment of automated\ndriving functions. Therefore, this work evaluates lateral driving style\npreferences for autonomous vehicles on rural roads, considering different\nweather and traffic situations. A controlled study was conducted with a variety\nof German participants utilizing a high-fidelity driving simulator. The\nsubjects experienced four different driving styles, including mimicking of\ntheir own driving behavior under two weather conditions. 
A notable preference\nfor a more passive driving style became evident based on statistical analyses\nof participants' responses during and after the drives. This study could not\nconfirm the hypothesis that subjects prefer to be driven in a style mimicking their own\ndriving behavior. Furthermore, the study illustrated that weather conditions\nand oncoming traffic substantially influence the perceived comfort during\nautonomous rides. The gathered dataset is openly accessible at\nhttps://www.kaggle.com/datasets/jhaselberger/idcld-subject-study-on-driving-style-preferences.\n","authors":["Johann Haselberger","Maximilian Böhle","Bernhard Schick","Steffen Müller"],"pdf_url":"https://arxiv.org/pdf/2402.14432v3.pdf","comment":"19 pages, 5 figures; This work has been submitted to the IEEE for\n possible publication. Copyright may be transferred without notice, after\n which this version may no longer be accessible"},{"id":"http://arxiv.org/abs/2402.03893v2","updated":"2024-04-10T13:34:24Z","published":"2024-02-06T10:58:13Z","title":"Prediction Horizon Requirements for Automated Driving: Optimizing\n Safety, Comfort, and Efficiency","summary":" Predicting the movement of other road users is beneficial for improving\nautomated vehicle (AV) performance. However, the relationship between the time\nhorizon associated with these predictions and AV performance remains unclear.\nDespite the existence of numerous trajectory prediction algorithms, no studies\nhave been conducted on how varying prediction lengths affect AV safety and\nother vehicle performance metrics, resulting in undefined horizon requirements\nfor prediction methods. Our study addresses this gap by examining the effects\nof different prediction horizons on AV performance, focusing on safety,\ncomfort, and efficiency. Through multiple experiments using a state-of-the-art,\nrisk-based predictive trajectory planner, we simulated predictions with\nhorizons up to 20 seconds. Based on our simulations, we propose a framework for\nspecifying the minimum required and optimal prediction horizons based on\nspecific AV performance criteria and application needs. Our results indicate\nthat a horizon of 1.6 seconds is required to prevent collisions with crossing\npedestrians, horizons of 7-8 seconds yield the best efficiency, and horizons up\nto 15 seconds improve passenger comfort. We conclude that prediction horizon\nrequirements are application-dependent, and recommend aiming for a prediction\nhorizon of 11.8 seconds as a general guideline for applications involving\ncrossing pedestrians.\n","authors":["Manuel Muñoz Sánchez","Chris van der Ploeg","Robin Smit","Jos Elfring","Emilia Silvas","René van de Molengraft"],"pdf_url":"https://arxiv.org/pdf/2402.03893v2.pdf","comment":"Submitted to IEEE Intelligent Vehicles Symposium. 9 pages. 10\n figures. 6 tables"},{"id":"http://arxiv.org/abs/2404.00066v2","updated":"2024-04-10T12:54:30Z","published":"2024-03-28T08:40:53Z","title":"Local Observability of VINS and LINS","summary":" This work analyzes the unobservable directions of the Vision-aided Inertial\nNavigation System (VINS) and Lidar-aided Inertial Navigation System (LINS)\nnonlinear models. Under the assumption that there exist two features observed by\nthe camera without occlusion, the unobservable directions of VINS are uniformly the global\ntranslations and the global rotations about the gravity vector. The\nunobservable directions of LINS are the same as those of VINS, while only one feature needs\nto be observed. 
Also, a constraint in Observability-Constrained VINS (OC-VINS)\nis proved.\n","authors":["Xinran Li"],"pdf_url":"https://arxiv.org/pdf/2404.00066v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06974v1","updated":"2024-04-10T12:38:38Z","published":"2024-04-10T12:38:38Z","title":"Deep Reinforcement Learning for Mobile Robot Path Planning","summary":" Path planning is an important problem with applications in many\nareas, such as video games, robotics, etc. This paper proposes a novel method\nto address the problem of Deep Reinforcement Learning (DRL) based path planning\nfor a mobile robot. We design DRL-based algorithms, including reward functions,\nand parameter optimization, to avoid time-consuming work in a 2D environment.\nWe also designed a two-way search hybrid A* algorithm to improve the quality\nof local path planning. We transferred the designed algorithm to a simple\nembedded environment to test the computational load of the algorithm when\nrunning on a mobile robot. Experiments show that when deployed on a robot\nplatform, the DRL-based algorithm in this article can achieve better planning\nresults and consume fewer computing resources.\n","authors":["Hao Liu","Yi Shen","Shuangjiang Yu","Zijun Gao","Tong Wu"],"pdf_url":"https://arxiv.org/pdf/2404.06974v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01654v3","updated":"2024-04-10T12:15:46Z","published":"2023-08-03T09:37:34Z","title":"Towards a Safe Real-Time Motion Planning Framework for Autonomous\n Driving Systems: An MPPI Approach","summary":" Planning safe trajectories in Autonomous Driving Systems (ADS) is a complex\nproblem to solve in real-time. The main challenge to solve this problem arises\nfrom the various conditions and constraints imposed by road geometry, semantics\nand traffic rules, as well as the presence of dynamic agents. Recently, Model\nPredictive Path Integral (MPPI) has been shown to be an effective framework for\noptimal motion planning and control in robot navigation in unstructured and\nhighly uncertain environments. In this paper, we formulate the motion planning\nproblem in ADS as a nonlinear stochastic dynamic optimization problem that can\nbe solved using an MPPI strategy. The main technical contribution of this work\nis a method to handle obstacles within the MPPI formulation safely. In this\nmethod, obstacles are approximated by circles that can be easily integrated\ninto the MPPI cost formulation while considering safety margins. The proposed\nMPPI framework has been efficiently implemented in our autonomous vehicle and\nexperimentally validated using three different primitive scenarios.\nExperimental results show that generated trajectories are safe, feasible and\nperfectly achieve the planning objective. The video results as well as the\nopen-source implementation are available at:\nhttps://gitlab.uni.lu/360lab-public/mppi\n","authors":["Mehdi Testouri","Gamal Elghazaly","Raphael Frank"],"pdf_url":"https://arxiv.org/pdf/2308.01654v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06940v1","updated":"2024-04-10T11:45:31Z","published":"2024-04-10T11:45:31Z","title":"Robotic Learning for Adaptive Informative Path Planning","summary":" Adaptive informative path planning (AIPP) is important to many robotics\napplications, enabling mobile robots to efficiently collect useful data about\ninitially unknown environments. In addition, learning-based methods are\nincreasingly used in robotics to enhance adaptability, versatility, and\nrobustness across diverse and complex tasks. 
Our survey explores research on\napplying robotic learning to AIPP, bridging the gap between these two research\nfields. We begin by providing a unified mathematical framework for general AIPP\nproblems. Next, we establish two complementary taxonomies of current work from\nthe perspectives of (i) learning algorithms and (ii) robotic applications. We\nexplore synergies, recent trends, and highlight the benefits of learning-based\nmethods in AIPP frameworks. Finally, we discuss key challenges and promising\nfuture directions to enable more generally applicable and robust robotic\ndata-gathering systems through learning. We provide a comprehensive catalogue\nof papers reviewed in our survey, including publicly available repositories, to\nfacilitate future studies in the field.\n","authors":["Marija Popovic","Joshua Ott","Julius Rückin","Mykel J. Kochendorfer"],"pdf_url":"https://arxiv.org/pdf/2404.06940v1.pdf","comment":"22 pages, 1 figure"},{"id":"http://arxiv.org/abs/2404.06926v1","updated":"2024-04-10T11:24:34Z","published":"2024-04-10T11:24:34Z","title":"Gaussian-LIC: Photo-realistic LiDAR-Inertial-Camera SLAM with 3D\n Gaussian Splatting","summary":" We present a real-time LiDAR-Inertial-Camera SLAM system with 3D Gaussian\nSplatting as the mapping backend. Leveraging robust pose estimates from our\nLiDAR-Inertial-Camera odometry, Coco-LIC, an incremental photo-realistic\nmapping system is proposed in this paper. We initialize 3D Gaussians from\ncolorized LiDAR points and optimize them using differentiable rendering powered\nby 3D Gaussian Splatting. Meticulously designed strategies are employed to\nincrementally expand the Gaussian map and adaptively control its density,\nensuring high-quality mapping with real-time capability. Experiments conducted\nin diverse scenarios demonstrate the superior performance of our method\ncompared to existing radiance-field-based SLAM systems.\n","authors":["Xiaolei Lang","Laijian Li","Hang Zhang","Feng Xiong","Mu Xu","Yong Liu","Xingxing Zuo","Jiajun Lv"],"pdf_url":"https://arxiv.org/pdf/2404.06926v1.pdf","comment":"Submitted to IROS 2024"},{"id":"http://arxiv.org/abs/2404.06904v1","updated":"2024-04-10T10:49:43Z","published":"2024-04-10T10:49:43Z","title":"Vision-Language Model-based Physical Reasoning for Robot Liquid\n Perception","summary":" There is a growing interest in applying large language models (LLMs) in\nrobotic tasks, due to their remarkable reasoning ability and extensive\nknowledge learned from vast training corpora. Grounding LLMs in the physical\nworld remains an open challenge as they can only process textual input. Recent\nadvancements in large vision-language models (LVLMs) have enabled a more\ncomprehensive understanding of the physical world by incorporating visual\ninput, which provides richer contextual information than language alone. In\nthis work, we proposed a novel paradigm that leveraged GPT-4V(ision), the\nstate-of-the-art LVLM by OpenAI, to enable embodied agents to perceive liquid\nobjects via image-based environmental feedback. Specifically, we exploited the\nphysical understanding of GPT-4V to interpret the visual representation (e.g.,\ntime-series plot) of non-visual feedback (e.g., F/T sensor data), indirectly\nenabling multimodal perception beyond vision and language using images as\nproxies. We evaluated our method using 10 common household liquids with\ncontainers of various geometry and material. 
Without any training or\nfine-tuning, we demonstrated that our method can enable the robot to indirectly\nperceive the physical response of liquids and estimate their viscosity. We also\nshowed that by jointly reasoning over the visual and physical attributes\nlearned through interactions, our method could recognize liquid objects in the\nabsence of strong visual cues (e.g., container labels with legible text or\nsymbols), increasing the accuracy from 69.0% -- achieved by the best-performing\nvision-only variant -- to 86.0%.\n","authors":["Wenqiang Lai","Yuan Gao","Tin Lun Lam"],"pdf_url":"https://arxiv.org/pdf/2404.06904v1.pdf","comment":"8 pages, 6 figures, submitted to IROS 2024"},{"id":"http://arxiv.org/abs/2401.05152v2","updated":"2024-04-10T10:08:33Z","published":"2024-01-10T13:32:01Z","title":"Multi S-Graphs: An Efficient Distributed Semantic-Relational\n Collaborative SLAM","summary":" Collaborative Simultaneous Localization and Mapping (CSLAM) is critical to\nenable multiple robots to operate in complex environments. Most CSLAM\ntechniques rely on raw sensor measurement or low-level features such as\nkeyframe descriptors, which can lead to wrong loop closures due to the lack of\ndeep understanding of the environment. Moreover, the exchange of these\nmeasurements and low-level features among the robots requires the transmission\nof a significant amount of data, which limits the scalability of the system. To\novercome these limitations, we present Multi S-Graphs, a decentralized CSLAM\nsystem that utilizes high-level semantic-relational information embedded in the\nfour-layered hierarchical and optimizable situational graphs for cooperative\nmap generation and localization in structured environments while minimizing the\ninformation exchanged between the robots. To support this, we present a novel\nroom-based descriptor which, along with its connected walls, is used to perform\ninter-robot loop closures, addressing the challenges of multi-robot kidnapped\nproblem initialization. Multiple experiments in simulated and real environments\nvalidate the improvement in accuracy and robustness of the proposed approach\nwhile reducing the amount of data exchanged between robots compared to other\nstate-of-the-art approaches.\n Software available within a docker image:\nhttps://github.com/snt-arg/multi_s_graphs_docker\n","authors":["Miguel Fernandez-Cortizas","Hriday Bavle","David Perez-Saura","Jose Luis Sanchez-Lopez","Pascual Campoy","Holger Voos"],"pdf_url":"https://arxiv.org/pdf/2401.05152v2.pdf","comment":"8 pages paper presented to IEEE RA-L"},{"id":"http://arxiv.org/abs/2402.11319v2","updated":"2024-04-10T08:31:08Z","published":"2024-02-17T16:20:59Z","title":"Hysteresis Compensation of Flexible Continuum Manipulator using RGBD\n Sensing and Temporal Convolutional Network","summary":" Flexible continuum manipulators are valued for minimally invasive surgery,\noffering access to confined spaces through nonlinear paths. However,\ncable-driven manipulators face control difficulties due to hysteresis from\ncabling effects such as friction, elongation, and coupling. These effects are\ndifficult to model due to nonlinearity and the difficulties become even more\nevident when dealing with long and coupled, multi-segmented manipulator. This\npaper proposes a data-driven approach based on Deep Neural Networks (DNN) to\ncapture these nonlinear and previous states-dependent characteristics of cable\nactuation. 
We collect physical joint configurations corresponding to commanded joint\nconfigurations using RGBD sensing and 7 fiducial markers to model the\nhysteresis of the proposed manipulator. Results of a study comparing the\nestimation performance of four DNN models show that the Temporal Convolution\nNetwork (TCN) demonstrates the highest predictive capability. Leveraging\ntrained TCNs, we build a control algorithm to compensate for hysteresis.\nTracking tests in task space using unseen trajectories show that the proposed\ncontrol algorithm reduces the average position and orientation error by 61.39%\n(from 13.7 mm to 5.29 mm) and 64.04% (from 31.17{\deg} to 11.21{\deg}),\nrespectively. This result implies that the proposed calibrated controller\neffectively reaches the desired configurations by estimating the hysteresis of\nthe manipulator. Applying this method in real surgical scenarios has the\npotential to enhance control precision and improve surgical performance.\n","authors":["Junhyun Park","Seonghyeok Jang","Hyojae Park","Seongjun Bae","Minho Hwang"],"pdf_url":"https://arxiv.org/pdf/2402.11319v2.pdf","comment":"8 pages, 11 figures, 5 tables"},{"id":"http://arxiv.org/abs/2404.06807v1","updated":"2024-04-10T07:55:01Z","published":"2024-04-10T07:55:01Z","title":"Sound Matters: Auditory Detectability of Mobile Robots","summary":" Mobile robots are increasingly being used in noisy environments for social\npurposes, e.g. to provide support in healthcare or public spaces. Since these\nrobots also operate beyond human sight, the question arises as to how different\nrobot types, ambient noise, or cognitive engagement impact the detection of the\nrobots by their sound. To address this research gap, we conducted a user study\nmeasuring auditory detection distances for a wheeled (Turtlebot 2i) and\nquadruped robot (Unitree Go 1), which emit different consequential sounds when\nmoving. Additionally, we also manipulated background noise levels and\nparticipants' engagement in a secondary task during the study. Our results\nshowed that the quadruped robot sound was detected significantly better (i.e.,\nat a larger distance) than the wheeled one, which demonstrates that the\nmovement mechanism has a meaningful impact on the auditory detectability. The\ndetectability for both robots diminished significantly as background noise\nincreased. But even in high background noise, participants detected the\nquadruped robot at a significantly larger distance. The engagement in a\nsecondary task had hardly any impact. In essence, these findings highlight the\ncritical role of distinguishing auditory characteristics of different robots to\nimprove the smooth human-centered navigation of mobile robots in noisy\nenvironments.\n","authors":["Subham Agrawal","Marlene Wessels","Jorge de Heuvel","Johannes Kraus","Maren Bennewitz"],"pdf_url":"https://arxiv.org/pdf/2404.06807v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15361v2","updated":"2024-04-10T06:46:08Z","published":"2023-11-26T17:27:26Z","title":"Ultra-Range Gesture Recognition using a Web-Camera in Human-Robot\n Interaction","summary":" Hand gestures play a significant role in human interactions where non-verbal\nintentions, thoughts and commands are conveyed. In Human-Robot Interaction\n(HRI), hand gestures offer a similar and efficient medium for conveying clear\nand rapid directives to a robotic agent. 
However, state-of-the-art vision-based\nmethods for gesture recognition have been shown to be effective only up to a\nuser-camera distance of seven meters. Such a short distance range limits\npractical HRI with, for example, service robots, search and rescue robots and\ndrones. In this work, we address the Ultra-Range Gesture Recognition (URGR)\nproblem by aiming for a recognition distance of up to 25 meters and in the\ncontext of HRI. We propose the URGR framework, a novel deep-learning approach using\nsolely a simple RGB camera. Gesture inference is based on a single image.\nFirst, a novel super-resolution model termed High-Quality Network (HQ-Net) uses\na set of self-attention and convolutional layers to enhance the low-resolution\nimage of the user. Then, we propose a novel URGR classifier termed Graph Vision\nTransformer (GViT) which takes the enhanced image as input. GViT combines the\nbenefits of a Graph Convolutional Network (GCN) and a modified Vision\nTransformer (ViT). Evaluation of the proposed framework over diverse test data\nyields a high recognition rate of 98.1%. The framework has also exhibited\nsuperior performance compared to human recognition at ultra-range distances.\nWith the framework, we analyze and demonstrate the performance of an autonomous\nquadruped robot directed by human gestures in complex ultra-range indoor and\noutdoor environments, achieving a 96% recognition rate on average.\n","authors":["Eran Bamani","Eden Nissinman","Inbar Meir","Lisa Koenigsberg","Avishai Sintov"],"pdf_url":"https://arxiv.org/pdf/2311.15361v2.pdf","comment":"Engineering Applications of Artificial Intelligence, In press"},{"id":"http://arxiv.org/abs/2404.06772v1","updated":"2024-04-10T06:28:19Z","published":"2024-04-10T06:28:19Z","title":"Beyond Gait: Learning Knee Angle for Seamless Prosthesis Control in\n Multiple Scenarios","summary":" Deep learning models have become a powerful tool in knee angle estimation for\nlower limb prostheses, owing to their adaptability across various gait phases\nand locomotion modes. Current methods utilize Multi-Layer Perceptrons (MLP),\nLong-Short Term Memory Networks (LSTM), and Convolutional Neural Networks\n(CNN), predominantly analyzing motion information from the thigh. Contrary to\nthese approaches, our study introduces a holistic perspective by integrating\nwhole-body movements as inputs. We propose a transformer-based probabilistic\nframework, termed the Angle Estimation Probabilistic Model (AEPM), that offers\nprecise angle estimations across extensive scenarios beyond walking. AEPM\nachieves an overall RMSE of 6.70 degrees, with an RMSE of 3.45 degrees in\nwalking scenarios. Compared to the state of the art, AEPM has improved the\nprediction accuracy for walking by 11.31%. Our method can achieve seamless\nadaptation between different locomotion modes. Also, this model can be utilized\nto analyze the synergy between the knee and other joints. We reveal that\nwhole-body movement carries valuable information about knee movement, which can\nprovide insights into designing sensors for prostheses. 
The code is available\nat https://github.com/penway/Beyond-Gait-AEPM.\n","authors":["Pengwei Wang","Yilong Chen","Wan Su","Jie Wang","Teng Ma","Haoyong Yu"],"pdf_url":"https://arxiv.org/pdf/2404.06772v1.pdf","comment":"8 pages, 6 figures, This work has been submitted to the IEEE-RAL for\n possible publication"},{"id":"http://arxiv.org/abs/2404.06758v1","updated":"2024-04-10T05:54:40Z","published":"2024-04-10T05:54:40Z","title":"Toward Holistic Planning and Control Optimization for Dual-Arm\n Rearrangement","summary":" Long-horizon task and motion planning (TAMP) is notoriously difficult to\nsolve, let alone optimally, due to the tight coupling between the interleaved\n(discrete) task and (continuous) motion planning phases, where each phase on\nits own is frequently an NP-hard or even PSPACE-hard computational challenge.\nIn this study, we tackle the even more challenging goal of jointly optimizing\ntask and motion plans for a real dual-arm system in which the two arms operate\nin close vicinity to solve highly constrained tabletop multi-object\nrearrangement problems. Toward that, we construct a tightly integrated planning\nand control optimization pipeline, Makespan-Optimized Dual-Arm Planner (MODAP)\nthat combines novel sampling techniques for task planning with state-of-the-art\ntrajectory optimization techniques. Compared to previous state-of-the-art,\nMODAP produces task and motion plans that better coordinate a dual-arm system,\ndelivering significantly improved execution time improvements while\nsimultaneously ensuring that the resulting time-parameterized trajectory\nconforms to specified acceleration and jerk limits.\n","authors":["Kai Gao","Zihe Ye","Duo Zhang","Baichuan Huang","Jingjin Yu"],"pdf_url":"https://arxiv.org/pdf/2404.06758v1.pdf","comment":"First three authors made equal contributions to this study"},{"id":"http://arxiv.org/abs/2404.06740v1","updated":"2024-04-10T04:57:44Z","published":"2024-04-10T04:57:44Z","title":"Designing Fluid-Exuding Cartilage for Biomimetic Robots Mimicking Human\n Joint Lubrication Function","summary":" The human joint is an open-type joint composed of bones, cartilage,\nligaments, synovial fluid, and joint capsule, having advantages of flexibility\nand impact resistance. However, replicating this structure in robots introduces\nfriction challenges due to the absence of bearings. To address this, our study\nfocuses on mimicking the fluid-exuding function of human cartilage. We employ a\nrubber-based 3D printing technique combined with absorbent materials to create\na versatile and easily designed cartilage sheet for biomimetic robots. We\nevaluate both the fluid-exuding function and friction coefficient of the\nfabricated flat cartilage sheet. 
Furthermore, we practically create a piece of\ncurved cartilage and an open-type biomimetic ball joint in combination with\nbones, ligaments, synovial fluid, and joint capsule to demonstrate the utility\nof the proposed cartilage sheet in the construction of such joints.\n","authors":["Akihiro Miki","Yuta Sahara","Kazuhiro Miyama","Shunnosuke Yoshimura","Yoshimoto Ribayashi","Shun Hasegawa","Kento Kawaharazuka","Kei Okada","Masayuki Inaba"],"pdf_url":"https://arxiv.org/pdf/2404.06740v1.pdf","comment":"Accepted at RoboSoft2024"},{"id":"http://arxiv.org/abs/2211.02736v4","updated":"2024-04-10T04:51:33Z","published":"2022-11-04T20:22:58Z","title":"Discovering Closed-Loop Failures of Vision-Based Controllers via\n Reachability Analysis","summary":" Machine learning driven image-based controllers allow robotic systems to take\nintelligent actions based on the visual feedback from their environment.\nUnderstanding when these controllers might lead to system safety violations is\nimportant for their integration in safety-critical applications and engineering\ncorrective safety measures for the system. Existing methods leverage\nsimulation-based testing (or falsification) to find the failures of\nvision-based controllers, i.e., the visual inputs that lead to closed-loop\nsafety violations. However, these techniques do not scale well to the scenarios\ninvolving high-dimensional and complex visual inputs, such as RGB images. In\nthis work, we cast the problem of finding closed-loop vision failures as a\nHamilton-Jacobi (HJ) reachability problem. Our approach blends simulation-based\nanalysis with HJ reachability methods to compute an approximation of the\nbackward reachable tube (BRT) of the system, i.e., the set of unsafe states for\nthe system under vision-based controllers. Utilizing the BRT, we can tractably\nand systematically find the system states and corresponding visual inputs that\nlead to closed-loop failures. These visual inputs can be subsequently analyzed\nto find the input characteristics that might have caused the failure. Besides\nits scalability to high-dimensional visual inputs, an explicit computation of\nBRT allows the proposed approach to capture non-trivial system failures that\nare difficult to expose via random simulations. We demonstrate our framework on\ntwo case studies involving an RGB image-based neural network controller for (a)\nautonomous indoor navigation, and (b) autonomous aircraft taxiing.\n","authors":["Kaustav Chakraborty","Somil Bansal"],"pdf_url":"https://arxiv.org/pdf/2211.02736v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14176v2","updated":"2024-04-10T04:38:52Z","published":"2024-03-21T06:57:28Z","title":"ReFeree: Radar-based efficient global descriptor using a Feature and\n Free space for Place Recognition","summary":" Radar is highlighted for robust sensing capabilities in adverse weather\nconditions (e.g. dense fog, heavy rain, or snowfall). In addition, Radar can\ncover wide areas and penetrate small particles. Despite these advantages,\nRadar-based place recognition remains in the early stages compared to other\nsensors due to its unique characteristics such as low resolution and\nsignificant noise. In this paper, we propose a Radar-based place recognition method\nutilizing a descriptor called ReFeree, built from a feature and free space. Unlike\ntraditional methods, we summarize the Radar image into a highly compact representation. Despite being\nlightweight, it contains semi-metric information and also delivers outstanding\nplace recognition performance. 
For concrete validation, we\ntest a single session from the MulRan dataset and a multi-session from the\nOxford Offroad Radar, Oxford Radar RobotCar, and the Boreas dataset.\n","authors":["Byunghee Choi","Hogyun Kim","Younggun Cho"],"pdf_url":"https://arxiv.org/pdf/2403.14176v2.pdf","comment":"5 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.06732v1","updated":"2024-04-10T04:36:24Z","published":"2024-04-10T04:36:24Z","title":"Enhancing Safety in Mixed Traffic: Learning-Based Modeling and Efficient\n Control of Autonomous and Human-Driven Vehicles","summary":" With the increasing presence of autonomous vehicles (AVs) on public roads,\ndeveloping robust control strategies to navigate the uncertainty of\nhuman-driven vehicles (HVs) is crucial. This paper introduces an advanced\nmethod for modeling HV behavior, combining a first-principles model with\nGaussian process (GP) learning to enhance velocity prediction accuracy and\nprovide a measurable uncertainty. We validated this innovative HV model using\nreal-world data from field experiments and applied it to develop a GP-enhanced\nmodel predictive control (GP-MPC) strategy. This strategy aims to improve\nsafety in mixed vehicle platoons by integrating uncertainty assessment into\ndistance constraints. Comparative simulation studies with a conventional model\npredictive control (MPC) approach demonstrated that our GP-MPC strategy ensures\nmore reliable safe distancing and fosters efficient vehicular dynamics,\nachieving notably higher speeds within the platoon. By incorporating a sparse\nGP technique in HV modeling and adopting a dynamic GP prediction within the MPC\nframework, we significantly reduced the computation time of GP-MPC, marking it\nonly 4.6% higher than that of the conventional MPC. This represents a\nsubstantial improvement, making the process about 100 times faster than our\npreliminary work without these approximations. Our findings underscore the\neffectiveness of learning-based HV modeling in enhancing both safety and\noperational efficiency in mixed-traffic environments, paving the way for more\nharmonious AV-HV interactions.\n","authors":["Jie Wang","Yash Vardhan Pant","Lei Zhao","Michał Antkiewicz","Krzysztof Czarnecki"],"pdf_url":"https://arxiv.org/pdf/2404.06732v1.pdf","comment":"in IEEE Transactions on Intelligent Transportation Systems, 2024"},{"id":"http://arxiv.org/abs/2404.06728v1","updated":"2024-04-10T04:25:41Z","published":"2024-04-10T04:25:41Z","title":"A Data Efficient Framework for Learning Local Heuristics","summary":" With the advent of machine learning, there have been several recent attempts\nto learn effective and generalizable heuristics. Local Heuristic A* (LoHA*) is\none recent method that instead of learning the entire heuristic estimate,\nlearns a \"local\" residual heuristic that estimates the cost to escape a region\n(Veerapaneni et al 2023). LoHA*, like other supervised learning methods,\ncollects a dataset of target values by querying an oracle on many planning\nproblems (in this case, local planning problems). This data collection process\ncan become slow as the size of the local region increases or if the domain\nrequires expensive collision checks. Our main insight is that when an A* search\nsolves a start-goal planning problem it inherently ends up solving multiple\nlocal planning problems. 
We exploit this observation to propose an efficient\ndata collection framework that does <1/10th the amount of work (measured by\nexpansions) to collect the same amount of data in comparison to baselines. This\nidea also enables us to run LoHA* in an online manner where we can iteratively\ncollect data and improve our model while solving relevant start-goal tasks. We\ndemonstrate the performance of our data collection and online framework on a 4D\n$(x, y, \\theta, v)$ navigation domain.\n","authors":["Rishi Veerapaneni","Jonathan Park","Muhammad Suhail Saleem","Maxim Likhachev"],"pdf_url":"https://arxiv.org/pdf/2404.06728v1.pdf","comment":"Accepted in the 17th International Symposium on Combinatorial Search\n (SoCS 2024)"},{"id":"http://arxiv.org/abs/2403.01710v2","updated":"2024-04-10T04:21:27Z","published":"2024-03-04T03:58:26Z","title":"Sensor-based Multi-Robot Coverage Control with Spatial Separation in\n Unstructured Environments","summary":" Multi-robot systems have increasingly become instrumental in tackling search\nand coverage problems. However, the challenge of optimizing task efficiency\nwithout compromising task success still persists, particularly in expansive,\nunstructured environments with dense obstacles.\n This paper presents an innovative, decentralized Voronoi-based approach for\nsearch and coverage to reactively navigate these complexities while maintaining\nsafety.\n This approach leverages the active sensing capabilities of multi-robot\nsystems to supplement GIS (Geographic Information System), offering a more\ncomprehensive and real-time understanding of the environment. Based on point\ncloud data, which is inherently non-convex and unstructured, this method\nefficiently generates collision-free Voronoi regions using only local sensing\ninformation through spatial decomposition and spherical mirroring techniques.\n Then, deadlock-aware guided map integrated with a gradient-optimized,\ncentroid Voronoi-based coverage control policy, is constructed to improve\nefficiency by avoiding exhaustive searches and local sensing pitfalls.\n The effectiveness of our algorithm has been validated through extensive\nnumerical simulations in high-fidelity environments, demonstrating significant\nimprovements in both task success rate, coverage ratio, and task execution time\ncompared with others.\n","authors":["Xinyi Wang","Jiwen Xu","Chuanxiang Gao","Yizhou Chen","Jihan Zhang","Chenggang Wang","Ben M. Chen"],"pdf_url":"https://arxiv.org/pdf/2403.01710v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08604v2","updated":"2024-04-10T03:29:32Z","published":"2023-12-14T02:03:36Z","title":"Verification of Neural Reachable Tubes via Scenario Optimization and\n Conformal Prediction","summary":" Learning-based approaches for controlling safety-critical systems are rapidly\ngrowing in popularity; thus, it is important to assure their performance and\nsafety. Hamilton-Jacobi (HJ) reachability analysis is a popular formal\nverification tool for providing such guarantees, since it can handle general\nnonlinear system dynamics, bounded adversarial system disturbances, and state\nand input constraints. However, its computational and memory complexity scales\nexponentially with the state dimension, making it intractable for large-scale\nsystems. To overcome this challenge, neural approaches, such as DeepReach, have\nbeen used to synthesize reachable tubes and safety controllers for\nhigh-dimensional systems. However, verifying these neural reachable tubes\nremains challenging. 
In this work, we propose two verification methods, based\non robust scenario optimization and conformal prediction, to provide\nprobabilistic safety guarantees for neural reachable tubes. Our methods allow a\ndirect trade-off between resilience to outlier errors in the neural tube, which\nare inevitable in a learning-based approach, and the strength of the\nprobabilistic safety guarantee. Furthermore, we show that split conformal\nprediction, a widely used method in the machine learning community for\nuncertainty quantification, reduces to a scenario-based approach, making the\ntwo methods equivalent not only for verification of neural reachable tubes but\nalso more generally. To our knowledge, our proof is the first in the literature\nto show a strong relationship between conformal prediction and scenario\noptimization. Finally, we propose an outlier-adjusted verification approach\nthat uses the error distribution in neural reachable tubes to recover greater\nsafe volumes. We demonstrate the efficacy of the proposed approaches for the\nhigh-dimensional problems of multi-vehicle collision avoidance and rocket\nlanding with no-go zones.\n","authors":["Albert Lin","Somil Bansal"],"pdf_url":"https://arxiv.org/pdf/2312.08604v2.pdf","comment":"Accepted to 6th Annual Learning for Dynamics & Control Conference.\n arXiv admin note: text overlap with arXiv:2209.12336"},{"id":"http://arxiv.org/abs/2309.05645v2","updated":"2024-04-10T03:05:04Z","published":"2023-09-11T17:37:08Z","title":"CitDet: A Benchmark Dataset for Citrus Fruit Detection","summary":" In this letter, we present a new dataset to advance the state of the art in\ndetecting citrus fruit and accurately estimating yield on trees affected by the\nHuanglongbing (HLB) disease in orchard environments via imaging. Despite the\nfact that significant progress has been made in solving the fruit detection\nproblem, the lack of publicly available datasets has complicated direct\ncomparison of results. For instance, citrus detection has long been of interest\nto the agricultural research community, yet there is an absence of work,\nparticularly involving public datasets of citrus affected by HLB. To address\nthis issue, we enhance state-of-the-art object detection methods for use in\ntypical orchard settings. Concretely, we provide high-resolution images of\ncitrus trees located in an area known to be highly affected by HLB, along with\nhigh-quality bounding box annotations of citrus fruit. Fruit on both the trees\nand the ground are labeled to allow for identification of fruit location, which\ncontributes to advancements in yield estimation and a potential measure of HLB\nimpact via fruit drop. The dataset consists of over 32,000 bounding box\nannotations for fruit instances contained in 579 high-resolution images. In\nsummary, our contributions are the following: (i) we introduce a novel dataset\nalong with baseline performance benchmarks on multiple contemporary object\ndetection algorithms, (ii) we show the ability to accurately capture fruit\nlocation on the tree or on the ground, and finally (iii) we present a correlation of our\nresults with yield estimations.\n","authors":["Jordan A. James","Heather K. Manching","Matthew R. Mattia","Kim D. Bowman","Amanda M. Hulse-Kemp","William J. 
Beksi"],"pdf_url":"https://arxiv.org/pdf/2309.05645v2.pdf","comment":"Submitted to IEEE Robotics and Automation Letters (RA-L)"},{"id":"http://arxiv.org/abs/2404.06687v1","updated":"2024-04-10T02:22:38Z","published":"2024-04-10T02:22:38Z","title":"Fast and Accurate Relative Motion Tracking for Two Industrial Robots","summary":" Industrial robotic applications such as spraying, welding, and additive\nmanufacturing frequently require fast, accurate, and uniform motion along a 3D\nspatial curve. To increase process throughput, some manufacturers propose a\ndual-robot setup to overcome the speed limitation of a single robot. Industrial\nrobot motion is programmed through waypoints connected by motion primitives\n(Cartesian linear and circular paths and linear joint paths at constant\nCartesian speed). The actual robot motion is affected by the blending between\nthese motion primitives and the pose of the robot (an outstretched/close to\nsingularity pose tends to have larger path-tracking errors). Choosing the\nwaypoints and the speed along each motion segment to achieve the performance\nrequirement is challenging. At present, there is no automated solution, and\nlaborious manual tuning by robot experts is needed to approach the desired\nperformance. In this paper, we present a systematic three-step approach to\ndesigning and programming a dual-robot system to optimize system performance.\nThe first step is to select the relative placement between the two robots based\non the specified relative motion path. The second step is to select the\nrelative waypoints and the motion primitives. The final step is to update the\nwaypoints iteratively based on the actual relative motion. Waypoint iteration\nis first executed in simulation and then completed using the actual robots. For\nperformance measures, we use the mean path speed subject to the relative\nposition and orientation constraints and the path speed uniformity constraint.\nWe have demonstrated the effectiveness of this method with ABB and FANUC robots\non two challenging test curves. The performance improvement over the current\nindustrial practice baseline is over 300%. Compared to the optimized single-arm\ncase that we have previously reported, the improvement is over 14%.\n","authors":["Honglu He","Chen-lung Lu","Glenn Saunders","Pinghai Yang","Jeffrey Schoonover","John Wason","Santiago Paternain","Agung Julius","John T. Wen"],"pdf_url":"https://arxiv.org/pdf/2404.06687v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.07206v1","updated":"2024-04-10T17:59:59Z","published":"2024-04-10T17:59:59Z","title":"GoodDrag: Towards Good Practices for Drag Editing with Diffusion Models","summary":" In this paper, we introduce GoodDrag, a novel approach to improve the\nstability and image quality of drag editing. Unlike existing methods that\nstruggle with accumulated perturbations and often result in distortions,\nGoodDrag introduces an AlDD framework that alternates between drag and\ndenoising operations within the diffusion process, effectively improving the\nfidelity of the result. We also propose an information-preserving motion\nsupervision operation that maintains the original features of the starting\npoint for precise manipulation and artifact reduction. In addition, we\ncontribute to the benchmarking of drag editing by introducing a new dataset,\nDrag100, and developing dedicated quality assessment metrics, Dragging Accuracy\nIndex and Gemini Score, utilizing Large Multimodal Models. 
Extensive\nexperiments demonstrate that the proposed GoodDrag compares favorably against\nthe state-of-the-art approaches both qualitatively and quantitatively. The\nproject page is https://gooddrag.github.io.\n","authors":["Zewei Zhang","Huan Liu","Jun Chen","Xiangyu Xu"],"pdf_url":"https://arxiv.org/pdf/2404.07206v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07204v1","updated":"2024-04-10T17:59:45Z","published":"2024-04-10T17:59:45Z","title":"BRAVE: Broadening the visual encoding of vision-language models","summary":" Vision-language models (VLMs) are typically composed of a vision encoder,\ne.g. CLIP, and a language model (LM) that interprets the encoded features to\nsolve downstream tasks. Despite remarkable progress, VLMs are subject to\nseveral shortcomings due to the limited capabilities of vision encoders, e.g.\n\"blindness\" to certain image features, visual hallucination, etc. To address\nthese issues, we study broadening the visual encoding capabilities of VLMs. We\nfirst comprehensively benchmark several vision encoders with different\ninductive biases for solving VLM tasks. We observe that there is no single\nencoding configuration that consistently achieves top performance across\ndifferent tasks, and encoders with different biases can perform surprisingly\nsimilarly. Motivated by this, we introduce a method, named BRAVE, that\nconsolidates features from multiple frozen encoders into a more versatile\nrepresentation that can be directly fed as the input to a frozen LM. BRAVE\nachieves state-of-the-art performance on a broad range of captioning and VQA\nbenchmarks and significantly reduces the aforementioned issues of VLMs, while\nrequiring a smaller number of trainable parameters than existing methods and\nhaving a more compressed representation. Our results highlight the potential of\nincorporating different visual biases for a more broad and contextualized\nvisual understanding of VLMs.\n","authors":["Oğuzhan Fatih Kar","Alessio Tonioni","Petra Poklukar","Achin Kulshrestha","Amir Zamir","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2404.07204v1.pdf","comment":"Project page at https://brave-vlms.epfl.ch/"},{"id":"http://arxiv.org/abs/2404.07202v1","updated":"2024-04-10T17:59:20Z","published":"2024-04-10T17:59:20Z","title":"UMBRAE: Unified Multimodal Decoding of Brain Signals","summary":" We address prevailing challenges of the brain-powered research, departing\nfrom the observation that the literature hardly recover accurate spatial\ninformation and require subject-specific models. To address these challenges,\nwe propose UMBRAE, a unified multimodal decoding of brain signals. First, to\nextract instance-level conceptual and spatial details from neural signals, we\nintroduce an efficient universal brain encoder for multimodal-brain alignment\nand recover object descriptions at multiple levels of granularity from\nsubsequent multimodal large language model (MLLM). Second, we introduce a\ncross-subject training strategy mapping subject-specific features to a common\nfeature space. This allows a model to be trained on multiple subjects without\nextra resources, even yielding superior results compared to subject-specific\nmodels. Further, we demonstrate this supports weakly-supervised adaptation to\nnew subjects, with only a fraction of the total training data. Experiments\ndemonstrate that UMBRAE not only achieves superior results in the newly\nintroduced tasks but also outperforms methods in well established tasks. 
To\nassess our method, we construct and share with the community a comprehensive\nbrain understanding benchmark BrainHub. Our code and benchmark are available at\nhttps://weihaox.github.io/UMBRAE.\n","authors":["Weihao Xia","Raoul de Charette","Cengiz Öztireli","Jing-Hao Xue"],"pdf_url":"https://arxiv.org/pdf/2404.07202v1.pdf","comment":"Project Page: https://weihaox.github.io/UMBRAE"},{"id":"http://arxiv.org/abs/2404.07199v1","updated":"2024-04-10T17:57:41Z","published":"2024-04-10T17:57:41Z","title":"RealmDreamer: Text-Driven 3D Scene Generation with Inpainting and Depth\n Diffusion","summary":" We introduce RealmDreamer, a technique for generation of general\nforward-facing 3D scenes from text descriptions. Our technique optimizes a 3D\nGaussian Splatting representation to match complex text prompts. We initialize\nthese splats by utilizing the state-of-the-art text-to-image generators,\nlifting their samples into 3D, and computing the occlusion volume. We then\noptimize this representation across multiple views as a 3D inpainting task with\nimage-conditional diffusion models. To learn correct geometric structure, we\nincorporate a depth diffusion model by conditioning on the samples from the\ninpainting model, giving rich geometric structure. Finally, we finetune the\nmodel using sharpened samples from image generators. Notably, our technique\ndoes not require video or multi-view data and can synthesize a variety of\nhigh-quality 3D scenes in different styles, consisting of multiple objects. Its\ngenerality additionally allows 3D synthesis from a single image.\n","authors":["Jaidev Shriram","Alex Trevithick","Lingjie Liu","Ravi Ramamoorthi"],"pdf_url":"https://arxiv.org/pdf/2404.07199v1.pdf","comment":"Project Page: https://realmdreamer.github.io/"},{"id":"http://arxiv.org/abs/2404.07191v1","updated":"2024-04-10T17:48:37Z","published":"2024-04-10T17:48:37Z","title":"InstantMesh: Efficient 3D Mesh Generation from a Single Image with\n Sparse-view Large Reconstruction Models","summary":" We present InstantMesh, a feed-forward framework for instant 3D mesh\ngeneration from a single image, featuring state-of-the-art generation quality\nand significant training scalability. By synergizing the strengths of an\noff-the-shelf multiview diffusion model and a sparse-view reconstruction model\nbased on the LRM architecture, InstantMesh is able to create diverse 3D assets\nwithin 10 seconds. To enhance the training efficiency and exploit more\ngeometric supervisions, e.g, depths and normals, we integrate a differentiable\niso-surface extraction module into our framework and directly optimize on the\nmesh representation. Experimental results on public datasets demonstrate that\nInstantMesh significantly outperforms other latest image-to-3D baselines, both\nqualitatively and quantitatively. We release all the code, weights, and demo of\nInstantMesh, with the intention that it can make substantial contributions to\nthe community of 3D generative AI and empower both researchers and content\ncreators.\n","authors":["Jiale Xu","Weihao Cheng","Yiming Gao","Xintao Wang","Shenghua Gao","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2404.07191v1.pdf","comment":"Technical report. 
Project: https://github.com/TencentARC/InstantMesh"},{"id":"http://arxiv.org/abs/2404.07188v1","updated":"2024-04-10T17:41:41Z","published":"2024-04-10T17:41:41Z","title":"GCV-Turbo: End-to-end Acceleration of GNN-based Computer Vision Tasks on\n FPGA","summary":" Graph neural networks (GNNs) have recently empowered various novel computer\nvision (CV) tasks. In GNN-based CV tasks, a combination of CNN layers and GNN\nlayers or only GNN layers are employed. This paper introduces GCV-Turbo, a\ndomain-specific accelerator on FPGA for end-to-end acceleration of GNN-based CV\ntasks. GCV-Turbo consists of two key components: (1) a \\emph{novel} hardware\narchitecture optimized for the computation kernels in both CNNs and GNNs using\nthe same set of computation resources. (2) a PyTorch-compatible compiler that\ntakes a user-defined model as input, performs end-to-end optimization for the\ncomputation graph of a given GNN-based CV task, and produces optimized code for\nhardware execution. The hardware architecture and the compiler work\nsynergistically to support a variety of GNN-based CV tasks. We implement\nGCV-Turbo on a state-of-the-art FPGA and evaluate its performance across six\nrepresentative GNN-based CV tasks with diverse input data modalities (e.g.,\nimage, human skeleton, point cloud). Compared with state-of-the-art CPU (GPU)\nimplementations, GCV-Turbo achieves an average latency reduction of\n$68.4\\times$ ($4.1\\times$) on these six GNN-based CV tasks. Moreover, GCV-Turbo\nsupports the execution of the standalone CNNs or GNNs, achieving performance\ncomparable to that of state-of-the-art CNN (GNN) accelerators for widely used\nCNN-only (GNN-only) models.\n","authors":["Bingyi Zhang","Rajgopal Kannan","Carl Busart","Viktor Prasanna"],"pdf_url":"https://arxiv.org/pdf/2404.07188v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.14855v2","updated":"2024-04-10T17:35:16Z","published":"2022-12-30T18:04:25Z","title":"Disentangled Explanations of Neural Network Predictions by Finding\n Relevant Subspaces","summary":" Explainable AI aims to overcome the black-box nature of complex ML models\nlike neural networks by generating explanations for their predictions.\nExplanations often take the form of a heatmap identifying input features (e.g.\npixels) that are relevant to the model's decision. These explanations, however,\nentangle the potentially multiple factors that enter into the overall complex\ndecision strategy. We propose to disentangle explanations by extracting at some\nintermediate layer of a neural network, subspaces that capture the multiple and\ndistinct activation patterns (e.g. visual concepts) that are relevant to the\nprediction. To automatically extract these subspaces, we propose two new\nanalyses, extending principles found in PCA or ICA to explanations. These novel\nanalyses, which we call principal relevant component analysis (PRCA) and\ndisentangled relevant subspace analysis (DRSA), maximize relevance instead of\ne.g. variance or kurtosis. This allows for a much stronger focus of the\nanalysis on what the ML model actually uses for predicting, ignoring\nactivations or concepts to which the model is invariant. Our approach is\ngeneral enough to work alongside common attribution techniques such as Shapley\nValue, Integrated Gradients, or LRP. 
Our proposed methods show to be\npractically useful and compare favorably to the state of the art as\ndemonstrated on benchmarks and three use cases.\n","authors":["Pattarawat Chormai","Jan Herrmann","Klaus-Robert Müller","Grégoire Montavon"],"pdf_url":"https://arxiv.org/pdf/2212.14855v2.pdf","comment":"17 pages + supplement"},{"id":"http://arxiv.org/abs/2404.07178v1","updated":"2024-04-10T17:28:16Z","published":"2024-04-10T17:28:16Z","title":"Move Anything with Layered Scene Diffusion","summary":" Diffusion models generate images with an unprecedented level of quality, but\nhow can we freely rearrange image layouts? Recent works generate controllable\nscenes via learning spatially disentangled latent codes, but these methods do\nnot apply to diffusion models due to their fixed forward process. In this work,\nwe propose SceneDiffusion to optimize a layered scene representation during the\ndiffusion sampling process. Our key insight is that spatial disentanglement can\nbe obtained by jointly denoising scene renderings at different spatial layouts.\nOur generated scenes support a wide range of spatial editing operations,\nincluding moving, resizing, cloning, and layer-wise appearance editing\noperations, including object restyling and replacing. Moreover, a scene can be\ngenerated conditioned on a reference image, thus enabling object moving for\nin-the-wild images. Notably, this approach is training-free, compatible with\ngeneral text-to-image diffusion models, and responsive in less than a second.\n","authors":["Jiawei Ren","Mengmeng Xu","Jui-Chieh Wu","Ziwei Liu","Tao Xiang","Antoine Toisoul"],"pdf_url":"https://arxiv.org/pdf/2404.07178v1.pdf","comment":"CVPR 2024 camera-ready"},{"id":"http://arxiv.org/abs/2404.07176v1","updated":"2024-04-10T17:25:42Z","published":"2024-04-10T17:25:42Z","title":"Self-supervised Monocular Depth Estimation on Water Scenes via Specular\n Reflection Prior","summary":" Monocular depth estimation from a single image is an ill-posed problem for\ncomputer vision due to insufficient reliable cues as the prior knowledge.\nBesides the inter-frame supervision, namely stereo and adjacent frames,\nextensive prior information is available in the same frame. Reflections from\nspecular surfaces, informative intra-frame priors, enable us to reformulate the\nill-posed depth estimation task as a multi-view synthesis. This paper proposes\nthe first self-supervision for deep-learning depth estimation on water scenes\nvia intra-frame priors, known as reflection supervision and geometrical\nconstraints. In the first stage, a water segmentation network is performed to\nseparate the reflection components from the entire image. Next, we construct a\nself-supervised framework to predict the target appearance from reflections,\nperceived as other perspectives. The photometric re-projection error,\nincorporating SmoothL1 and a novel photometric adaptive SSIM, is formulated to\noptimize pose and depth estimation by aligning the transformed virtual depths\nand source ones. As a supplement, the water surface is determined from real and\nvirtual camera positions, which complement the depth of the water area.\nFurthermore, to alleviate these laborious ground truth annotations, we\nintroduce a large-scale water reflection scene (WRS) dataset rendered from\nUnreal Engine 4. 
Extensive experiments on the WRS dataset prove the feasibility\nof the proposed method compared to state-of-the-art depth estimation\ntechniques.\n","authors":["Zhengyang Lu","Ying Chen"],"pdf_url":"https://arxiv.org/pdf/2404.07176v1.pdf","comment":"16 pages, 8 figures"},{"id":"http://arxiv.org/abs/2212.11120v2","updated":"2024-04-10T17:15:23Z","published":"2022-12-10T07:50:29Z","title":"Deep Learning for Inertial Sensor Alignment","summary":" Accurate alignment of a fixed mobile device equipped with inertial sensors\ninside a moving vehicle is important for navigation, activity recognition, and\nother applications. Accurate estimation of the device mounting angle is\nrequired to rotate the inertial measurement from the sensor frame to the moving\nplatform frame to standardize measurements and improve the performance of the\ntarget task. In this work, a data-driven approach using deep neural networks\n(DNNs) is proposed to learn the yaw mounting angle of a smartphone equipped\nwith an inertial measurement unit (IMU) and strapped to a car. The proposed\nmodel uses only the accelerometer and gyroscope readings from an IMU as input\nand, in contrast to existing solutions, does not require global position inputs\nfrom global navigation satellite systems (GNSS). To train the model in a\nsupervised manner, IMU data is collected for training and validation with the\nsensor mounted at a known yaw mounting angle, and a range of ground truth\nlabels is generated by applying a random rotation in a bounded range to the\nmeasurements. The trained model is tested on data with real rotations showing\nsimilar performance as with synthetic rotations. The trained model is deployed\non an Android device and evaluated in real-time to test the accuracy of the\nestimated yaw mounting angle. The model is shown to find the mounting angle at\nan accuracy of 8 degrees within 5 seconds, and 4 degrees within 27 seconds. An\nexperiment is conducted to compare the proposed model with an existing\noff-the-shelf solution.\n","authors":["Maxim Freydin","Niv Sfaradi","Nimrod Segol","Areej Eweida","Barak Or"],"pdf_url":"https://arxiv.org/pdf/2212.11120v2.pdf","comment":"9 Pages, Preprint. Accepted IEEE"},{"id":"http://arxiv.org/abs/2404.07155v1","updated":"2024-04-10T16:44:11Z","published":"2024-04-10T16:44:11Z","title":"Unified Language-driven Zero-shot Domain Adaptation","summary":" This paper introduces Unified Language-driven Zero-shot Domain Adaptation\n(ULDA), a novel task setting that enables a single model to adapt to diverse\ntarget domains without explicit domain-ID knowledge. We identify the\nconstraints in the existing language-driven zero-shot domain adaptation task,\nparticularly the requirement for domain IDs and domain-specific models, which\nmay restrict flexibility and scalability. To overcome these issues, we propose\na new framework for ULDA, consisting of Hierarchical Context Alignment (HCA),\nDomain Consistent Representation Learning (DCRL), and Text-Driven Rectifier\n(TDR). These components work synergistically to align simulated features with\ntarget text across multiple visual levels, retain semantic correlations between\ndifferent regional representations, and rectify biases between simulated and\nreal target visual features, respectively. Our extensive empirical evaluations\ndemonstrate that this framework achieves competitive performance in both\nsettings, surpassing even the model that requires domain-ID, showcasing its\nsuperiority and generalization ability. 
The proposed method is not only\neffective but also maintains practicality and efficiency, as it does not\nintroduce additional computational costs during inference. Our project page is\nhttps://senqiaoyang.com/project/ULDA .\n","authors":["Senqiao Yang","Zhuotao Tian","Li Jiang","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2404.07155v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.07153v1","updated":"2024-04-10T16:39:50Z","published":"2024-04-10T16:39:50Z","title":"Lost in Translation: Modern Neural Networks Still Struggle With Small\n Realistic Image Transformations","summary":" Deep neural networks that achieve remarkable performance in image\nclassification have previously been shown to be easily fooled by tiny\ntransformations such as a one pixel translation of the input image. In order to\naddress this problem, two approaches have been proposed in recent years. The\nfirst approach suggests using huge datasets together with data augmentation in\nthe hope that a highly varied training set will teach the network to learn to\nbe invariant. The second approach suggests using architectural modifications\nbased on sampling theory to deal explicitly with image translations. In this\npaper, we show that these approaches still fall short in robustly handling\n'natural' image translations that simulate a subtle change in camera\norientation. Our findings reveal that a mere one-pixel translation can result\nin a significant change in the predicted image representation for approximately\n40% of the test images in state-of-the-art models (e.g. open-CLIP trained on\nLAION-2B or DINO-v2) , while models that are explicitly constructed to be\nrobust to cyclic translations can still be fooled with 1 pixel realistic\n(non-cyclic) translations 11% of the time. We present Robust Inference by Crop\nSelection: a simple method that can be proven to achieve any desired level of\nconsistency, although with a modest tradeoff with the model's accuracy.\nImportantly, we demonstrate how employing this method reduces the ability to\nfool state-of-the-art models with a 1 pixel translation to less than 5% while\nsuffering from only a 1% drop in classification accuracy. Additionally, we show\nthat our method can be easy adjusted to deal with circular shifts as well. In\nsuch case we achieve 100% robustness to integer shifts with state-of-the-art\naccuracy, and with no need for any further training.\n","authors":["Ofir Shifman","Yair Weiss"],"pdf_url":"https://arxiv.org/pdf/2404.07153v1.pdf","comment":"14 pages, 6 appendices, 17 figures"},{"id":"http://arxiv.org/abs/2312.00068v2","updated":"2024-04-10T16:04:48Z","published":"2023-11-29T20:59:00Z","title":"GLiDR: Topologically Regularized Graph Generative Network for Sparse\n LiDAR Point Clouds","summary":" Sparse LiDAR point clouds cause severe loss of detail of static structures\nand reduce the density of static points available for navigation. Reduced\ndensity can be detrimental to navigation under several scenarios. We observe\nthat despite high sparsity, in most cases, the global topology of LiDAR\noutlining the static structures can be inferred. We utilize this property to\nobtain a backbone skeleton of a LiDAR scan in the form of a single connected\ncomponent that is a proxy to its global topology. We utilize the backbone to\naugment new points along static structures to overcome sparsity. Newly\nintroduced points could correspond to existing static structures or to static\npoints that were earlier obstructed by dynamic objects. 
To the best of our\nknowledge, we are the first to use such a strategy for sparse LiDAR point\nclouds. Existing solutions close to our approach fail to identify and preserve\nthe global static LiDAR topology and generate sub-optimal points. We propose\nGLiDR, a Graph Generative network that is topologically regularized using\n0-dimensional Persistent Homology ($\\mathcal{PH}$) constraints. This enables\nGLiDR to introduce newer static points along a topologically consistent global\nstatic LiDAR backbone. GLiDR generates precise static points using $32\\times$\nsparser dynamic scans and performs better than the baselines across three\ndatasets. GLiDR generates a valuable byproduct - an accurate binary\nsegmentation mask of static and dynamic objects that are helpful for navigation\nplanning and safety in constrained environments. The newly introduced static\npoints allow GLiDR to outperform LiDAR-based navigation using SLAM in several\nsettings. Source code is available at\n$\\texttt{https://github.com/GLiDR-CVPR2024/GLiDR}$.\n","authors":["Prashant Kumar","Kshitij Madhav Bhat","Vedang Bhupesh Shenvi Nadkarni","Prem Kalra"],"pdf_url":"https://arxiv.org/pdf/2312.00068v2.pdf","comment":"IEEE / CVF Computer Vision and Pattern Recognition Conference (CVPR)"},{"id":"http://arxiv.org/abs/2404.07124v1","updated":"2024-04-10T16:04:21Z","published":"2024-04-10T16:04:21Z","title":"Measuring proximity to standard planes during fetal brain ultrasound\n scanning","summary":" This paper introduces a novel pipeline designed to bring ultrasound (US)\nplane pose estimation closer to clinical use for more effective navigation to\nthe standard planes (SPs) in the fetal brain. We propose a semi-supervised\nsegmentation model utilizing both labeled SPs and unlabeled 3D US volume\nslices. Our model enables reliable segmentation across a diverse set of fetal\nbrain images. Furthermore, the model incorporates a classification mechanism to\nidentify the fetal brain precisely. Our model not only filters out frames\nlacking the brain but also generates masks for those containing it, enhancing\nthe relevance of plane pose regression in clinical settings. We focus on fetal\nbrain navigation from 2D ultrasound (US) video analysis and combine this model\nwith a US plane pose regression network to provide sensorless proximity\ndetection to SPs and non-SPs planes; we emphasize the importance of proximity\ndetection to SPs for guiding sonographers, offering a substantial advantage\nover traditional methods by allowing earlier and more precise adjustments\nduring scanning. We demonstrate the practical applicability of our approach\nthrough validation on real fetal scan videos obtained from sonographers of\nvarying expertise levels. Our findings demonstrate the potential of our\napproach to complement existing fetal US technologies and advance prenatal\ndiagnostic practices.\n","authors":["Chiara Di Vece","Antonio Cirigliano","Meala Le Lous","Raffaele Napolitano","Anna L. David","Donald Peebles","Pierre Jannin","Francisco Vasconcelos","Danail Stoyanov"],"pdf_url":"https://arxiv.org/pdf/2404.07124v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.07122v1","updated":"2024-04-10T16:01:37Z","published":"2024-04-10T16:01:37Z","title":"Driver Attention Tracking and Analysis","summary":" We propose a novel method to estimate a driver's points-of-gaze using a pair\nof ordinary cameras mounted on the windshield and dashboard of a car. 
This is a\nchallenging problem due to the dynamics of traffic environments with 3D scenes\nof unknown depths. This problem is further complicated by the volatile distance\nbetween the driver and the camera system. To tackle these challenges, we\ndevelop a novel convolutional network that simultaneously analyzes the image of\nthe scene and the image of the driver's face. This network has a camera\ncalibration module that can compute an embedding vector that represents the\nspatial configuration between the driver and the camera system. This\ncalibration module improves the overall network's performance, which can be\njointly trained end to end.\n We also address the lack of annotated data for training and evaluation by\nintroducing a large-scale driving dataset with point-of-gaze annotations. This\nis an in situ dataset of real driving sessions in an urban city, containing\nsynchronized images of the driving scene as well as the face and gaze of the\ndriver. Experiments on this dataset show that the proposed method outperforms\nvarious baseline methods, having the mean prediction error of 29.69 pixels,\nwhich is relatively small compared to the $1280{\\times}720$ resolution of the\nscene camera.\n","authors":["Dat Viet Thanh Nguyen","Anh Tran","Nam Vu","Cuong Pham","Minh Hoai"],"pdf_url":"https://arxiv.org/pdf/2404.07122v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10908v3","updated":"2024-04-10T15:59:31Z","published":"2023-12-18T03:34:07Z","title":"CLOVA: A Closed-Loop Visual Assistant with Tool Usage and Update","summary":" Utilizing large language models (LLMs) to compose off-the-shelf visual tools\nrepresents a promising avenue of research for developing robust visual\nassistants capable of addressing diverse visual tasks. However, these methods\noften overlook the potential for continual learning, typically by freezing the\nutilized tools, thus limiting their adaptation to environments requiring new\nknowledge. To tackle this challenge, we propose CLOVA, a Closed-Loop Visual\nAssistant, which operates within a framework encompassing inference,\nreflection, and learning phases. During the inference phase, LLMs generate\nprograms and execute corresponding tools to complete assigned tasks. In the\nreflection phase, a multimodal global-local reflection scheme analyzes human\nfeedback to determine which tools require updating. Lastly, the learning phase\nemploys three flexible approaches to automatically gather training data and\nintroduces a novel prompt tuning scheme to update the tools, allowing CLOVA to\nefficiently acquire new knowledge. Experimental findings demonstrate that CLOVA\nsurpasses existing tool-usage methods by 5% in visual question answering and\nmultiple-image reasoning, by 10% in knowledge tagging, and by 20% in image\nediting. 
These results underscore the significance of the continual learning\ncapability in general visual assistants.\n","authors":["Zhi Gao","Yuntao Du","Xintong Zhang","Xiaojian Ma","Wenjuan Han","Song-Chun Zhu","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2312.10908v3.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2312.11468v3","updated":"2024-04-10T15:58:09Z","published":"2023-11-13T20:41:48Z","title":"Bias-Reduced Neural Networks for Parameter Estimation in Quantitative\n MRI","summary":" Purpose: To develop neural network (NN)-based quantitative MRI parameter\nestimators with minimal bias and a variance close to the Cram\\'er-Rao bound.\n Theory and Methods: We generalize the mean squared error loss to control the\nbias and variance of the NN's estimates, which involves averaging over multiple\nnoise realizations of the same measurements during training. Bias and variance\nproperties of the resulting NNs are studied for two neuroimaging applications.\n Results: In simulations, the proposed strategy reduces the estimates' bias\nthroughout parameter space and achieves a variance close to the Cram\\'er-Rao\nbound. In vivo, we observe good concordance between parameter maps estimated\nwith the proposed NNs and traditional estimators, such as non-linear\nleast-squares fitting, while state-of-the-art NNs show larger deviations.\n Conclusion: The proposed NNs have greatly reduced bias compared to those\ntrained using the mean squared error and offer significantly improved\ncomputational efficiency over traditional estimators with comparable or better\naccuracy.\n","authors":["Andrew Mao","Sebastian Flassbeck","Jakob Assländer"],"pdf_url":"https://arxiv.org/pdf/2312.11468v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07112v1","updated":"2024-04-10T15:51:46Z","published":"2024-04-10T15:51:46Z","title":"Unfolding ADMM for Enhanced Subspace Clustering of Hyperspectral Images","summary":" Deep subspace clustering methods are now prominent in clustering, typically\nusing fully connected networks and a self-representation loss function.\nHowever, these methods often struggle with overfitting and lack\ninterpretability. In this paper, we explore an alternative clustering approach\nbased on deep unfolding. By unfolding iterative optimization methods into\nneural networks, this approach offers enhanced interpretability and reliability\ncompared to data-driven deep learning methods, and greater adaptability and\ngeneralization than model-based approaches. Hence, unfolding has become widely\nused in inverse imaging problems, such as image restoration, reconstruction,\nand super-resolution, but has not been sufficiently explored yet in the context\nof clustering. In this work, we introduce an innovative clustering architecture\nfor hyperspectral images (HSI) by unfolding an iterative solver based on the\nAlternating Direction Method of Multipliers (ADMM) for sparse subspace\nclustering. To our knowledge, this is the first attempt to apply unfolding ADMM\nfor computing the self-representation matrix in subspace clustering. Moreover,\nour approach captures well the structural characteristics of HSI data by\nemploying the K nearest neighbors algorithm as part of a structure preservation\nmodule. 
Experimental evaluation of three established HSI datasets shows clearly\nthe potential of the unfolding approach in HSI clustering and even demonstrates\nsuperior performance compared to state-of-the-art techniques.\n","authors":["Xianlu Li","Nicolas Nadisic","Shaoguang Huang","Aleksandra Pižurica"],"pdf_url":"https://arxiv.org/pdf/2404.07112v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07110v1","updated":"2024-04-10T15:47:35Z","published":"2024-04-10T15:47:35Z","title":"Wild Visual Navigation: Fast Traversability Learning via Pre-Trained\n Models and Online Self-Supervision","summary":" Natural environments such as forests and grasslands are challenging for\nrobotic navigation because of the false perception of rigid obstacles from high\ngrass, twigs, or bushes. In this work, we present Wild Visual Navigation (WVN),\nan online self-supervised learning system for visual traversability estimation.\nThe system is able to continuously adapt from a short human demonstration in\nthe field, only using onboard sensing and computing. One of the key ideas to\nachieve this is the use of high-dimensional features from pre-trained\nself-supervised models, which implicitly encode semantic information that\nmassively simplifies the learning task. Further, the development of an online\nscheme for supervision generator enables concurrent training and inference of\nthe learned model in the wild. We demonstrate our approach through diverse\nreal-world deployments in forests, parks, and grasslands. Our system is able to\nbootstrap the traversable terrain segmentation in less than 5 min of in-field\ntraining time, enabling the robot to navigate in complex, previously unseen\noutdoor terrains. Code: https://bit.ly/498b0CV - Project\npage:https://bit.ly/3M6nMHH\n","authors":["Matías Mattamala","Jonas Frey","Piotr Libera","Nived Chebrolu","Georg Martius","Cesar Cadena","Marco Hutter","Maurice Fallon"],"pdf_url":"https://arxiv.org/pdf/2404.07110v1.pdf","comment":"Extended version of arXiv:2305.08510"},{"id":"http://arxiv.org/abs/2404.07106v1","updated":"2024-04-10T15:45:03Z","published":"2024-04-10T15:45:03Z","title":"3DMambaComplete: Exploring Structured State Space Model for Point Cloud\n Completion","summary":" Point cloud completion aims to generate a complete and high-fidelity point\ncloud from an initially incomplete and low-quality input. A prevalent strategy\ninvolves leveraging Transformer-based models to encode global features and\nfacilitate the reconstruction process. However, the adoption of pooling\noperations to obtain global feature representations often results in the loss\nof local details within the point cloud. Moreover, the attention mechanism\ninherent in Transformers introduces additional computational complexity,\nrendering it challenging to handle long sequences effectively. To address these\nissues, we propose 3DMambaComplete, a point cloud completion network built on\nthe novel Mamba framework. It comprises three modules: HyperPoint Generation\nencodes point cloud features using Mamba's selection mechanism and predicts a\nset of Hyperpoints. A specific offset is estimated, and the down-sampled points\nbecome HyperPoints. The HyperPoint Spread module disperses these HyperPoints\nacross different spatial locations to avoid concentration. Finally, a\ndeformation method transforms the 2D mesh representation of HyperPoints into a\nfine-grained 3D structure for point cloud reconstruction. 
Extensive experiments\nconducted on various established benchmarks demonstrate that 3DMambaComplete\nsurpasses state-of-the-art point cloud completion methods, as confirmed by\nqualitative and quantitative analyses.\n","authors":["Yixuan Li","Weidong Yang","Ben Fei"],"pdf_url":"https://arxiv.org/pdf/2404.07106v1.pdf","comment":"10 pages, 8 figures, 7 tables"},{"id":"http://arxiv.org/abs/2404.07097v1","updated":"2024-04-10T15:37:00Z","published":"2024-04-10T15:37:00Z","title":"Learning Priors for Non Rigid SfM from Casual Videos","summary":" We tackle the long-standing challenge of reconstructing 3D structures and\ncamera positions from videos. The problem is particularly hard when objects are\ntransformed in a non-rigid way. Current approaches to this problem make\nunrealistic assumptions or require a long optimization time.\n We present TracksTo4D, a novel deep learning-based approach that enables\ninferring 3D structure and camera positions from dynamic content originating\nfrom in-the-wild videos using a single feed-forward pass on a sparse point\ntrack matrix. To achieve this, we leverage recent advances in 2D point tracking\nand design an equivariant neural architecture tailored for directly processing\n2D point tracks by leveraging their symmetries. TracksTo4D is trained on a\ndataset of in-the-wild videos utilizing only the 2D point tracks extracted from\nthe videos, without any 3D supervision. Our experiments demonstrate that\nTracksTo4D generalizes well to unseen videos of unseen semantic categories at\ninference time, producing equivalent results to state-of-the-art methods while\nsignificantly reducing the runtime compared to other baselines.\n","authors":["Yoni Kasten","Wuyue Lu","Haggai Maron"],"pdf_url":"https://arxiv.org/pdf/2404.07097v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07094v1","updated":"2024-04-10T15:34:10Z","published":"2024-04-10T15:34:10Z","title":"MoCap-to-Visual Domain Adaptation for Efficient Human Mesh Estimation\n from 2D Keypoints","summary":" This paper presents Key2Mesh, a model that takes a set of 2D human pose\nkeypoints as input and estimates the corresponding body mesh. Since this\nprocess does not involve any visual (i.e. RGB image) data, the model can be\ntrained on large-scale motion capture (MoCap) datasets, thereby overcoming the\nscarcity of image datasets with 3D labels. To enable the model's application on\nRGB images, we first run an off-the-shelf 2D pose estimator to obtain the 2D\nkeypoints, and then feed these 2D keypoints to Key2Mesh. To improve the\nperformance of our model on RGB images, we apply an adversarial domain\nadaptation (DA) method to bridge the gap between the MoCap and visual domains.\nCrucially, our DA method does not require 3D labels for visual data, which\nenables adaptation to target sets without the need for costly labels. We\nevaluate Key2Mesh for the task of estimating 3D human meshes from 2D keypoints,\nin the absence of RGB and mesh label pairs. Our results on widely used H3.6M\nand 3DPW datasets show that Key2Mesh sets the new state-of-the-art by\noutperforming other models in PA-MPJPE for both datasets, and in MPJPE and PVE\nfor the 3DPW dataset. Thanks to our model's simple architecture, it operates at\nleast 12x faster than the prior state-of-the-art model, LGD. 
Additional\nqualitative samples and code are available on the project website:\nhttps://key2mesh.github.io/.\n","authors":["Bedirhan Uguz","Ozhan Suat","Batuhan Karagoz","Emre Akbas"],"pdf_url":"https://arxiv.org/pdf/2404.07094v1.pdf","comment":"accepted to CVPRW 2024"},{"id":"http://arxiv.org/abs/2401.07745v2","updated":"2024-04-10T15:30:23Z","published":"2024-01-15T14:56:15Z","title":"MaskClustering: View Consensus based Mask Graph Clustering for\n Open-Vocabulary 3D Instance Segmentation","summary":" Open-vocabulary 3D instance segmentation is cutting-edge for its ability to\nsegment 3D instances without predefined categories. However, progress in 3D\nlags behind its 2D counterpart due to limited annotated 3D data. To address\nthis, recent works first generate 2D open-vocabulary masks through 2D models\nand then merge them into 3D instances based on metrics calculated between two\nneighboring frames. In contrast to these local metrics, we propose a novel\nmetric, view consensus rate, to enhance the utilization of multi-view\nobservations. The key insight is that two 2D masks should be deemed part of the\nsame 3D instance if a significant number of other 2D masks from different views\ncontain both these two masks. Using this metric as edge weight, we construct a\nglobal mask graph where each mask is a node. Through iterative clustering of\nmasks showing high view consensus, we generate a series of clusters, each\nrepresenting a distinct 3D instance. Notably, our model is training-free.\nThrough extensive experiments on publicly available datasets, including\nScanNet++, ScanNet200 and MatterPort3D, we demonstrate that our method achieves\nstate-of-the-art performance in open-vocabulary 3D instance segmentation. Our\nproject page is at https://pku-epic.github.io/MaskClustering.\n","authors":["Mi Yan","Jiazhao Zhang","Yan Zhu","He Wang"],"pdf_url":"https://arxiv.org/pdf/2401.07745v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02233v2","updated":"2024-04-10T15:22:05Z","published":"2024-04-02T18:40:55Z","title":"Visual Concept Connectome (VCC): Open World Concept Discovery and their\n Interlayer Connections in Deep Models","summary":" Understanding what deep network models capture in their learned\nrepresentations is a fundamental challenge in computer vision. We present a new\nmethodology to understanding such vision models, the Visual Concept Connectome\n(VCC), which discovers human interpretable concepts and their interlayer\nconnections in a fully unsupervised manner. Our approach simultaneously reveals\nfine-grained concepts at a layer, connection weightings across all layers and\nis amendable to global analysis of network structure (e.g., branching pattern\nof hierarchical concept assemblies). Previous work yielded ways to extract\ninterpretable concepts from single layers and examine their impact on\nclassification, but did not afford multilayer concept analysis across an entire\nnetwork architecture. Quantitative and qualitative empirical results show the\neffectiveness of VCCs in the domain of image classification. Also, we leverage\nVCCs for the application of failure mode debugging to reveal where mistakes\narise in deep networks.\n","authors":["Matthew Kowal","Richard P. Wildes","Konstantinos G. 
Derpanis"],"pdf_url":"https://arxiv.org/pdf/2404.02233v2.pdf","comment":"CVPR 2024 (Highlight)"},{"id":"http://arxiv.org/abs/2401.10831v3","updated":"2024-04-10T15:19:07Z","published":"2024-01-19T17:27:21Z","title":"Understanding Video Transformers via Universal Concept Discovery","summary":" This paper studies the problem of concept-based interpretability of\ntransformer representations for videos. Concretely, we seek to explain the\ndecision-making process of video transformers based on high-level,\nspatiotemporal concepts that are automatically discovered. Prior research on\nconcept-based interpretability has concentrated solely on image-level tasks.\nComparatively, video models deal with the added temporal dimension, increasing\ncomplexity and posing challenges in identifying dynamic concepts over time. In\nthis work, we systematically address these challenges by introducing the first\nVideo Transformer Concept Discovery (VTCD) algorithm. To this end, we propose\nan efficient approach for unsupervised identification of units of video\ntransformer representations - concepts, and ranking their importance to the\noutput of a model. The resulting concepts are highly interpretable, revealing\nspatio-temporal reasoning mechanisms and object-centric representations in\nunstructured video models. Performing this analysis jointly over a diverse set\nof supervised and self-supervised representations, we discover that some of\nthese mechanism are universal in video transformers. Finally, we show that VTCD\ncan be used for fine-grained action recognition and video object segmentation.\n","authors":["Matthew Kowal","Achal Dave","Rares Ambrus","Adrien Gaidon","Konstantinos G. Derpanis","Pavel Tokmakov"],"pdf_url":"https://arxiv.org/pdf/2401.10831v3.pdf","comment":"CVPR 2024 (Highlight)"},{"id":"http://arxiv.org/abs/2402.18320v2","updated":"2024-04-10T15:09:22Z","published":"2024-02-28T13:33:43Z","title":"Location-guided Head Pose Estimation for Fisheye Image","summary":" Camera with a fisheye or ultra-wide lens covers a wide field of view that\ncannot be modeled by the perspective projection. Serious fisheye lens\ndistortion in the peripheral region of the image leads to degraded performance\nof the existing head pose estimation models trained on undistorted images. This\npaper presents a new approach for head pose estimation that uses the knowledge\nof head location in the image to reduce the negative effect of fisheye\ndistortion. We develop an end-to-end convolutional neural network to estimate\nthe head pose with the multi-task learning of head pose and head location. Our\nproposed network estimates the head pose directly from the fisheye image\nwithout the operation of rectification or calibration. We also created a\nfisheye-distorted version of the three popular head pose estimation datasets,\nBIWI, 300W-LP, and AFLW2000 for our experiments. 
Experiments results show that\nour network remarkably improves the accuracy of head pose estimation compared\nwith other state-of-the-art one-stage and two-stage methods.\n","authors":["Bing Li","Dong Zhang","Cheng Huang","Yun Xian","Ming Li","Dah-Jye Lee"],"pdf_url":"https://arxiv.org/pdf/2402.18320v2.pdf","comment":"Revised Introduction and Related Work; Submitted to lEEE Transactions\n on Cognitive and Developmental Systems for review"},{"id":"http://arxiv.org/abs/2404.07078v1","updated":"2024-04-10T15:09:15Z","published":"2024-04-10T15:09:15Z","title":"VLLMs Provide Better Context for Emotion Understanding Through Common\n Sense Reasoning","summary":" Recognising emotions in context involves identifying the apparent emotions of\nan individual, taking into account contextual cues from the surrounding scene.\nPrevious approaches to this task have involved the design of explicit\nscene-encoding architectures or the incorporation of external scene-related\ninformation, such as captions. However, these methods often utilise limited\ncontextual information or rely on intricate training pipelines. In this work,\nwe leverage the groundbreaking capabilities of Vision-and-Large-Language Models\n(VLLMs) to enhance in-context emotion classification without introducing\ncomplexity to the training process in a two-stage approach. In the first stage,\nwe propose prompting VLLMs to generate descriptions in natural language of the\nsubject's apparent emotion relative to the visual context. In the second stage,\nthe descriptions are used as contextual information and, along with the image\ninput, are used to train a transformer-based architecture that fuses text and\nvisual features before the final classification task. Our experimental results\nshow that the text and image features have complementary information, and our\nfused architecture significantly outperforms the individual modalities without\nany complex training methods. We evaluate our approach on three different\ndatasets, namely, EMOTIC, CAER-S, and BoLD, and achieve state-of-the-art or\ncomparable accuracy across all datasets and metrics compared to much more\ncomplex approaches. The code will be made publicly available on github:\nhttps://github.com/NickyFot/EmoCommonSense.git\n","authors":["Alexandros Xenos","Niki Maria Foteinopoulou","Ioanna Ntinou","Ioannis Patras","Georgios Tzimiropoulos"],"pdf_url":"https://arxiv.org/pdf/2404.07078v1.pdf","comment":"A. Xenos, N. Foteinopoulou and I. Ntinou contributed equally to this\n work; 14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.07072v1","updated":"2024-04-10T15:02:26Z","published":"2024-04-10T15:02:26Z","title":"Implicit Multi-Spectral Transformer: An Lightweight and Effective\n Visible to Infrared Image Translation Model","summary":" In the field of computer vision, visible light images often exhibit low\ncontrast in low-light conditions, presenting a significant challenge. While\ninfrared imagery provides a potential solution, its utilization entails high\ncosts and practical limitations. Recent advancements in deep learning,\nparticularly the deployment of Generative Adversarial Networks (GANs), have\nfacilitated the transformation of visible light images to infrared images.\nHowever, these methods often experience unstable training phases and may\nproduce suboptimal outputs. To address these issues, we propose a novel\nend-to-end Transformer-based model that efficiently converts visible light\nimages into high-fidelity infrared images. 
Initially, the Texture Mapping\nModule and Color Perception Adapter collaborate to extract texture and color\nfeatures from the visible light image. The Dynamic Fusion Aggregation Module\nsubsequently integrates these features. Finally, the transformation into an\ninfrared image is refined through the synergistic action of the Color\nPerception Adapter and the Enhanced Perception Attention mechanism.\nComprehensive benchmarking experiments confirm that our model outperforms\nexisting methods, producing infrared images of markedly superior quality, both\nqualitatively and quantitatively. Furthermore, the proposed model enables more\neffective downstream applications for infrared images than other methods.\n","authors":["Yijia Chen","Pinghua Chen","Xiangxin Zhou","Yingtie Lei","Ziyang Zhou","Mingxian Li"],"pdf_url":"https://arxiv.org/pdf/2404.07072v1.pdf","comment":"Accepted by IJCNN 2024"},{"id":"http://arxiv.org/abs/2404.07045v1","updated":"2024-04-10T14:35:22Z","published":"2024-04-10T14:35:22Z","title":"Identification of Fine-grained Systematic Errors via Controlled Scene\n Generation","summary":" Many safety-critical applications, especially in autonomous driving, require\nreliable object detectors. They can be very effectively assisted by a method to\nsearch for and identify potential failures and systematic errors before these\ndetectors are deployed. Systematic errors are characterized by combinations of\nattributes such as object location, scale, orientation, and color, as well as\nthe composition of their respective backgrounds. To identify them, one must\nrely on something other than real images from a test set because they do not\naccount for very rare but possible combinations of attributes. To overcome this\nlimitation, we propose a pipeline for generating realistic synthetic scenes\nwith fine-grained control, allowing the creation of complex scenes with\nmultiple objects. Our approach, BEV2EGO, allows for a realistic generation of\nthe complete scene with road-contingent control that maps 2D bird's-eye view\n(BEV) scene configurations to a first-person view (EGO). In addition, we\npropose a benchmark for controlled scene generation to select the most\nappropriate generative outpainting model for BEV2EGO. We further use it to\nperform a systematic analysis of multiple state-of-the-art object detection\nmodels and discover differences between them.\n","authors":["Valentyn Boreiko","Matthias Hein","Jan Hendrik Metzen"],"pdf_url":"https://arxiv.org/pdf/2404.07045v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07032v1","updated":"2024-04-10T14:25:23Z","published":"2024-04-10T14:25:23Z","title":"An Evidential-enhanced Tri-Branch Consistency Learning Method for\n Semi-supervised Medical Image Segmentation","summary":" Semi-supervised segmentation presents a promising approach for large-scale\nmedical image analysis, effectively reducing annotation burdens while achieving\ncomparable performance. This methodology holds substantial potential for\nstreamlining the segmentation process and enhancing its feasibility within\nclinical settings for translational investigations. While cross-supervised\ntraining, based on distinct co-training sub-networks, has become a prevalent\nparadigm for this task, addressing critical issues such as predication\ndisagreement and label-noise suppression requires further attention and\nprogress in cross-supervised training. 
In this paper, we introduce an\nEvidential Tri-Branch Consistency learning framework (ETC-Net) for\nsemi-supervised medical image segmentation. ETC-Net employs three branches: an\nevidential conservative branch, an evidential progressive branch, and an\nevidential fusion branch. The first two branches exhibit complementary\ncharacteristics, allowing them to address prediction diversity and enhance\ntraining stability. We also integrate uncertainty estimation from the\nevidential learning into cross-supervised training, mitigating the negative\nimpact of erroneous supervision signals. Additionally, the evidential fusion\nbranch capitalizes on the complementary attributes of the first two branches\nand leverages an evidence-based Dempster-Shafer fusion strategy, supervised by\nmore reliable and accurate pseudo-labels of unlabeled data. Extensive\nexperiments conducted on LA, Pancreas-CT, and ACDC datasets demonstrate that\nETC-Net surpasses other state-of-the-art methods for semi-supervised\nsegmentation. The code will be made available in the near future at\nhttps://github.com/Medsemiseg.\n","authors":["Zhenxi Zhang","Heng Zhou","Xiaoran Shi","Ran Ran","Chunna Tian","Feng Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.07032v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10166v2","updated":"2024-04-10T14:25:12Z","published":"2024-01-18T17:55:39Z","title":"VMamba: Visual State Space Model","summary":" Convolutional Neural Networks (CNNs) and Vision Transformers (ViTs) have long\nbeen the predominant backbone networks for visual representation learning.\nWhile ViTs have recently gained prominence over CNNs due to their superior\nfitting capabilities, their scalability is largely constrained by the quadratic\ncomplexity of attention computation. Inspired by the capability of Mamba in\nefficiently modeling long sequences, we propose VMamba, a generic vision\nbackbone model aiming to reduce the computational complexity to linear while\nretaining ViTs' advantageous features. To enhance VMamba's adaptability in\nprocessing vision data, we introduce the Cross-Scan Module (CSM) to enable 1D\nselective scanning in 2D image space with global receptive fields.\nAdditionally, we make further improvements in implementation details and\narchitectural designs to enhance VMamba's performance and boost its inference\nspeed. Extensive experimental results demonstrate VMamba's promising\nperformance across various visual perception tasks, highlighting its pronounced\nadvantages in input scaling efficiency compared to existing benchmark models.\nSource code is available at https://github.com/MzeroMiko/VMamba.\n","authors":["Yue Liu","Yunjie Tian","Yuzhong Zhao","Hongtian Yu","Lingxi Xie","Yaowei Wang","Qixiang Ye","Yunfan Liu"],"pdf_url":"https://arxiv.org/pdf/2401.10166v2.pdf","comment":"21 pages, 12 figures, 5 tables"},{"id":"http://arxiv.org/abs/2404.07031v1","updated":"2024-04-10T14:24:10Z","published":"2024-04-10T14:24:10Z","title":"ORacle: Large Vision-Language Models for Knowledge-Guided Holistic OR\n Domain Modeling","summary":" Every day, countless surgeries are performed worldwide, each within the\ndistinct settings of operating rooms (ORs) that vary not only in their setups\nbut also in the personnel, tools, and equipment used. This inherent diversity\nposes a substantial challenge for achieving a holistic understanding of the OR,\nas it requires models to generalize beyond their initial training datasets. 
To\nreduce this gap, we introduce ORacle, an advanced vision-language model\ndesigned for holistic OR domain modeling, which incorporates multi-view and\ntemporal capabilities and can leverage external knowledge during inference,\nenabling it to adapt to previously unseen surgical scenarios. This capability\nis further enhanced by our novel data augmentation framework, which\nsignificantly diversifies the training dataset, ensuring ORacle's proficiency\nin applying the provided knowledge effectively. In rigorous testing, in scene\ngraph generation, and downstream tasks on the 4D-OR dataset, ORacle not only\ndemonstrates state-of-the-art performance but does so requiring less data than\nexisting models. Furthermore, its adaptability is displayed through its ability\nto interpret unseen views, actions, and appearances of tools and equipment.\nThis demonstrates ORacle's potential to significantly enhance the scalability\nand affordability of OR domain modeling and opens a pathway for future\nadvancements in surgical data science. We will release our code and data upon\nacceptance.\n","authors":["Ege Özsoy","Chantal Pellegrini","Matthias Keicher","Nassir Navab"],"pdf_url":"https://arxiv.org/pdf/2404.07031v1.pdf","comment":"11 pages, 3 figures, 7 tables"},{"id":"http://arxiv.org/abs/2404.07029v1","updated":"2024-04-10T14:22:16Z","published":"2024-04-10T14:22:16Z","title":"Diffusion-based inpainting of incomplete Euclidean distance matrices of\n trajectories generated by a fractional Brownian motion","summary":" Fractional Brownian trajectories (fBm) feature both randomness and strong\nscale-free correlations, challenging generative models to reproduce the\nintrinsic memory characterizing the underlying process. Here we test a\ndiffusion probabilistic model on a specific dataset of corrupted images\ncorresponding to incomplete Euclidean distance matrices of fBm at various\nmemory exponents $H$. Our dataset implies uniqueness of the data imputation in\nthe regime of low missing ratio, where the remaining partial graph is rigid,\nproviding the ground truth for the inpainting. We find that the conditional\ndiffusion generation stably reproduces the statistics of missing\nfBm-distributed distances for different values of $H$ exponent. Furthermore,\nwhile diffusion models have been recently shown to remember samples from the\ntraining database, we show that diffusion-based inpainting behaves\nqualitatively different from the database search with the increasing database\nsize. Finally, we apply our fBm-trained diffusion model with $H=1/3$ for\ncompletion of chromosome distance matrices obtained in single-cell microscopy\nexperiments, showing its superiority over the standard bioinformatics\nalgorithms. Our source code is available on GitHub at\nhttps://github.com/alobashev/diffusion_fbm.\n","authors":["Alexander Lobashev","Kirill Polovnikov"],"pdf_url":"https://arxiv.org/pdf/2404.07029v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10144v4","updated":"2024-04-10T13:58:08Z","published":"2023-12-15T19:00:07Z","title":"Data-Efficient Multimodal Fusion on a Single GPU","summary":" The goal of multimodal alignment is to learn a single latent space that is\nshared between multimodal inputs. The most powerful models in this space have\nbeen trained using massive datasets of paired inputs and large-scale\ncomputational resources, making them prohibitively expensive to train in many\npractical scenarios. 
We surmise that existing unimodal encoders pre-trained on\nlarge amounts of unimodal data should provide an effective bootstrap to create\nmultimodal models from unimodal ones at much lower costs. We therefore propose\nFuseMix, a multimodal augmentation scheme that operates on the latent spaces of\narbitrary pre-trained unimodal encoders. Using FuseMix for multimodal\nalignment, we achieve competitive performance -- and in certain cases\noutperform state-of-the art methods -- in both image-text and audio-text\nretrieval, with orders of magnitude less compute and data: for example, we\noutperform CLIP on the Flickr30K text-to-image retrieval task with $\\sim \\!\n600\\times$ fewer GPU days and $\\sim \\! 80\\times$ fewer image-text pairs.\nAdditionally, we show how our method can be applied to convert pre-trained\ntext-to-image generative models into audio-to-image ones. Code is available at:\nhttps://github.com/layer6ai-labs/fusemix.\n","authors":["Noël Vouitsis","Zhaoyan Liu","Satya Krishna Gorti","Valentin Villecroze","Jesse C. Cresswell","Guangwei Yu","Gabriel Loaiza-Ganem","Maksims Volkovs"],"pdf_url":"https://arxiv.org/pdf/2312.10144v4.pdf","comment":"CVPR 2024 (Highlight)"},{"id":"http://arxiv.org/abs/2307.12256v2","updated":"2024-04-10T13:43:54Z","published":"2023-07-23T08:02:37Z","title":"Building-road Collaborative Extraction from Remotely Sensed Images via\n Cross-Interaction","summary":" Buildings are the basic carrier of social production and human life; roads\nare the links that interconnect social networks. Building and road information\nhas important application value in the frontier fields of regional coordinated\ndevelopment, disaster prevention, auto-driving, etc. Mapping buildings and\nroads from very high-resolution (VHR) remote sensing images have become a hot\nresearch topic. However, the existing methods often ignore the strong spatial\ncorrelation between roads and buildings and extract them in isolation. To fully\nutilize the complementary advantages between buildings and roads, we propose a\nbuilding-road collaborative extraction method based on multi-task and\ncross-scale feature interaction to improve the accuracy of both tasks in a\ncomplementary way. A multi-task interaction module is proposed to interact\ninformation across tasks and preserve the unique information of each task,\nwhich tackle the seesaw phenomenon in multitask learning. By considering the\nvariation in appearance and structure between buildings and roads, a\ncross-scale interaction module is designed to automatically learn the optimal\nreception field for different tasks. Compared with many existing methods that\ntrain each task individually, the proposed collaborative extraction method can\nutilize the complementary advantages between buildings and roads by the\nproposed inter-task and inter-scale feature interactions, and automatically\nselect the optimal reception field for different tasks. 
Experiments on a wide\nrange of urban and rural scenarios show that the proposed algorithm can achieve\nbuilding-road extraction with outstanding performance and efficiency.\n","authors":["Haonan Guo","Xin Su","Chen Wu","Bo Du","Liangpei Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.12256v2.pdf","comment":"IEEE Transactions on Geoscience and Remote Sensing"},{"id":"http://arxiv.org/abs/2312.07937v5","updated":"2024-04-10T13:35:51Z","published":"2023-12-13T07:30:19Z","title":"BOTH2Hands: Inferring 3D Hands from Both Text Prompts and Body Dynamics","summary":" The recently emerging text-to-motion advances have spurred numerous attempts\nat convenient and interactive human motion generation. Yet, existing methods\nare largely limited to generating body motions only without considering the\nrich two-hand motions, let alone handling various conditions like body dynamics\nor texts. To break the data bottleneck, we propose BOTH57M, a novel multi-modal\ndataset for two-hand motion generation. Our dataset includes accurate motion\ntracking for the human body and hands and provides pairwise finger-level hand\nannotations and body descriptions. We further provide a strong baseline method,\nBOTH2Hands, for the novel task: generating vivid two-hand motions from both\nimplicit body dynamics and explicit text prompts. We first warm up two parallel\nbody-to-hand and text-to-hand diffusion models and then utilize the\ncross-attention transformer for motion blending. Extensive experiments and\ncross-validations demonstrate the effectiveness of our approach and dataset for\ngenerating convincing two-hand motions from the hybrid body-and-textual\nconditions. Our dataset and code will be disseminated to the community for\nfuture research.\n","authors":["Wenqian Zhang","Molin Huang","Yuxuan Zhou","Juze Zhang","Jingyi Yu","Jingya Wang","Lan Xu"],"pdf_url":"https://arxiv.org/pdf/2312.07937v5.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.05317v2","updated":"2024-04-10T13:30:09Z","published":"2024-04-08T09:08:43Z","title":"WebXR, A-Frame and Networked-Aframe as a Basis for an Open Metaverse: A\n Conceptual Architecture","summary":" This work proposes a WebXR-based cross-platform conceptual architecture,\nleveraging the A-Frame and Networked-Aframe frameworks, in order to facilitate\nthe development of an open, accessible, and interoperable metaverse. 
By\nintroducing the concept of spatial web app, this research contributes to the\ndiscourse on the metaverse, offering an architecture that democratizes access\nto virtual environments and extended reality through the web, and aligns with\nTim Berners-Lee's original vision of the World Wide Web as an open platform in\nthe digital realm.\n","authors":["Giuseppe Macario"],"pdf_url":"https://arxiv.org/pdf/2404.05317v2.pdf","comment":"minor fixes (typos, URLs etc.)"},{"id":"http://arxiv.org/abs/2309.06067v6","updated":"2024-04-10T13:17:52Z","published":"2023-09-12T09:07:03Z","title":"Implicit Neural Representation for MRI Parallel Imaging Reconstruction","summary":" Magnetic resonance imaging (MRI) usually faces lengthy acquisition times,\nprompting the exploration of strategies such as parallel imaging (PI) to\nalleviate this problem by periodically skipping specific K-space lines and\nsubsequently reconstructing high-quality images from the undersampled K-space.\nImplicit neural representation (INR) has recently emerged as a promising deep\nlearning technique, characterizing objects as continuous functions of spatial\ncoordinates typically parameterized by a multilayer perceptron (MLP). In this\nstudy, we propose a novel MRI PI reconstruction method that uses INR. Our\napproach represents reconstructed fully-sampled images as functions of voxel\ncoordinates and prior feature vectors from undersampled images, addressing the\ngeneralization challenges of INR. Specifically, we introduce a scale-embedded\nencoder to generate scale-independent, voxel-specific features from MR images\nacross various undersampling scales. These features are then concatenated with\ncoordinate vectors to reconstruct fully-sampled MR images, facilitating\nmultiple-scale reconstructions. To evaluate our method's performance, we\nconducted experiments using publicly available MRI datasets, comparing it with\nalternative reconstruction techniques. Our quantitative assessment demonstrates\nthe superiority of our proposed method.\n","authors":["Hao Li","Yusheng Zhou","Jianan Liu","Xiling Liu","Tao Huang","Zhihan Lv","Weidong Cai"],"pdf_url":"https://arxiv.org/pdf/2309.06067v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12220v2","updated":"2024-04-10T13:15:41Z","published":"2023-07-23T03:55:13Z","title":"Expediting Building Footprint Extraction from High-resolution Remote\n Sensing Images via progressive lenient supervision","summary":" The efficacy of building footprint segmentation from remotely sensed images\nhas been hindered by model transfer effectiveness. Many existing building\nsegmentation methods were developed upon the encoder-decoder architecture of\nU-Net, in which the encoder is finetuned from the newly developed backbone\nnetworks that are pre-trained on ImageNet. However, the heavy computational\nburden of the existing decoder designs hampers the successful transfer of these\nmodern encoder networks to remote sensing tasks. Even the widely-adopted deep\nsupervision strategy fails to mitigate these challenges due to its invalid loss\nin hybrid regions where foreground and background pixels are intermixed. 
In\nthis paper, we conduct a comprehensive evaluation of existing decoder network\ndesigns for building footprint segmentation and propose an efficient framework\ndenoted as BFSeg to enhance learning efficiency and effectiveness.\nSpecifically, a densely-connected coarse-to-fine feature fusion decoder network\nthat facilitates easy and fast feature fusion across scales is proposed.\nMoreover, considering the invalidity of hybrid regions in the down-sampled\nground truth during the deep supervision process, we present a lenient deep\nsupervision and distillation strategy that enables the network to learn proper\nknowledge from deep supervision. Building upon these advancements, we have\ndeveloped a new family of building segmentation networks, which consistently\nsurpass prior works with outstanding performance and efficiency across a wide\nrange of newly developed encoder networks.\n","authors":["Haonan Guo","Bo Du","Chen Wu","Xin Su","Liangpei Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.12220v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06991v1","updated":"2024-04-10T13:10:52Z","published":"2024-04-10T13:10:52Z","title":"Ray-driven Spectral CT Reconstruction Based on Neural Base-Material\n Fields","summary":" In spectral CT reconstruction, the basis materials decomposition involves\nsolving a large-scale nonlinear system of integral equations, which is highly\nill-posed mathematically. This paper proposes a model that parameterizes the\nattenuation coefficients of the object using a neural field representation,\nthereby avoiding the complex calculations of pixel-driven projection\ncoefficient matrices during the discretization process of line integrals. It\nintroduces a lightweight discretization method for line integrals based on a\nray-driven neural field, enhancing the accuracy of the integral approximation\nduring the discretization process. The basis materials are represented as\ncontinuous vector-valued implicit functions to establish a neural field\nparameterization model for the basis materials. The auto-differentiation\nframework of deep learning is then used to solve the implicit continuous\nfunction of the neural base-material fields. This method is not limited by the\nspatial resolution of reconstructed images, and the network has compact and\nregular properties. Experimental validation shows that our method performs\nexceptionally well in addressing the spectral CT reconstruction. Additionally,\nit fulfils the requirements for the generation of high-resolution\nreconstruction images.\n","authors":["Ligen Shi","Chang Liu","Ping Yang","Jun Qiu","Xing Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.06991v1.pdf","comment":"14 pages,16 figures"},{"id":"http://arxiv.org/abs/2404.01563v2","updated":"2024-04-10T13:02:59Z","published":"2024-04-02T01:57:08Z","title":"Two-Phase Multi-Dose-Level PET Image Reconstruction with Dose Level\n Awareness","summary":" To obtain high-quality positron emission tomography (PET) while minimizing\nradiation exposure, a range of methods have been designed to reconstruct\nstandard-dose PET (SPET) from corresponding low-dose PET (LPET) images.\nHowever, most current methods merely learn the mapping between\nsingle-dose-level LPET and SPET images, but omit the dose disparity of LPET\nimages in clinical scenarios. 
In this paper, to reconstruct high-quality SPET\nimages from multi-dose-level LPET images, we design a novel two-phase\nmulti-dose-level PET reconstruction algorithm with dose level awareness,\ncontaining a pre-training phase and a SPET prediction phase. Specifically, the\npre-training phase is devised to explore both fine-grained discriminative\nfeatures and effective semantic representation. The SPET prediction phase\nadopts a coarse prediction network utilizing a pre-learned dose-level prior to\ngenerate a preliminary result, and a refinement network to precisely preserve the\ndetails. Experiments on the MICCAI 2022 Ultra-low Dose PET Imaging Challenge\nDataset have demonstrated the superiority of our method.\n","authors":["Yuchen Fei","Yanmei Luo","Yan Wang","Jiaqi Cui","Yuanyuan Xu","Jiliu Zhou","Dinggang Shen"],"pdf_url":"https://arxiv.org/pdf/2404.01563v2.pdf","comment":"Accepted by ISBI2024"},{"id":"http://arxiv.org/abs/2404.06033v2","updated":"2024-04-10T12:55:49Z","published":"2024-04-09T05:44:00Z","title":"Little Strokes Fell Great Oaks: Boosting the Hierarchical Features for\n Multi-exposure Image Fusion","summary":" In recent years, deep learning networks have made remarkable strides in the\ndomain of multi-exposure image fusion. Nonetheless, prevailing approaches often\ninvolve directly feeding over-exposed and under-exposed images into the\nnetwork, which leads to the under-utilization of inherent information present\nin the source images. Additionally, unsupervised techniques predominantly\nemploy rudimentary weighted summation for color channel processing, culminating\nin an overall desaturated final image tone. To partially mitigate these issues,\nthis study proposes a gamma correction module specifically designed to fully\nleverage latent information embedded within source images. Furthermore, a\nmodified transformer block, incorporating self-attention mechanisms, is\nintroduced to optimize the fusion process. Ultimately, a novel color\nenhancement algorithm is presented to augment image saturation while preserving\nintricate details. The source code is available at\nhttps://github.com/ZhiyingDu/BHFMEF.\n","authors":["Pan Mu","Zhiying Du","Jinyuan Liu","Cong Bai"],"pdf_url":"https://arxiv.org/pdf/2404.06033v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02265v2","updated":"2024-04-10T12:54:12Z","published":"2023-10-03T17:59:58Z","title":"DREAM: Visual Decoding from Reversing Human Visual System","summary":" In this work, we present DREAM, an fMRI-to-image method for reconstructing\nviewed images from brain activities, grounded on fundamental knowledge of the\nhuman visual system. We craft reverse pathways that emulate the hierarchical\nand parallel nature of how humans perceive the visual world. These tailored\npathways are specialized to decipher semantics, color, and depth cues from fMRI\ndata, mirroring the forward pathways from visual stimuli to fMRI recordings. To\ndo so, two components mimic the inverse processes within the human visual\nsystem: the Reverse Visual Association Cortex (R-VAC) which reverses pathways\nof this brain region, extracting semantics from fMRI data; the Reverse Parallel\nPKM (R-PKM) component simultaneously predicting color and depth from fMRI\nsignals. The experiments indicate that our method outperforms the current\nstate-of-the-art models in terms of the consistency of appearance, structure,\nand semantics. 
Code will be made publicly available to facilitate further\nresearch in this field.\n","authors":["Weihao Xia","Raoul de Charette","Cengiz Öztireli","Jing-Hao Xue"],"pdf_url":"https://arxiv.org/pdf/2310.02265v2.pdf","comment":"Project Page: https://weihaox.github.io/DREAM"},{"id":"http://arxiv.org/abs/2404.06977v1","updated":"2024-04-10T12:45:27Z","published":"2024-04-10T12:45:27Z","title":"Accurate Tennis Court Line Detection on Amateur Recorded Matches","summary":" Typically, tennis court line detection is done by running\nHough-Line-Detection to find straight lines in the image, and then computing a\ntransformation matrix from the detected lines to create the final court\nstructure. We propose numerous improvements and enhancements to this algorithm,\nincluding using pretrained State-of-the-Art shadow-removal and object-detection\nML models to make our line-detection more robust. Compared to the original\nalgorithm, our method can accurately detect lines on amateur, dirty courts.\nWhen combined with a robust ball-tracking system, our method will enable\naccurate, automatic refereeing for amateur and professional tennis matches\nalike.\n","authors":["Sameer Agrawal","Ragoth Sundararajan","Vishak Sagar"],"pdf_url":"https://arxiv.org/pdf/2404.06977v1.pdf","comment":"Accepted to 5th International conference on Image, Video Processing\n and Artificial Intelligence"},{"id":"http://arxiv.org/abs/2404.06971v1","updated":"2024-04-10T12:31:43Z","published":"2024-04-10T12:31:43Z","title":"TrajPRed: Trajectory Prediction with Region-based Relation Learning","summary":" Forecasting human trajectories in traffic scenes is critical for safety\nwithin mixed or fully autonomous systems. Human future trajectories are driven\nby two major stimuli, social interactions, and stochastic goals. Thus, reliable\nforecasting needs to capture these two stimuli. Edge-based relation modeling\nrepresents social interactions using pairwise correlations from precise\nindividual states. Nevertheless, edge-based relations can be vulnerable under\nperturbations. To alleviate these issues, we propose a region-based relation\nlearning paradigm that models social interactions via region-wise dynamics of\njoint states, i.e., the changes in the density of crowds. In particular,\nregion-wise agent joint information is encoded within convolutional feature\ngrids. Social relations are modeled by relating the temporal changes of local\njoint information from a global perspective. We show that region-based\nrelations are less susceptible to perturbations. In order to account for the\nstochastic individual goals, we exploit a conditional variational autoencoder\nto realize multi-goal estimation and diverse future prediction. Specifically,\nwe perform variational inference via the latent distribution, which is\nconditioned on the correlation between input states and associated target\ngoals. Sampling from the latent distribution enables the framework to reliably\ncapture the stochastic behavior in test data. We integrate multi-goal\nestimation and region-based relation learning to model the two stimuli, social\ninteractions, and stochastic goals, in a prediction framework. We evaluate our\nframework on the ETH-UCY dataset and Stanford Drone Dataset (SDD). We show that\nthe diverse prediction better fits the ground truth when incorporating the\nrelation module. 
Our framework outperforms the state-of-the-art models on SDD\nby $27.61\\%$/$18.20\\%$ of ADE/FDE metrics.\n","authors":["Chen Zhou","Ghassan AlRegib","Armin Parchami","Kunjan Singh"],"pdf_url":"https://arxiv.org/pdf/2404.06971v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06963v1","updated":"2024-04-10T12:22:19Z","published":"2024-04-10T12:22:19Z","title":"V-MAD: Video-based Morphing Attack Detection in Operational Scenarios","summary":" In response to the rising threat of the face morphing attack, this paper\nintroduces and explores the potential of Video-based Morphing Attack Detection\n(V-MAD) systems in real-world operational scenarios. While current morphing\nattack detection methods primarily focus on a single or a pair of images, V-MAD\nis based on video sequences, exploiting the video streams often acquired by\nface verification tools available, for instance, at airport gates. Through this\nstudy, we show for the first time the advantages that the availability of\nmultiple probe frames can bring to the morphing attack detection task,\nespecially in scenarios where the quality of probe images is varied and might\nbe affected, for instance, by pose or illumination variations. Experimental\nresults on a real operational database demonstrate that video sequences\nrepresent valuable information for increasing the robustness and performance of\nmorphing attack detection systems.\n","authors":["Guido Borghi","Annalisa Franco","Nicolò Di Domenico","Matteo Ferrara","Davide Maltoni"],"pdf_url":"https://arxiv.org/pdf/2404.06963v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06957v1","updated":"2024-04-10T12:17:25Z","published":"2024-04-10T12:17:25Z","title":"Adversarial purification for no-reference image-quality metrics:\n applicability study and new methods","summary":" Recently, the area of adversarial attacks on image quality metrics has begun\nto be explored, whereas the area of defences remains under-researched. In this\nstudy, we aim to cover that case and check the transferability of adversarial\npurification defences from image classifiers to IQA methods. In this paper, we\napply several widespread attacks on IQA models and examine the success of the\ndefences against them. The purification methodologies covered different\npreprocessing techniques, including geometrical transformations, compression,\ndenoising, and modern neural network-based methods. Also, we address the\nchallenge of assessing the efficacy of a defensive methodology by proposing\nways to estimate output visual quality and the success of neutralizing attacks.\nDefences were tested against attack on three IQA metrics -- Linearity, MetaIQA\nand SPAQ. The code for attacks and defences is available at: (link is hidden\nfor a blind review).\n","authors":["Aleksandr Gushchin","Anna Chistyakova","Vladislav Minashkin","Anastasia Antsiferova","Dmitriy Vatolin"],"pdf_url":"https://arxiv.org/pdf/2404.06957v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.04350v3","updated":"2024-04-10T11:58:24Z","published":"2024-01-09T04:33:03Z","title":"Pre-trained Model Guided Fine-Tuning for Zero-Shot Adversarial\n Robustness","summary":" Large-scale pre-trained vision-language models like CLIP have demonstrated\nimpressive performance across various tasks, and exhibit remarkable zero-shot\ngeneralization capability, while they are also vulnerable to imperceptible\nadversarial examples. Existing works typically employ adversarial training\n(fine-tuning) as a defense method against adversarial examples. 
However, direct\napplication to the CLIP model may result in overfitting, compromising the\nmodel's capacity for generalization. In this paper, we propose Pre-trained\nModel Guided Adversarial Fine-Tuning (PMG-AFT) method, which leverages\nsupervision from the original pre-trained model by carefully designing an\nauxiliary branch, to enhance the model's zero-shot adversarial robustness.\nSpecifically, PMG-AFT minimizes the distance between the features of\nadversarial examples in the target model and those in the pre-trained model,\naiming to preserve the generalization features already captured by the\npre-trained model. Extensive Experiments on 15 zero-shot datasets demonstrate\nthat PMG-AFT significantly outperforms the state-of-the-art method, improving\nthe top-1 robust accuracy by an average of 4.99%. Furthermore, our approach\nconsistently improves clean accuracy by an average of 8.72%. Our code is\navailable at\nhttps://github.com/serendipity1122/Pre-trained-Model-Guided-Fine-Tuning-for-Zero-Shot-Adversarial-Robustness.\n","authors":["Sibo Wang","Jie Zhang","Zheng Yuan","Shiguang Shan"],"pdf_url":"https://arxiv.org/pdf/2401.04350v3.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2312.06275v3","updated":"2024-04-10T11:49:05Z","published":"2023-12-11T10:26:21Z","title":"DG-TTA: Out-of-domain medical image segmentation through Domain\n Generalization and Test-Time Adaptation","summary":" Applying pre-trained medical segmentation models on out-of-domain images\noften yields predictions of insufficient quality. Several strategies have been\nproposed to maintain model performance, such as finetuning or unsupervised- and\nsource-free domain adaptation. These strategies set restrictive requirements\nfor data availability. In this study, we propose to combine domain\ngeneralization and test-time adaptation to create a highly effective approach\nfor reusing pre-trained models in unseen target domains. Domain-generalized\npre-training on source data is used to obtain the best initial performance in\nthe target domain. We introduce the MIND descriptor previously used in image\nregistration tasks as a further technique to achieve generalization and present\nsuperior performance for small-scale datasets compared to existing approaches.\nAt test-time, high-quality segmentation for every single unseen scan is ensured\nby optimizing the model weights for consistency given different image\naugmentations. That way, our method enables separate use of source and target\ndata and thus removes current data availability barriers. Moreover, the\npresented method is highly modular as it does not require specific model\narchitectures or prior knowledge of involved domains and labels. We demonstrate\nthis by integrating it into the nnUNet, which is currently the most popular and\naccurate framework for medical image segmentation. We employ multiple datasets\ncovering abdominal, cardiac, and lumbar spine scans and compose several\nout-of-domain scenarios in this study. We demonstrate that our method, combined\nwith pre-trained whole-body CT models, can effectively segment MR images with\nhigh accuracy in all of the aforementioned scenarios. Open-source code can be\nfound here: https://github.com/multimodallearning/DG-TTA\n","authors":["Christian Weihsbach","Christian N. Kruse","Alexander Bigalke","Mattias P. 
Heinrich"],"pdf_url":"https://arxiv.org/pdf/2312.06275v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06941v1","updated":"2024-04-10T11:47:51Z","published":"2024-04-10T11:47:51Z","title":"Accelerating Cardiac MRI Reconstruction with CMRatt: An Attention-Driven\n Approach","summary":" Cine cardiac magnetic resonance (CMR) imaging is recognised as the benchmark\nmodality for the comprehensive assessment of cardiac function. Nevertheless,\nthe acquisition process of cine CMR is considered an impediment due to its\nprolonged scanning time. One commonly used strategy to expedite the acquisition\nprocess is through k-space undersampling, though it comes with the drawback of\nintroducing aliasing effects in the reconstructed image. Lately, deep\nlearning-based methods have shown remarkable results over traditional\napproaches in rapidly achieving precise CMR reconstructed images. This study\naims to explore the untapped potential of attention mechanisms incorporated\nwith a deep learning model within the context of the CMR reconstruction\nproblem. We are motivated by the fact that attention has proven beneficial in\ndownstream tasks such as image classification and segmentation, but has not\nbeen systematically analysed in the context of CMR reconstruction. Our primary\ngoal is to identify the strengths and potential limitations of attention\nalgorithms when integrated with a convolutional backbone model such as a U-Net.\nTo achieve this, we benchmark different state-of-the-art spatial and channel\nattention mechanisms on the CMRxRecon dataset and quantitatively evaluate the\nquality of reconstruction using objective metrics. Furthermore, inspired by the\nbest-performing attention mechanism, we propose a new, simple yet effective,\nattention pipeline specifically optimised for the task of cardiac image\nreconstruction that outperforms other state-of-the-art attention methods. The\nlayer and model code will be made publicly available.\n","authors":["Anam Hashmi","Julia Dietlmeier","Kathleen M. Curran","Noel E. O'Connor"],"pdf_url":"https://arxiv.org/pdf/2404.06941v1.pdf","comment":"This paper has been submitted for the 32nd European Signal Processing\n Conference EUSIPCO 2024 in Lyon"},{"id":"http://arxiv.org/abs/2306.10798v3","updated":"2024-04-10T11:42:22Z","published":"2023-06-19T09:38:21Z","title":"ExpPoint-MAE: Better interpretability and performance for\n self-supervised point cloud transformers","summary":" In this paper we delve into the properties of transformers, attained through\nself-supervision, in the point cloud domain. Specifically, we evaluate the\neffectiveness of Masked Autoencoding as a pretraining scheme, and explore\nMomentum Contrast as an alternative. In our study we investigate the impact of\ndata quantity on the learned features, and uncover similarities in the\ntransformer's behavior across domains. Through comprehensive visualizations, we\nobserve that the transformer learns to attend to semantically meaningful\nregions, indicating that pretraining leads to a better understanding of the\nunderlying geometry. Moreover, we examine the finetuning process and its effect\non the learned representations. 
Based on that, we devise an unfreezing strategy\nwhich consistently outperforms our baseline without introducing any other\nmodifications to the model or the training pipeline, and achieve\nstate-of-the-art results in the classification task among transformer models.\n","authors":["Ioannis Romanelis","Vlassis Fotis","Konstantinos Moustakas","Adrian Munteanu"],"pdf_url":"https://arxiv.org/pdf/2306.10798v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06936v1","updated":"2024-04-10T11:40:02Z","published":"2024-04-10T11:40:02Z","title":"Efficient and Generic Point Model for Lossless Point Cloud Attribute\n Compression","summary":" The past several years have witnessed the emergence of learned point cloud\ncompression (PCC) techniques. However, current learning-based lossless point\ncloud attribute compression (PCAC) methods either suffer from high\ncomputational complexity or deteriorated compression performance. Moreover, the\nsignificant variations in point cloud scale and sparsity encountered in\nreal-world applications make developing an all-in-one neural model a\nchallenging task. In this paper, we propose PoLoPCAC, an efficient and generic\nlossless PCAC method that achieves high compression efficiency and strong\ngeneralizability simultaneously. We formulate lossless PCAC as the task of\ninferring explicit distributions of attributes from group-wise autoregressive\npriors. A progressive random grouping strategy is first devised to efficiently\nresolve the point cloud into groups, and then the attributes of each group are\nmodeled sequentially from accumulated antecedents. A locality-aware attention\nmechanism is utilized to exploit prior knowledge from context windows in\nparallel. Since our method directly operates on points, it can naturally avoid\ndistortion caused by voxelization, and can be executed on point clouds with\narbitrary scale and density. Experiments show that our method can be instantly\ndeployed once trained on a Synthetic 2k-ShapeNet dataset while enjoying\ncontinuous bit-rate reduction over the latest G-PCCv23 on various datasets\n(ShapeNet, ScanNet, MVUB, 8iVFB). Meanwhile, our method reports shorter coding\ntime than G-PCCv23 on the majority of sequences with a lightweight model size\n(2.6MB), which is highly attractive for practical applications. Dataset, code\nand trained model are available at\nhttps://github.com/I2-Multimedia-Lab/PoLoPCAC.\n","authors":["Kang You","Pan Gao","Zhan Ma"],"pdf_url":"https://arxiv.org/pdf/2404.06936v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06918v1","updated":"2024-04-10T11:10:50Z","published":"2024-04-10T11:10:50Z","title":"HRVDA: High-Resolution Visual Document Assistant","summary":" Leveraging vast training data, multimodal large language models (MLLMs) have\ndemonstrated formidable general visual comprehension capabilities and achieved\nremarkable performance across various tasks. However, their performance in\nvisual document understanding still leaves much room for improvement. This\ndiscrepancy is primarily attributed to the fact that visual document\nunderstanding is a fine-grained prediction task. In natural scenes, MLLMs\ntypically use low-resolution images, leading to a substantial loss of visual\ninformation. Furthermore, general-purpose MLLMs do not excel in handling\ndocument-oriented instructions. In this paper, we propose a High-Resolution\nVisual Document Assistant (HRVDA), which bridges the gap between MLLMs and\nvisual document understanding. 
This model employs a content filtering mechanism\nand an instruction filtering module to separately filter out the\ncontent-agnostic visual tokens and instruction-agnostic visual tokens, thereby\nachieving efficient model training and inference for high-resolution images. In\naddition, we construct a document-oriented visual instruction tuning dataset\nand apply a multi-stage training strategy to enhance the model's document\nmodeling capabilities. Extensive experiments demonstrate that our model\nachieves state-of-the-art performance across multiple document understanding\ndatasets, while maintaining training efficiency and inference speed comparable\nto low-resolution models.\n","authors":["Chaohu Liu","Kun Yin","Haoyu Cao","Xinghua Jiang","Xin Li","Yinsong Liu","Deqiang Jiang","Xing Sun","Linli Xu"],"pdf_url":"https://arxiv.org/pdf/2404.06918v1.pdf","comment":"Accepted to CVPR 2024 main conference"},{"id":"http://arxiv.org/abs/2404.06913v1","updated":"2024-04-10T11:06:29Z","published":"2024-04-10T11:06:29Z","title":"Sparse Global Matching for Video Frame Interpolation with Large Motion","summary":" Large motion poses a critical challenge in Video Frame Interpolation (VFI)\ntask. Existing methods are often constrained by limited receptive fields,\nresulting in sub-optimal performance when handling scenarios with large motion.\nIn this paper, we introduce a new pipeline for VFI, which can effectively\nintegrate global-level information to alleviate issues associated with large\nmotion. Specifically, we first estimate a pair of initial intermediate flows\nusing a high-resolution feature map for extracting local details. Then, we\nincorporate a sparse global matching branch to compensate for flow estimation,\nwhich consists of identifying flaws in initial flows and generating sparse flow\ncompensation with a global receptive field. Finally, we adaptively merge the\ninitial flow estimation with global flow compensation, yielding a more accurate\nintermediate flow. To evaluate the effectiveness of our method in handling\nlarge motion, we carefully curate a more challenging subset from commonly used\nbenchmarks. Our method demonstrates the state-of-the-art performance on these\nVFI subsets with large motion.\n","authors":["Chunxu Liu","Guozhen Zhang","Rui Zhao","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06913v1.pdf","comment":"Accepted by CVPR 2024. Project page: https://sgm-vfi.github.io/"},{"id":"http://arxiv.org/abs/2306.00977v4","updated":"2024-04-10T10:56:00Z","published":"2023-06-01T17:59:10Z","title":"AGILE3D: Attention Guided Interactive Multi-object 3D Segmentation","summary":" During interactive segmentation, a model and a user work together to\ndelineate objects of interest in a 3D point cloud. In an iterative process, the\nmodel assigns each data point to an object (or the background), while the user\ncorrects errors in the resulting segmentation and feeds them back into the\nmodel. The current best practice formulates the problem as binary\nclassification and segments objects one at a time. The model expects the user\nto provide positive clicks to indicate regions wrongly assigned to the\nbackground and negative clicks on regions wrongly assigned to the object.\nSequentially visiting objects is wasteful since it disregards synergies between\nobjects: a positive click for a given object can, by definition, serve as a\nnegative click for nearby objects. Moreover, a direct competition between\nadjacent objects can speed up the identification of their common boundary. 
We\nintroduce AGILE3D, an efficient, attention-based model that (1) supports\nsimultaneous segmentation of multiple 3D objects, (2) yields more accurate\nsegmentation masks with fewer user clicks, and (3) offers faster inference. Our\ncore idea is to encode user clicks as spatial-temporal queries and enable\nexplicit interactions between click queries as well as between them and the 3D\nscene through a click attention module. Every time new clicks are added, we\nonly need to run a lightweight decoder that produces updated segmentation\nmasks. In experiments with four different 3D point cloud datasets, AGILE3D sets\na new state-of-the-art. Moreover, we also verify its practicality in real-world\nsetups with real user studies.\n","authors":["Yuanwen Yue","Sabarinath Mahadevan","Jonas Schult","Francis Engelmann","Bastian Leibe","Konrad Schindler","Theodora Kontogianni"],"pdf_url":"https://arxiv.org/pdf/2306.00977v4.pdf","comment":"ICLR 2024 camera-ready. Project page: https://ywyue.github.io/AGILE3D"},{"id":"http://arxiv.org/abs/2404.06903v1","updated":"2024-04-10T10:46:59Z","published":"2024-04-10T10:46:59Z","title":"DreamScene360: Unconstrained Text-to-3D Scene Generation with Panoramic\n Gaussian Splatting","summary":" The increasing demand for virtual reality applications has highlighted the\nsignificance of crafting immersive 3D assets. We present a text-to-3D\n360$^{\\circ}$ scene generation pipeline that facilitates the creation of\ncomprehensive 360$^{\\circ}$ scenes for in-the-wild environments in a matter of\nminutes. Our approach utilizes the generative power of a 2D diffusion model and\nprompt self-refinement to create a high-quality and globally coherent panoramic\nimage. This image acts as a preliminary \"flat\" (2D) scene representation.\nSubsequently, it is lifted into 3D Gaussians, employing splatting techniques to\nenable real-time exploration. To produce consistent 3D geometry, our pipeline\nconstructs a spatially coherent structure by aligning the 2D monocular depth\ninto a globally optimized point cloud. This point cloud serves as the initial\nstate for the centroids of 3D Gaussians. In order to address invisible issues\ninherent in single-view inputs, we impose semantic and geometric constraints on\nboth synthesized and input camera views as regularizations. These guide the\noptimization of Gaussians, aiding in the reconstruction of unseen regions. In\nsummary, our method offers a globally consistent 3D scene within a\n360$^{\\circ}$ perspective, providing an enhanced immersive experience over\nexisting techniques. 
Project website at: http://dreamscene360.github.io/\n","authors":["Shijie Zhou","Zhiwen Fan","Dejia Xu","Haoran Chang","Pradyumna Chari","Tejas Bharadwaj","Suya You","Zhangyang Wang","Achuta Kadambi"],"pdf_url":"https://arxiv.org/pdf/2404.06903v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12796v2","updated":"2024-04-10T10:37:22Z","published":"2023-11-21T18:59:58Z","title":"Physics-guided Shape-from-Template: Monocular Video Perception through\n Neural Surrogate Models","summary":" 3D reconstruction of dynamic scenes is a long-standing problem in computer\ngraphics and increasingly difficult the less information is available.\nShape-from-Template (SfT) methods aim to reconstruct a template-based geometry\nfrom RGB images or video sequences, often leveraging just a single monocular\ncamera without depth information, such as regular smartphone recordings.\nUnfortunately, existing reconstruction methods are either unphysical and noisy\nor slow in optimization. To solve this problem, we propose a novel SfT\nreconstruction algorithm for cloth using a pre-trained neural surrogate model\nthat is fast to evaluate, stable, and produces smooth reconstructions due to a\nregularizing physics simulation. Differentiable rendering of the simulated mesh\nenables pixel-wise comparisons between the reconstruction and a target video\nsequence that can be used for a gradient-based optimization procedure to\nextract not only shape information but also physical parameters such as\nstretching, shearing, or bending stiffness of the cloth. This allows to retain\na precise, stable, and smooth reconstructed geometry while reducing the runtime\nby a factor of 400-500 compared to $\\phi$-SfT, a state-of-the-art physics-based\nSfT approach.\n","authors":["David Stotko","Nils Wandel","Reinhard Klein"],"pdf_url":"https://arxiv.org/pdf/2311.12796v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06894v1","updated":"2024-04-10T10:36:15Z","published":"2024-04-10T10:36:15Z","title":"O-TALC: Steps Towards Combating Oversegmentation within Online Action\n Segmentation","summary":" Online temporal action segmentation shows a strong potential to facilitate\nmany HRI tasks where extended human action sequences must be tracked and\nunderstood in real time. Traditional action segmentation approaches, however,\noperate in an offline two stage approach, relying on computationally expensive\nvideo wide features for segmentation, rendering them unsuitable for online HRI\napplications. In order to facilitate online action segmentation on a stream of\nincoming video data, we introduce two methods for improved training and\ninference of backbone action recognition models, allowing them to be deployed\ndirectly for online frame level classification. Firstly, we introduce surround\ndense sampling whilst training to facilitate training vs. inference clip\nmatching and improve segment boundary predictions. Secondly, we introduce an\nOnline Temporally Aware Label Cleaning (O-TALC) strategy to explicitly reduce\noversegmentation during online inference. As our methods are backbone\ninvariant, they can be deployed with computationally efficient spatio-temporal\naction recognition models capable of operating in real time with a small\nsegmentation latency. We show our method outperforms similar online action\nsegmentation work as well as matches the performance of many offline models\nwith access to full temporal resolution when operating on challenging\nfine-grained datasets.\n","authors":["Matthew Kent Myers","Nick Wright","A. 
Stephen McGough","Nicholas Martin"],"pdf_url":"https://arxiv.org/pdf/2404.06894v1.pdf","comment":"5 pages, 3 figures. Accepted as a short (unindexed) paper at the\n TAHRI conference"},{"id":"http://arxiv.org/abs/2404.06892v1","updated":"2024-04-10T10:34:34Z","published":"2024-04-10T10:34:34Z","title":"SparseAD: Sparse Query-Centric Paradigm for Efficient End-to-End\n Autonomous Driving","summary":" End-to-End paradigms use a unified framework to implement multi-tasks in an\nautonomous driving system. Despite simplicity and clarity, the performance of\nend-to-end autonomous driving methods on sub-tasks is still far behind the\nsingle-task methods. Meanwhile, the widely used dense BEV features in previous\nend-to-end methods make it costly to extend to more modalities or tasks. In\nthis paper, we propose a Sparse query-centric paradigm for end-to-end\nAutonomous Driving (SparseAD), where the sparse queries completely represent\nthe whole driving scenario across space, time and tasks without any dense BEV\nrepresentation. Concretely, we design a unified sparse architecture for\nperception tasks including detection, tracking, and online mapping. Moreover,\nwe revisit motion prediction and planning, and devise a more justifiable motion\nplanner framework. On the challenging nuScenes dataset, SparseAD achieves SOTA\nfull-task performance among end-to-end methods and significantly narrows the\nperformance gap between end-to-end paradigms and single-task methods. Codes\nwill be released soon.\n","authors":["Diankun Zhang","Guoan Wang","Runwen Zhu","Jianbo Zhao","Xiwu Chen","Siyu Zhang","Jiahao Gong","Qibin Zhou","Wenyuan Zhang","Ningzi Wang","Feiyang Tan","Hangning Zhou","Ziyao Xu","Haotian Yao","Chi Zhang","Xiaojun Liu","Xiaoguang Di","Bin Li"],"pdf_url":"https://arxiv.org/pdf/2404.06892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06883v1","updated":"2024-04-10T10:13:37Z","published":"2024-04-10T10:13:37Z","title":"Research on Detection of Floating Objects in River and Lake Based on AI\n Intelligent Image Recognition","summary":" With the rapid advancement of artificial intelligence technology, AI-enabled\nimage recognition has emerged as a potent tool for addressing challenges in\ntraditional environmental monitoring. This study focuses on the detection of\nfloating objects in river and lake environments, exploring an innovative\napproach based on deep learning. By intricately analyzing the technical\npathways for detecting static and dynamic features and considering the\ncharacteristics of river and lake debris, a comprehensive image acquisition and\nprocessing workflow has been developed. The study highlights the application\nand performance comparison of three mainstream deep learning models -SSD,\nFaster-RCNN, and YOLOv5- in debris identification. Additionally, a detection\nsystem for floating objects has been designed and implemented, encompassing\nboth hardware platform construction and software framework development. 
Through\nrigorous experimental validation, the proposed system has demonstrated its\nability to significantly enhance the accuracy and efficiency of debris\ndetection, thus offering a new technological avenue for water quality\nmonitoring in rivers and lakes\n","authors":["Jingyu Zhang","Ao Xiang","Yu Cheng","Qin Yang","Liyang Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06883v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07887v2","updated":"2024-04-10T10:06:46Z","published":"2023-10-11T20:48:20Z","title":"Unsupervised Denoising for Signal-Dependent and Row-Correlated Imaging\n Noise","summary":" Accurate analysis of microscopy images is hindered by the presence of noise.\nThis noise is usually signal-dependent and often additionally correlated along\nrows or columns of pixels. Current self- and unsupervised denoisers can address\nsignal-dependent noise, but none can reliably remove noise that is also row- or\ncolumn-correlated. Here, we present the first fully unsupervised deep\nlearning-based denoiser capable of handling imaging noise that is\nrow-correlated as well as signal-dependent. Our approach uses a Variational\nAutoencoder (VAE) with a specially designed autoregressive decoder. This\ndecoder is capable of modeling row-correlated and signal-dependent noise but is\nincapable of independently modeling underlying clean signal. The VAE therefore\nproduces latent variables containing only clean signal information, and these\nare mapped back into image space using a proposed second decoder network. Our\nmethod does not require a pre-trained noise model and can be trained from\nscratch using unpaired noisy data. We show that our approach achieves\ncompetitive results when applied to a range of different sensor types and\nimaging modalities.\n","authors":["Benjamin Salmon","Alexander Krull"],"pdf_url":"https://arxiv.org/pdf/2310.07887v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03190v5","updated":"2024-04-10T09:51:11Z","published":"2024-03-05T18:29:17Z","title":"Triple-CFN: Restructuring Conceptual Spaces for Enhancing Abstract\n Reasoning process","summary":" Abstract reasoning problems pose significant challenges to artificial\nintelligence algorithms, demanding cognitive capabilities beyond those required\nfor perception tasks. This study introduces the Triple-CFN approach to tackle\nthe Bongard-Logo problem, achieving notable reasoning accuracy by implicitly\nreorganizing the concept space of conflicting instances. Additionally, the\nTriple-CFN paradigm proves effective for the RPM problem with necessary\nmodifications, yielding competitive results. To further enhance performance on\nthe RPM issue, we develop the Meta Triple-CFN network, which explicitly\nstructures the problem space while maintaining interpretability on progressive\npatterns. The success of Meta Triple-CFN is attributed to its paradigm of\nmodeling the conceptual space, equivalent to normalizing reasoning information.\nBased on this ideology, we introduce the Re-space layer, enhancing the\nperformance of both Meta Triple-CFN and Triple-CFN. 
This paper aims to\ncontribute to advancements in machine intelligence by exploring innovative\nnetwork designs for addressing abstract reasoning problems, paving the way for\nfurther breakthroughs in this domain.\n","authors":["Ruizhuo Song","Beiming Yuan"],"pdf_url":"https://arxiv.org/pdf/2403.03190v5.pdf","comment":"14 pages, 14 figures, 5 tables"},{"id":"http://arxiv.org/abs/2404.06865v1","updated":"2024-04-10T09:45:02Z","published":"2024-04-10T09:45:02Z","title":"Fine color guidance in diffusion models and its application to image\n compression at extremely low bitrates","summary":" This study addresses the challenge of, without training or fine-tuning,\ncontrolling the global color aspect of images generated with a diffusion model.\nWe rewrite the guidance equations to ensure that the outputs are closer to a\nknown color map, and this without hindering the quality of the generation. Our\nmethod leads to new guidance equations. We show in the color guidance context\nthat, the scaling of the guidance should not decrease but remains high\nthroughout the diffusion process. In a second contribution, our guidance is\napplied in a compression framework, we combine both semantic and general color\ninformation on the image to decode the images at low cost. We show that our\nmethod is effective at improving fidelity and realism of compressed images at\nextremely low bit rates, when compared to other classical or more semantic\noriented approaches.\n","authors":["Tom Bordin","Thomas Maugey"],"pdf_url":"https://arxiv.org/pdf/2404.06865v1.pdf","comment":"Submitted to IEEE Transactions on Image Processing (TIP)"},{"id":"http://arxiv.org/abs/2404.06863v1","updated":"2024-04-10T09:40:56Z","published":"2024-04-10T09:40:56Z","title":"RESSCAL3D: Resolution Scalable 3D Semantic Segmentation of Point Clouds","summary":" While deep learning-based methods have demonstrated outstanding results in\nnumerous domains, some important functionalities are missing. Resolution\nscalability is one of them. In this work, we introduce a novel architecture,\ndubbed RESSCAL3D, providing resolution-scalable 3D semantic segmentation of\npoint clouds. In contrast to existing works, the proposed method does not\nrequire the whole point cloud to be available to start inference. Once a\nlow-resolution version of the input point cloud is available, first semantic\npredictions can be generated in an extremely fast manner. This enables early\ndecision-making in subsequent processing steps. As additional points become\navailable, these are processed in parallel. To improve performance, features\nfrom previously computed scales are employed as prior knowledge at the current\nscale. Our experiments show that RESSCAL3D is 31-62% faster than the\nnon-scalable baseline while keeping a limited impact on performance. 
To the\nbest of our knowledge, the proposed method is the first to propose a\nresolution-scalable approach for 3D semantic segmentation of point clouds based\non deep learning.\n","authors":["Remco Royen","Adrian Munteanu"],"pdf_url":"https://arxiv.org/pdf/2404.06863v1.pdf","comment":"Published at 2023 IEEE International Conference on Image Processing\n (ICIP)"},{"id":"http://arxiv.org/abs/2404.06860v1","updated":"2024-04-10T09:35:50Z","published":"2024-04-10T09:35:50Z","title":"Monocular 3D lane detection for Autonomous Driving: Recent Achievements,\n Challenges, and Outlooks","summary":" 3D lane detection plays a crucial role in autonomous driving by extracting\nstructural and traffic information from the road in 3D space to assist the\nself-driving car in rational, safe, and comfortable path planning and motion\ncontrol. Due to the consideration of sensor costs and the advantages of visual\ndata in color information, in practical applications, 3D lane detection based\non monocular vision is one of the important research directions in the field of\nautonomous driving, which has attracted more and more attention in both\nindustry and academia. Unfortunately, recent progress in visual perception\nseems insufficient to develop completely reliable 3D lane detection algorithms,\nwhich also hinders the development of vision-based fully autonomous\nself-driving cars, i.e., achieving level 5 autonomous driving, driving like\nhuman-controlled cars. This is one of the conclusions drawn from this review\npaper: there is still a lot of room for improvement and significant\nimprovements are still needed in the 3D lane detection algorithm for autonomous\ndriving cars using visual sensors. Motivated by this, this review defines,\nanalyzes, and reviews the current achievements in the field of 3D lane\ndetection research, and the vast majority of the current progress relies\nheavily on computationally complex deep learning models. In addition, this\nreview covers the 3D lane detection pipeline, investigates the performance of\nstate-of-the-art algorithms, analyzes the time complexity of cutting-edge\nmodeling choices, and highlights the main achievements and limitations of\ncurrent research efforts. The survey also includes a comprehensive discussion\nof available 3D lane detection datasets and the challenges that researchers\nhave faced but have not yet resolved. Finally, our work outlines future\nresearch directions and welcomes researchers and practitioners to enter this\nexciting field.\n","authors":["Fulong Ma","Weiqing Qi","Guoyang Zhao","Linwei Zheng","Sheng Wang","Ming Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06860v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06859v1","updated":"2024-04-10T09:35:36Z","published":"2024-04-10T09:35:36Z","title":"Multi-Label Continual Learning for the Medical Domain: A Novel Benchmark","summary":" Multi-label image classification in dynamic environments is a problem that\nposes significant challenges. Previous studies have primarily focused on\nscenarios such as Domain Incremental Learning and Class Incremental Learning,\nwhich do not fully capture the complexity of real-world applications. In this\npaper, we study the problem of classification of medical imaging in the\nscenario termed New Instances \\& New Classes, which combines the challenges of\nboth new class arrivals and domain shifts in a single framework. 
Unlike\ntraditional scenarios, it reflects the realistic nature of CL in domains such\nas medical imaging, where updates may introduce both new classes and changes in\ndomain characteristics. To address the unique challenges posed by this complex\nscenario, we introduce a novel approach called Pseudo-Label Replay. This method\naims to mitigate forgetting while adapting to new classes and domain shifts by\ncombining the advantages of the Replay and Pseudo-Label methods and solving\ntheir limitations in the proposed scenario. We evaluate our proposed\napproach on a challenging benchmark consisting of two datasets, seven tasks,\nand nineteen classes, modeling a realistic Continual Learning scenario. Our\nexperimental findings demonstrate the effectiveness of Pseudo-Label Replay in\naddressing the challenges posed by the complex scenario proposed. Our method\nsurpasses existing approaches, exhibiting superior performance while showing\nminimal forgetting.\n","authors":["Marina Ceccon","Davide Dalle Pezze","Alessandro Fabris","Gian Antonio Susto"],"pdf_url":"https://arxiv.org/pdf/2404.06859v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.10035v3","updated":"2024-04-10T09:34:03Z","published":"2023-02-20T15:34:03Z","title":"Large-scale Multi-Modal Pre-trained Models: A Comprehensive Survey","summary":" With the urgent demand for generalized deep models, many pre-trained big\nmodels are proposed, such as BERT, ViT, GPT, etc. Inspired by the success of\nthese models in single domains (like computer vision and natural language\nprocessing), the multi-modal pre-trained big models have also drawn more and\nmore attention in recent years. In this work, we give a comprehensive survey of\nthese models and hope this paper could provide new insights and help fresh\nresearchers track the most cutting-edge works. Specifically, we first\nintroduce the background of multi-modal pre-training by reviewing the\nconventional deep learning, pre-training works in natural language processing,\ncomputer vision, and speech. Then, we introduce the task definition, key\nchallenges, and advantages of multi-modal pre-training models (MM-PTMs), and\ndiscuss the MM-PTMs with a focus on data, objectives, network architectures,\nand knowledge-enhanced pre-training. After that, we introduce the downstream\ntasks used for the validation of large-scale MM-PTMs, including generative,\nclassification, and regression tasks. We also give visualization and analysis\nof the model parameters and results on representative downstream tasks.\nFinally, we point out possible research directions for this topic that may\nbenefit future works. In addition, we maintain a continuously updated paper\nlist for large-scale pre-trained multi-modal big models:\nhttps://github.com/wangxiao5791509/MultiModal_BigModels_Survey. This paper has\nbeen published by the journal Machine Intelligence Research (MIR),\nhttps://link.springer.com/article/10.1007/s11633-022-1410-8, DOI:\n10.1007/s11633-022-1410-8, vol. 20, no. 4, pp. 
447-482, 2023.\n","authors":["Xiao Wang","Guangyao Chen","Guangwu Qian","Pengcheng Gao","Xiao-Yong Wei","Yaowei Wang","Yonghong Tian","Wen Gao"],"pdf_url":"https://arxiv.org/pdf/2302.10035v3.pdf","comment":"Accepted by Machine Intelligence Research (MIR)"},{"id":"http://arxiv.org/abs/2404.06851v1","updated":"2024-04-10T09:24:54Z","published":"2024-04-10T09:24:54Z","title":"UDiFF: Generating Conditional Unsigned Distance Fields with Optimal\n Wavelet Diffusion","summary":" Diffusion models have shown remarkable results for image generation, editing\nand inpainting. Recent works explore diffusion models for 3D shape generation\nwith neural implicit functions, i.e., signed distance function and occupancy\nfunction. However, they are limited to shapes with closed surfaces, which\nprevents them from generating diverse 3D real-world contents containing open\nsurfaces. In this work, we present UDiFF, a 3D diffusion model for unsigned\ndistance fields (UDFs) which is capable of generating textured 3D shapes with\nopen surfaces from text conditions or unconditionally. Our key idea is to\ngenerate UDFs in the spatial-frequency domain with an optimal wavelet\ntransformation, which produces a compact representation space for UDF\ngeneration. Specifically, instead of selecting an appropriate wavelet\ntransformation which requires expensive manual efforts and still leads to large\ninformation loss, we propose a data-driven approach to learn the optimal\nwavelet transformation for UDFs. We evaluate UDiFF to show our advantages by\nnumerical and visual comparisons with the latest methods on widely used\nbenchmarks. Page: https://weiqi-zhang.github.io/UDiFF.\n","authors":["Junsheng Zhou","Weiqi Zhang","Baorui Ma","Kanle Shi","Yu-Shen Liu","Zhizhong Han"],"pdf_url":"https://arxiv.org/pdf/2404.06851v1.pdf","comment":"To appear at CVPR2024. Project page:\n https://weiqi-zhang.github.io/UDiFF"},{"id":"http://arxiv.org/abs/2404.06842v1","updated":"2024-04-10T09:14:28Z","published":"2024-04-10T09:14:28Z","title":"MoCha-Stereo: Motif Channel Attention Network for Stereo Matching","summary":" Learning-based stereo matching techniques have made significant progress.\nHowever, existing methods inevitably lose geometrical structure information\nduring the feature channel generation process, resulting in edge detail\nmismatches. In this paper, the Motif Channel Attention Stereo Matching Network\n(MoCha-Stereo) is designed to address this problem. We provide the Motif\nChannel Correlation Volume (MCCV) to determine more accurate edge matching\ncosts. MCCV is achieved by projecting motif channels, which capture common\ngeometric structures in feature channels, onto feature maps and cost volumes.\nIn addition, since edge variations in the feature channels of the\nreconstruction error map also affect detail matching, we propose the\nReconstruction Error Motif Penalty (REMP) module to further refine the\nfull-resolution disparity estimation. REMP integrates the frequency information\nof typical channel features from the reconstruction error. MoCha-Stereo ranks\n1st on the KITTI-2015 and KITTI-2012 Reflective leaderboards. Our structure\nalso shows excellent performance in Multi-View Stereo. 
Code is avaliable at\nhttps://github.com/ZYangChen/MoCha-Stereo.\n","authors":["Ziyang Chen","Wei Long","He Yao","Yongjun Zhang","Bingshu Wang","Yongbin Qin","Jia Wu"],"pdf_url":"https://arxiv.org/pdf/2404.06842v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2402.02263v2","updated":"2024-04-10T09:00:44Z","published":"2024-02-03T21:12:36Z","title":"MixedNUTS: Training-Free Accuracy-Robustness Balance via Nonlinearly\n Mixed Classifiers","summary":" Adversarial robustness often comes at the cost of degraded accuracy, impeding\nthe real-life application of robust classification models. Training-based\nsolutions for better trade-offs are limited by incompatibilities with\nalready-trained high-performance large models, necessitating the exploration of\ntraining-free ensemble approaches. Observing that robust models are more\nconfident in correct predictions than in incorrect ones on clean and\nadversarial data alike, we speculate amplifying this \"benign confidence\nproperty\" can reconcile accuracy and robustness in an ensemble setting. To\nachieve so, we propose \"MixedNUTS\", a training-free method where the output\nlogits of a robust classifier and a standard non-robust classifier are\nprocessed by nonlinear transformations with only three parameters, which are\noptimized through an efficient algorithm. MixedNUTS then converts the\ntransformed logits into probabilities and mixes them as the overall output. On\nCIFAR-10, CIFAR-100, and ImageNet datasets, experimental results with custom\nstrong adaptive attacks demonstrate MixedNUTS's vastly improved accuracy and\nnear-SOTA robustness -- it boosts CIFAR-100 clean accuracy by 7.86 points,\nsacrificing merely 0.87 points in robust accuracy.\n","authors":["Yatong Bai","Mo Zhou","Vishal M. Patel","Somayeh Sojoudi"],"pdf_url":"https://arxiv.org/pdf/2402.02263v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06836v1","updated":"2024-04-10T08:54:43Z","published":"2024-04-10T08:54:43Z","title":"O2V-Mapping: Online Open-Vocabulary Mapping with Neural Implicit\n Representation","summary":" Online construction of open-ended language scenes is crucial for robotic\napplications, where open-vocabulary interactive scene understanding is\nrequired. Recently, neural implicit representation has provided a promising\ndirection for online interactive mapping. However, implementing open-vocabulary\nscene understanding capability into online neural implicit mapping still faces\nthree challenges: lack of local scene updating ability, blurry spatial\nhierarchical semantic segmentation and difficulty in maintaining multi-view\nconsistency. To this end, we proposed O2V-mapping, which utilizes voxel-based\nlanguage and geometric features to create an open-vocabulary field, thus\nallowing for local updates during online training process. Additionally, we\nleverage a foundational model for image segmentation to extract language\nfeatures on object-level entities, achieving clear segmentation boundaries and\nhierarchical semantic features. For the purpose of preserving consistency in 3D\nobject properties across different viewpoints, we propose a spatial adaptive\nvoxel adjustment mechanism and a multi-view weight selection method. 
Extensive\nexperiments on open-vocabulary object localization and semantic segmentation\ndemonstrate that O2V-mapping achieves online construction of language scenes\nwhile enhancing accuracy, outperforming the previous SOTA method.\n","authors":["Muer Tie","Julong Wei","Zhengjun Wang","Ke Wu","Shansuai Yuan","Kaizhao Zhang","Jie Jia","Jieru Zhao","Zhongxue Gan","Wenchao Ding"],"pdf_url":"https://arxiv.org/pdf/2404.06836v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06835v1","updated":"2024-04-10T08:54:00Z","published":"2024-04-10T08:54:00Z","title":"Tuning-Free Adaptive Style Incorporation for Structure-Consistent\n Text-Driven Style Transfer","summary":" In this work, we target the task of text-driven style transfer in the context\nof text-to-image (T2I) diffusion models. The main challenge is consistent\nstructure preservation while enabling effective style transfer effects. The\npast approaches in this field directly concatenate the content and style\nprompts for a prompt-level style injection, leading to unavoidable structure\ndistortions. In this work, we propose a novel solution to the text-driven style\ntransfer task, namely, Adaptive Style Incorporation~(ASI), to achieve\nfine-grained feature-level style incorporation. It consists of the Siamese\nCross-Attention~(SiCA) to decouple the single-track cross-attention to a\ndual-track structure to obtain separate content and style features, and the\nAdaptive Content-Style Blending (AdaBlending) module to couple the content and\nstyle information from a structure-consistent manner. Experimentally, our\nmethod exhibits much better performance in both structure preservation and\nstylized effects.\n","authors":["Yanqi Ge","Jiaqi Liu","Qingnan Fan","Xi Jiang","Ye Huang","Shuai Qin","Hong Gu","Wen Li","Lixin Duan"],"pdf_url":"https://arxiv.org/pdf/2404.06835v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06832v1","updated":"2024-04-10T08:48:09Z","published":"2024-04-10T08:48:09Z","title":"SplatPose & Detect: Pose-Agnostic 3D Anomaly Detection","summary":" Detecting anomalies in images has become a well-explored problem in both\nacademia and industry. State-of-the-art algorithms are able to detect defects\nin increasingly difficult settings and data modalities. However, most current\nmethods are not suited to address 3D objects captured from differing poses.\nWhile solutions using Neural Radiance Fields (NeRFs) have been proposed, they\nsuffer from excessive computation requirements, which hinder real-world\nusability. For this reason, we propose the novel 3D Gaussian splatting-based\nframework SplatPose which, given multi-view images of a 3D object, accurately\nestimates the pose of unseen views in a differentiable manner, and detects\nanomalies in them. We achieve state-of-the-art results in both training and\ninference speed, and detection performance, even when using less training data\nthan competing methods. 
We thoroughly evaluate our framework using the recently\nproposed Pose-agnostic Anomaly Detection benchmark and its multi-pose anomaly\ndetection (MAD) data set.\n","authors":["Mathis Kruse","Marco Rudolph","Dominik Woiwode","Bodo Rosenhahn"],"pdf_url":"https://arxiv.org/pdf/2404.06832v1.pdf","comment":"Visual Anomaly and Novelty Detection 2.0 Workshop at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.02668v2","updated":"2024-04-10T08:47:32Z","published":"2024-04-03T12:06:01Z","title":"RS-Mamba for Large Remote Sensing Image Dense Prediction","summary":" Context modeling is critical for remote sensing image dense prediction tasks.\nNowadays, the growing size of very-high-resolution (VHR) remote sensing images\nposes challenges in effectively modeling context. While transformer-based\nmodels possess global modeling capabilities, they encounter computational\nchallenges when applied to large VHR images due to their quadratic complexity.\nThe conventional practice of cropping large images into smaller patches results\nin a notable loss of contextual information. To address these issues, we\npropose the Remote Sensing Mamba (RSM) for dense prediction tasks in large VHR\nremote sensing images. RSM is specifically designed to capture the global\ncontext of remote sensing images with linear complexity, facilitating the\neffective processing of large VHR images. Considering that the land covers in\nremote sensing images are distributed in arbitrary spatial directions due to\ncharacteristics of remote sensing over-head imaging, the RSM incorporates an\nomnidirectional selective scan module to globally model the context of images\nin multiple directions, capturing large spatial features from various\ndirections. Extensive experiments on semantic segmentation and change detection\ntasks across various land covers demonstrate the effectiveness of the proposed\nRSM. We designed simple yet effective models based on RSM, achieving\nstate-of-the-art performance on dense prediction tasks in VHR remote sensing\nimages without fancy training strategies. Leveraging the linear complexity and\nglobal modeling capabilities, RSM achieves better efficiency and accuracy than\ntransformer-based models on large remote sensing images. Interestingly, we also\ndemonstrated that our model generally performs better with a larger image size\non dense prediction tasks. Our code is available at\nhttps://github.com/walking-shadow/Official_Remote_Sensing_Mamba.\n","authors":["Sijie Zhao","Hao Chen","Xueliang Zhang","Pengfeng Xiao","Lei Bai","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2404.02668v2.pdf","comment":"15 pages,8 figures"},{"id":"http://arxiv.org/abs/2312.03502v2","updated":"2024-04-10T08:29:23Z","published":"2023-12-06T13:59:22Z","title":"Improving the Generalization of Segmentation Foundation Model under\n Distribution Shift via Weakly Supervised Adaptation","summary":" The success of large language models has inspired the computer vision\ncommunity to explore image segmentation foundation model that is able to\nzero/few-shot generalize through prompt engineering. Segment-Anything(SAM),\namong others, is the state-of-the-art image segmentation foundation model\ndemonstrating strong zero/few-shot generalization. Despite the success, recent\nstudies reveal the weakness of SAM under strong distribution shift. In\nparticular, SAM performs awkwardly on corrupted natural images, camouflaged\nimages, medical images, etc. 
Motivated by the observations, we aim to develop a\nself-training based strategy to adapt SAM to target distribution. Given the\nunique challenges of large source dataset, high computation cost and incorrect\npseudo label, we propose a weakly supervised self-training architecture with\nanchor regularization and low-rank finetuning to improve the robustness and\ncomputation efficiency of adaptation. We validate the effectiveness on 5 types\nof downstream segmentation tasks including natural clean/corrupted images,\nmedical images, camouflaged images and robotic images. Our proposed method is\ntask-agnostic in nature and outperforms pre-trained SAM and state-of-the-art\ndomain adaptation methods on almost all downstream tasks with the same testing\nprompt inputs.\n","authors":["Haojie Zhang","Yongyi Su","Xun Xu","Kui Jia"],"pdf_url":"https://arxiv.org/pdf/2312.03502v2.pdf","comment":"20 pages, 12 figures"},{"id":"http://arxiv.org/abs/2308.10610v4","updated":"2024-04-10T08:16:18Z","published":"2023-08-21T10:20:46Z","title":"Ear-Keeper: Real-time Diagnosis of Ear Lesions Utilizing\n Ultralight-Ultrafast ConvNet and Large-scale Ear Endoscopic Dataset","summary":" Deep learning-based ear disease diagnosis technology has proven effective and\naffordable. However, due to the lack of ear endoscope datasets with diversity,\nthe practical potential of the deep learning model has not been thoroughly\nstudied. Moreover, existing research failed to achieve a good trade-off between\nmodel inference speed and parameter size, rendering models inapplicable in\nreal-world settings. To address these challenges, we constructed the first\nlarge-scale ear endoscopic dataset comprising eight types of ear diseases and\ndisease-free samples from two institutions. Inspired by ShuffleNetV2, we\nproposed Best-EarNet, an ultrafast and ultralight network enabling real-time\near disease diagnosis. Best-EarNet incorporates a novel Local-Global Spatial\nFeature Fusion Module and multi-scale supervision strategy, which facilitates\nthe model focusing on global-local information within feature maps at various\nlevels. Utilizing transfer learning, the accuracy of Best-EarNet with only\n0.77M parameters achieves 95.23% (internal 22,581 images) and 92.14% (external\n1,652 images), respectively. In particular, it achieves an average frame per\nsecond of 80 on the CPU. From the perspective of model practicality, the\nproposed Best-EarNet is superior to state-of-the-art backbone models in ear\nlesion detection tasks. Most importantly, Ear-keeper, an intelligent diagnosis\nsystem based Best-EarNet, was developed successfully and deployed on common\nelectronic devices (smartphone, tablet computer and personal computer). In the\nfuture, Ear-Keeper has the potential to assist the public and healthcare\nproviders in performing comprehensive scanning and diagnosis of the ear canal\nin real-time video, thereby promptly detecting ear lesions.\n","authors":["Yubiao Yue","Xinyu Zeng","Xiaoqiang Shi","Meiping Zhang","Fan Zhang","Yunxin Liang","Yan Liu","Zhenzhang Li","Yang Li"],"pdf_url":"https://arxiv.org/pdf/2308.10610v4.pdf","comment":"18 pages,8 figures"},{"id":"http://arxiv.org/abs/2404.06814v1","updated":"2024-04-10T08:02:17Z","published":"2024-04-10T08:02:17Z","title":"Zero-shot Point Cloud Completion Via 2D Priors","summary":" 3D point cloud completion is designed to recover complete shapes from\npartially observed point clouds. 
Conventional completion methods typically\ndepend on extensive point cloud data for training, with their effectiveness\noften constrained to object categories similar to those seen during training.\nIn contrast, we propose a zero-shot framework aimed at completing partially\nobserved point clouds across any unseen categories. Leveraging point rendering\nvia Gaussian Splatting, we develop techniques of Point Cloud Colorization and\nZero-shot Fractal Completion that utilize 2D priors from pre-trained diffusion\nmodels to infer missing regions. Experimental results on both synthetic and\nreal-world scanned point clouds demonstrate that our approach outperforms\nexisting methods in completing a variety of objects without any requirement for\nspecific training data.\n","authors":["Tianxin Huang","Zhiwen Yan","Yuyang Zhao","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2404.06814v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05916v2","updated":"2024-04-10T07:58:44Z","published":"2024-03-09T13:56:25Z","title":"GPT as Psychologist? Preliminary Evaluations for GPT-4V on Visual\n Affective Computing","summary":" Multimodal large language models (MLLMs) are designed to process and\nintegrate information from multiple sources, such as text, speech, images, and\nvideos. Despite their success in language understanding, it is critical to\nevaluate their performance on downstream tasks for better human-centric\napplications. This paper assesses the application of MLLMs with 5 crucial\nabilities for affective computing, spanning visual affective tasks and\nreasoning tasks. The results show that \\gpt has high accuracy in facial action\nunit recognition and micro-expression detection while its general facial\nexpression recognition performance is not accurate. We also highlight the\nchallenges of achieving fine-grained micro-expression recognition and the\npotential for further study and demonstrate the versatility and potential of\n\\gpt for handling advanced tasks in emotion recognition and related fields by\nintegrating with task-related agents for more complex tasks, such as heart rate\nestimation through signal processing. In conclusion, this paper provides\nvaluable insights into the potential applications and challenges of MLLMs in\nhuman-centric computing. Our interesting examples are at\nhttps://github.com/EnVision-Research/GPT4Affectivity.\n","authors":["Hao Lu","Xuesong Niu","Jiyao Wang","Yin Wang","Qingyong Hu","Jiaqi Tang","Yuting Zhang","Kaishen Yuan","Bin Huang","Zitong Yu","Dengbo He","Shuiguang Deng","Hao Chen","Yingcong Chen","Shiguang Shan"],"pdf_url":"https://arxiv.org/pdf/2403.05916v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08551v3","updated":"2024-04-10T07:58:04Z","published":"2024-03-13T14:02:54Z","title":"GaussianImage: 1000 FPS Image Representation and Compression by 2D\n Gaussian Splatting","summary":" Implicit neural representations (INRs) recently achieved great success in\nimage representation and compression, offering high visual quality and fast\nrendering speeds with 10-1000 FPS, assuming sufficient GPU resources are\navailable. However, this requirement often hinders their use on low-end devices\nwith limited memory. In response, we propose a groundbreaking paradigm of image\nrepresentation and compression by 2D Gaussian Splatting, named GaussianImage.\nWe first introduce 2D Gaussians to represent the image, where each Gaussian has\n8 parameters including position, covariance and color. 
Subsequently, we unveil\na novel rendering algorithm based on accumulated summation. Remarkably, our\nmethod with a minimum of 3$\\times$ lower GPU memory usage and 5$\\times$ faster\nfitting time not only rivals INRs (e.g., WIRE, I-NGP) in representation\nperformance, but also delivers a faster rendering speed of 1500-2000 FPS\nregardless of parameter size. Furthermore, we integrate existing vector\nquantization technique to build an image codec. Experimental results\ndemonstrate that our codec attains rate-distortion performance comparable to\ncompression-based INRs such as COIN and COIN++, while facilitating decoding\nspeeds of approximately 1000 FPS. Additionally, preliminary proof of concept\nshows that our codec surpasses COIN and COIN++ in performance when using\npartial bits-back coding. Code will be available at\nhttps://github.com/Xinjie-Q/GaussianImage.\n","authors":["Xinjie Zhang","Xingtong Ge","Tongda Xu","Dailan He","Yan Wang","Hongwei Qin","Guo Lu","Jing Geng","Jun Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.08551v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07354v4","updated":"2024-04-10T07:54:14Z","published":"2024-02-12T01:03:39Z","title":"Re-DiffiNet: Modeling discrepancies in tumor segmentation using\n diffusion models","summary":" Identification of tumor margins is essential for surgical decision-making for\nglioblastoma patients and provides reliable assistance for neurosurgeons.\nDespite improvements in deep learning architectures for tumor segmentation over\nthe years, creating a fully autonomous system suitable for clinical floors\nremains a formidable challenge because the model predictions have not yet\nreached the desired level of accuracy and generalizability for clinical\napplications. Generative modeling techniques have seen significant improvements\nin recent times. Specifically, Generative Adversarial Networks (GANs) and\nDenoising-diffusion-based models (DDPMs) have been used to generate\nhigher-quality images with fewer artifacts and finer attributes. In this work,\nwe introduce a framework called Re-Diffinet for modeling the discrepancy\nbetween the outputs of a segmentation model like U-Net and the ground truth,\nusing DDPMs. By explicitly modeling the discrepancy, the results show an\naverage improvement of 0.55\\% in the Dice score and 16.28\\% in HD95 from\ncross-validation over 5-folds, compared to the state-of-the-art U-Net\nsegmentation model.\n","authors":["Tianyi Ren","Abhishek Sharma","Juampablo Heras Rivera","Harshitha Rebala","Ethan Honey","Agamdeep Chopra","Jacob Ruzevick","Mehmet Kurt"],"pdf_url":"https://arxiv.org/pdf/2402.07354v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05063v2","updated":"2024-04-10T07:44:40Z","published":"2024-04-07T20:19:04Z","title":"AUEditNet: Dual-Branch Facial Action Unit Intensity Manipulation with\n Implicit Disentanglement","summary":" Facial action unit (AU) intensity plays a pivotal role in quantifying\nfine-grained expression behaviors, which is an effective condition for facial\nexpression manipulation. However, publicly available datasets containing\nintensity annotations for multiple AUs remain severely limited, often featuring\na restricted number of subjects. This limitation places challenges to the AU\nintensity manipulation in images due to disentanglement issues, leading\nresearchers to resort to other large datasets with pretrained AU intensity\nestimators for pseudo labels. 
In addressing this constraint and fully\nleveraging manual annotations of AU intensities for precise manipulation, we\nintroduce AUEditNet. Our proposed model achieves impressive intensity\nmanipulation across 12 AUs, trained effectively with only 18 subjects.\nUtilizing a dual-branch architecture, our approach achieves comprehensive\ndisentanglement of facial attributes and identity without necessitating\nadditional loss functions or implementing with large batch sizes. This approach\noffers a potential solution to achieve desired facial attribute editing despite\nthe dataset's limited subject count. Our experiments demonstrate AUEditNet's\nsuperior accuracy in editing AU intensities, affirming its capability in\ndisentangling facial attributes and identity within a limited subject pool.\nAUEditNet allows conditioning by either intensity values or target images,\neliminating the need for constructing AU combinations for specific facial\nexpression synthesis. Moreover, AU intensity estimation, as a downstream task,\nvalidates the consistency between real and edited images, confirming the\neffectiveness of our proposed AU intensity manipulation method.\n","authors":["Shiwei Jin","Zhen Wang","Lei Wang","Peng Liu","Ning Bi","Truong Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.05063v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06798v1","updated":"2024-04-10T07:41:35Z","published":"2024-04-10T07:41:35Z","title":"MedRG: Medical Report Grounding with Multi-modal Large Language Model","summary":" Medical Report Grounding is pivotal in identifying the most relevant regions\nin medical images based on a given phrase query, a critical aspect in medical\nimage analysis and radiological diagnosis. However, prevailing visual grounding\napproaches necessitate the manual extraction of key phrases from medical\nreports, imposing substantial burdens on both system efficiency and physicians.\nIn this paper, we introduce a novel framework, Medical Report Grounding\n(MedRG), an end-to-end solution for utilizing a multi-modal Large Language\nModel to predict key phrase by incorporating a unique token, BOX, into the\nvocabulary to serve as an embedding for unlocking detection capabilities.\nSubsequently, the vision encoder-decoder jointly decodes the hidden embedding\nand the input medical image, generating the corresponding grounding box. The\nexperimental results validate the effectiveness of MedRG, surpassing the\nperformance of the existing state-of-the-art medical phrase grounding methods.\nThis study represents a pioneering exploration of the medical report grounding\ntask, marking the first-ever endeavor in this domain.\n","authors":["Ke Zou","Yang Bai","Zhihao Chen","Yang Zhou","Yidi Chen","Kai Ren","Meng Wang","Xuedong Yuan","Xiaojing Shen","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2404.06798v1.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2311.15361v2","updated":"2024-04-10T06:46:08Z","published":"2023-11-26T17:27:26Z","title":"Ultra-Range Gesture Recognition using a Web-Camera in Human-Robot\n Interaction","summary":" Hand gestures play a significant role in human interactions where non-verbal\nintentions, thoughts and commands are conveyed. In Human-Robot Interaction\n(HRI), hand gestures offer a similar and efficient medium for conveying clear\nand rapid directives to a robotic agent. However, state-of-the-art vision-based\nmethods for gesture recognition have been shown to be effective only up to a\nuser-camera distance of seven meters. 
Such a short distance range limits\npractical HRI with, for example, service robots, search and rescue robots and\ndrones. In this work, we address the Ultra-Range Gesture Recognition (URGR)\nproblem by aiming for a recognition distance of up to 25 meters and in the\ncontext of HRI. We propose the URGR framework, a novel deep-learning, using\nsolely a simple RGB camera. Gesture inference is based on a single image.\nFirst, a novel super-resolution model termed High-Quality Network (HQ-Net) uses\na set of self-attention and convolutional layers to enhance the low-resolution\nimage of the user. Then, we propose a novel URGR classifier termed Graph Vision\nTransformer (GViT) which takes the enhanced image as input. GViT combines the\nbenefits of a Graph Convolutional Network (GCN) and a modified Vision\nTransformer (ViT). Evaluation of the proposed framework over diverse test data\nyields a high recognition rate of 98.1%. The framework has also exhibited\nsuperior performance compared to human recognition in ultra-range distances.\nWith the framework, we analyze and demonstrate the performance of an autonomous\nquadruped robot directed by human gestures in complex ultra-range indoor and\noutdoor environments, acquiring 96% recognition rate on average.\n","authors":["Eran Bamani","Eden Nissinman","Inbar Meir","Lisa Koenigsberg","Avishai Sintov"],"pdf_url":"https://arxiv.org/pdf/2311.15361v2.pdf","comment":"Engineering Applications of Artificial Intelligence, In press"},{"id":"http://arxiv.org/abs/2404.06780v1","updated":"2024-04-10T06:41:30Z","published":"2024-04-10T06:41:30Z","title":"Urban Architect: Steerable 3D Urban Scene Generation with Layout Prior","summary":" Text-to-3D generation has achieved remarkable success via large-scale\ntext-to-image diffusion models. Nevertheless, there is no paradigm for scaling\nup the methodology to urban scale. Urban scenes, characterized by numerous\nelements, intricate arrangement relationships, and vast scale, present a\nformidable barrier to the interpretability of ambiguous textual descriptions\nfor effective model optimization. In this work, we surmount the limitations by\nintroducing a compositional 3D layout representation into text-to-3D paradigm,\nserving as an additional prior. It comprises a set of semantic primitives with\nsimple geometric structures and explicit arrangement relationships,\ncomplementing textual descriptions and enabling steerable generation. Upon\nthis, we propose two modifications -- (1) We introduce Layout-Guided\nVariational Score Distillation to address model optimization inadequacies. It\nconditions the score distillation sampling process with geometric and semantic\nconstraints of 3D layouts. (2) To handle the unbounded nature of urban scenes,\nwe represent 3D scene with a Scalable Hash Grid structure, incrementally\nadapting to the growing scale of urban scenes. Extensive experiments\nsubstantiate the capability of our framework to scale text-to-3D generation to\nlarge-scale urban scenes that cover over 1000m driving distance for the first\ntime. We also present various scene editing demonstrations, showing the powers\nof steerable urban scene generation. 
Website: https://urbanarchitect.github.io.\n","authors":["Fan Lu","Kwan-Yee Lin","Yan Xu","Hongsheng Li","Guang Chen","Changjun Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.06780v1.pdf","comment":"Project page: https://urbanarchitect.github.io/"},{"id":"http://arxiv.org/abs/2404.06779v1","updated":"2024-04-10T06:39:18Z","published":"2024-04-10T06:39:18Z","title":"Efficient and Scalable Chinese Vector Font Generation via Component\n Composition","summary":" Chinese vector font generation is challenging due to the complex structure\nand huge amount of Chinese characters. Recent advances remain limited to\ngenerating a small set of characters with simple structure. In this work, we\nfirst observe that most Chinese characters can be disassembled into\nfrequently-reused components. Therefore, we introduce the first efficient and\nscalable Chinese vector font generation approach via component composition,\nallowing generating numerous vector characters from a small set of components.\nTo achieve this, we collect a large-scale dataset that contains over\n\\textit{90K} Chinese characters with their components and layout information.\nUpon the dataset, we propose a simple yet effective framework based on spatial\ntransformer networks (STN) and multiple losses tailored to font characteristics\nto learn the affine transformation of the components, which can be directly\napplied to the B\\'ezier curves, resulting in Chinese characters in vector\nformat. Our qualitative and quantitative experiments have demonstrated that our\nmethod significantly surpasses the state-of-the-art vector font generation\nmethods in generating large-scale complex Chinese characters in both font\ngeneration and zero-shot font extension.\n","authors":["Jinyu Song","Weitao You","Shuhui Shi","Shuxuan Guo","Lingyun Sun","Wei Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06779v1.pdf","comment":"15 pages, 23 figures"},{"id":"http://arxiv.org/abs/2404.06776v1","updated":"2024-04-10T06:35:25Z","published":"2024-04-10T06:35:25Z","title":"Logit Calibration and Feature Contrast for Robust Federated Learning on\n Non-IID Data","summary":" Federated learning (FL) is a privacy-preserving distributed framework for\ncollaborative model training on devices in edge networks. However, challenges\narise due to vulnerability to adversarial examples (AEs) and the\nnon-independent and identically distributed (non-IID) nature of data\ndistribution among devices, hindering the deployment of adversarially robust\nand accurate learning models at the edge. While adversarial training (AT) is\ncommonly acknowledged as an effective defense strategy against adversarial\nattacks in centralized training, we shed light on the adverse effects of\ndirectly applying AT in FL that can severely compromise accuracy, especially in\nnon-IID challenges. Given this limitation, this paper proposes FatCC, which\nincorporates local logit \\underline{C}alibration and global feature\n\\underline{C}ontrast into the vanilla federated adversarial training\n(\\underline{FAT}) process from both logit and feature perspectives. This\napproach can effectively enhance the federated system's robust accuracy (RA)\nand clean accuracy (CA). First, we propose logit calibration, where the logits\nare calibrated during local adversarial updates, thereby improving adversarial\nrobustness. 
Second, FatCC introduces feature contrast, which involves a global\nalignment term that aligns each local representation with unbiased global\nfeatures, thus further enhancing robustness and accuracy in federated\nadversarial environments. Extensive experiments across multiple datasets\ndemonstrate that FatCC achieves comparable or superior performance gains in\nboth CA and RA compared to other baselines.\n","authors":["Yu Qiao","Chaoning Zhang","Apurba Adhikary","Choong Seon Hong"],"pdf_url":"https://arxiv.org/pdf/2404.06776v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06773v1","updated":"2024-04-10T06:30:08Z","published":"2024-04-10T06:30:08Z","title":"Adapting LLaMA Decoder to Vision Transformer","summary":" This work examines whether decoder-only Transformers such as LLaMA, which\nwere originally designed for large language models (LLMs), can be adapted to\nthe computer vision field. We first \"LLaMAfy\" a standard ViT step-by-step to\nalign with LLaMA's architecture, and find that directly applying a causal mask\nto the self-attention brings an attention collapse issue, resulting in the\nfailure of network training. We suggest repositioning the class token\nbehind the image tokens with a post-sequence class token technique to overcome\nthis challenge, enabling causal self-attention to efficiently capture the\nentire image's information. Additionally, we develop a soft mask strategy that\ngradually introduces a causal mask to the self-attention at the onset of\ntraining to facilitate the optimization behavior. The tailored model, dubbed\nimage LLaMA (iLLaMA), is akin to LLaMA in architecture and enables direct\nsupervised learning. Its causal self-attention boosts computational efficiency\nand learns complex representations by elevating attention map ranks. iLLaMA\nrivals the performance of its encoder-only counterparts, achieving 75.1%\nImageNet top-1 accuracy with only 5.7M parameters. Scaling the model to ~310M\nand pre-training on ImageNet-21K further enhances the accuracy to 86.0%.\nExtensive experiments demonstrate iLLaMA's reliable properties: calibration,\nshape-texture bias, quantization compatibility, ADE20K segmentation and CIFAR\ntransfer learning. We hope our study can kindle fresh views on visual model\ndesign in the wave of LLMs. Pre-trained models and code are available here.\n","authors":["Jiahao Wang","Wenqi Shao","Mengzhao Chen","Chengyue Wu","Yong Liu","Kaipeng Zhang","Songyang Zhang","Kai Chen","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2404.06773v1.pdf","comment":"22 pages, 10 figures"},{"id":"http://arxiv.org/abs/2404.06753v1","updated":"2024-04-10T05:41:05Z","published":"2024-04-10T05:41:05Z","title":"MonoSelfRecon: Purely Self-Supervised Explicit Generalizable 3D\n Reconstruction of Indoor Scenes from Monocular RGB Views","summary":" Current monocular 3D scene reconstruction (3DR) works are either\nfully-supervised, or not generalizable, or implicit in 3D representation. We\npropose a novel framework - MonoSelfRecon that for the first time achieves\nexplicit 3D mesh reconstruction for generalizable indoor scenes with monocular\nRGB views by pure self-supervision on voxel-SDF (signed distance function).\nMonoSelfRecon follows an Autoencoder-based architecture, decodes voxel-SDF and\na generalizable Neural Radiance Field (NeRF), which is used to guide voxel-SDF\nin self-supervision. 
We propose novel self-supervised losses, which not only\nsupport pure self-supervision, but can also be used together with supervised signals\nto further boost supervised training. Our experiments show that \"MonoSelfRecon\"\ntrained with pure self-supervision outperforms the current best self-supervised\nindoor depth estimation models and is comparable to 3DR models trained with full\nsupervision and depth annotations. MonoSelfRecon is not restricted to a specific\nmodel design and can be applied to any model with voxel-SDF in a purely\nself-supervised manner.\n","authors":["Runfa Li","Upal Mahbub","Vasudev Bhaskaran","Truong Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.06753v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06744v1","updated":"2024-04-10T05:10:05Z","published":"2024-04-10T05:10:05Z","title":"YOLO based Ocean Eddy Localization with AWS SageMaker","summary":" Ocean eddies play a significant role both on the sea surface and beneath it,\ncontributing to the sustainability of marine life dependent on oceanic\nbehaviors. Therefore, it is crucial to investigate ocean eddies to monitor\nchanges in the Earth, particularly in the oceans, and their impact on climate.\nThis study aims to pinpoint ocean eddies using AWS cloud services, specifically\nSageMaker. The primary objective is to detect small-scale (<20km) ocean eddies\nfrom satellite remote images and assess the feasibility of utilizing SageMaker,\nwhich offers tools for deploying AI applications. Moreover, this research not\nonly explores the deployment of cloud-based services for remote sensing of\nEarth data but also evaluates several YOLO (You Only Look Once) models using\nsingle and multi-GPU-based services in the cloud. Furthermore, this study\nunderscores the potential of these services, their limitations, challenges\nrelated to deployment and resource management, and their user-friendliness for\nEarth science projects.\n","authors":["Seraj Al Mahmud Mostafa","Jinbo Wang","Benjamin Holt","Jianwu Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06744v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2404.06741v1","updated":"2024-04-10T04:59:51Z","published":"2024-04-10T04:59:51Z","title":"An Animation-based Augmentation Approach for Action Recognition from\n Discontinuous Video","summary":" The study of action recognition has attracted considerable attention recently\ndue to its broad applications in multiple areas. However, the issue of\ndiscontinuous training video, which not only decreases the performance of the\naction recognition model but also complicates the data augmentation process,\nstill remains under-explored. In this study, we introduce the 4A\n(Action Animation-based Augmentation Approach), an innovative pipeline for data\naugmentation to address the problem. The main contributions of our work\ninclude: (1) we investigate the problem of severe performance decrease when the\naction recognition task is trained on discontinuous video, and the limitations of\nexisting augmentation methods in solving this problem. (2) we propose a novel\naugmentation pipeline, 4A, to address the problem of discontinuous video for\ntraining, while achieving a smoother and more natural-looking action representation\nthan the latest data augmentation methodology. 
(3) We achieve the same\nperformance with only 10% of the original data for training as with all of the\noriginal data from the real-world dataset, and a better performance on\nIn-the-wild videos, by employing our data augmentation techniques.\n","authors":["Xingyu Song","Zhan Li","Shi Chen","Xin-Qiang Cai","Kazuyuki Demachi"],"pdf_url":"https://arxiv.org/pdf/2404.06741v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.02736v4","updated":"2024-04-10T04:51:33Z","published":"2022-11-04T20:22:58Z","title":"Discovering Closed-Loop Failures of Vision-Based Controllers via\n Reachability Analysis","summary":" Machine learning driven image-based controllers allow robotic systems to take\nintelligent actions based on the visual feedback from their environment.\nUnderstanding when these controllers might lead to system safety violations is\nimportant for their integration in safety-critical applications and engineering\ncorrective safety measures for the system. Existing methods leverage\nsimulation-based testing (or falsification) to find the failures of\nvision-based controllers, i.e., the visual inputs that lead to closed-loop\nsafety violations. However, these techniques do not scale well to the scenarios\ninvolving high-dimensional and complex visual inputs, such as RGB images. In\nthis work, we cast the problem of finding closed-loop vision failures as a\nHamilton-Jacobi (HJ) reachability problem. Our approach blends simulation-based\nanalysis with HJ reachability methods to compute an approximation of the\nbackward reachable tube (BRT) of the system, i.e., the set of unsafe states for\nthe system under vision-based controllers. Utilizing the BRT, we can tractably\nand systematically find the system states and corresponding visual inputs that\nlead to closed-loop failures. These visual inputs can be subsequently analyzed\nto find the input characteristics that might have caused the failure. Besides\nits scalability to high-dimensional visual inputs, an explicit computation of\nBRT allows the proposed approach to capture non-trivial system failures that\nare difficult to expose via random simulations. We demonstrate our framework on\ntwo case studies involving an RGB image-based neural network controller for (a)\nautonomous indoor navigation, and (b) autonomous aircraft taxiing.\n","authors":["Kaustav Chakraborty","Somil Bansal"],"pdf_url":"https://arxiv.org/pdf/2211.02736v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.01289v3","updated":"2024-04-10T04:42:10Z","published":"2023-06-02T06:15:36Z","title":"nnMobileNe: Rethinking CNN for Retinopathy Research","summary":" Over the past few decades, convolutional neural networks (CNNs) have been at\nthe forefront of the detection and tracking of various retinal diseases (RD).\nDespite their success, the emergence of vision transformers (ViT) in the 2020s\nhas shifted the trajectory of RD model development. The leading-edge\nperformance of ViT-based models in RD can be largely credited to their\nscalability-their ability to improve as more parameters are added. As a result,\nViT-based models tend to outshine traditional CNNs in RD applications, albeit\nat the cost of increased data and computational demands. ViTs also differ from\nCNNs in their approach to processing images, working with patches rather than\nlocal regions, which can complicate the precise localization of small, variably\npresented lesions in RD. 
In our study, we revisited and updated the\narchitecture of a CNN model, specifically MobileNet, to enhance its utility in\nRD diagnostics. We found that an optimized MobileNet, through selective\nmodifications, can surpass ViT-based models in various RD benchmarks, including\ndiabetic retinopathy grading, detection of multiple fundus diseases, and\nclassification of diabetic macular edema. The code is available at\nhttps://github.com/Retinal-Research/NN-MOBILENET\n","authors":["Wenhui Zhu","Peijie Qiu","Xiwen Chen","Xin Li","Natasha Lepore","Oana M. Dumitrascu","Yalin Wang"],"pdf_url":"https://arxiv.org/pdf/2306.01289v3.pdf","comment":"Accepted as a conference paper to 2024 CVPRW"},{"id":"http://arxiv.org/abs/2404.06727v1","updated":"2024-04-10T04:24:42Z","published":"2024-04-10T04:24:42Z","title":"Bayesian NeRF: Quantifying Uncertainty with Volume Density in Neural\n Radiance Fields","summary":" We present the Bayesian Neural Radiance Field (NeRF), which explicitly\nquantifies uncertainty in geometric volume structures without the need for\nadditional networks, making it adept for challenging observations and\nuncontrolled images. NeRF diverges from traditional geometric methods by\noffering an enriched scene representation, rendering color and density in 3D\nspace from various viewpoints. However, NeRF encounters limitations in relaxing\nuncertainties by using geometric structure information, leading to inaccuracies\nin interpretation under insufficient real-world observations. Recent research\nefforts aimed at addressing this issue have primarily relied on empirical\nmethods or auxiliary networks. To fundamentally address this issue, we propose\na series of formulational extensions to NeRF. By introducing generalized\napproximations and defining density-related uncertainty, our method seamlessly\nextends to manage uncertainty not only for RGB but also for depth, without the\nneed for additional networks or empirical assumptions. In experiments we show\nthat our method significantly enhances performance on RGB and depth images in\nthe comprehensive dataset, demonstrating the reliability of the Bayesian NeRF\napproach to quantifying uncertainty based on the geometric structure.\n","authors":["Sibeak Lee","Kyeongsu Kang","Hyeonwoo Yu"],"pdf_url":"https://arxiv.org/pdf/2404.06727v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03384v2","updated":"2024-04-10T04:24:36Z","published":"2024-04-04T11:33:29Z","title":"LongVLM: Efficient Long Video Understanding via Large Language Models","summary":" Empowered by Large Language Models (LLMs), recent advancements in VideoLLMs\nhave driven progress in various video understanding tasks. These models encode\nvideo representations through pooling or query aggregation over a vast number\nof visual tokens, making computational and memory costs affordable. Despite\nsuccessfully providing an overall comprehension of video content, existing\nVideoLLMs still face challenges in achieving detailed understanding in videos\ndue to overlooking local information in long-term videos. To tackle this\nchallenge, we introduce LongVLM, a straightforward yet powerful VideoLLM for\nlong video understanding, building upon the observation that long videos often\nconsist of sequential key events, complex actions, and camera movements. Our\napproach proposes to decompose long videos into multiple short-term segments\nand encode local features for each local segment via a hierarchical token\nmerging module. 
These features are concatenated in temporal order to maintain\nthe storyline across sequential short-term segments. Additionally, we propose\nto integrate global semantics into each local feature to enhance context\nunderstanding. In this way, we encode video representations that incorporate\nboth local and global information, enabling the LLM to generate comprehensive\nresponses for long-term videos. Experimental results on the VideoChatGPT\nbenchmark and zero-shot video question-answering datasets demonstrate the\nsuperior capabilities of our model over the previous state-of-the-art methods.\nQualitative examples demonstrate that our model produces more precise responses\nfor long videos understanding. Code will be available at\nhttps://github.com/ziplab/LongVLM.\n","authors":["Yuetian Weng","Mingfei Han","Haoyu He","Xiaojun Chang","Bohan Zhuang"],"pdf_url":"https://arxiv.org/pdf/2404.03384v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11848v2","updated":"2024-04-10T04:05:24Z","published":"2024-03-18T15:00:38Z","title":"GraphBEV: Towards Robust BEV Feature Alignment for Multi-Modal 3D Object\n Detection","summary":" Integrating LiDAR and camera information into Bird's-Eye-View (BEV)\nrepresentation has emerged as a crucial aspect of 3D object detection in\nautonomous driving. However, existing methods are susceptible to the inaccurate\ncalibration relationship between LiDAR and the camera sensor. Such inaccuracies\nresult in errors in depth estimation for the camera branch, ultimately causing\nmisalignment between LiDAR and camera BEV features. In this work, we propose a\nrobust fusion framework called Graph BEV. Addressing errors caused by\ninaccurate point cloud projection, we introduce a Local Align module that\nemploys neighbor-aware depth features via Graph matching. Additionally, we\npropose a Global Align module to rectify the misalignment between LiDAR and\ncamera BEV features. Our Graph BEV framework achieves state-of-the-art\nperformance, with an mAP of 70.1\\%, surpassing BEV Fusion by 1.6\\% on the\nnuscenes validation set. Importantly, our Graph BEV outperforms BEV Fusion by\n8.3\\% under conditions with misalignment noise.\n","authors":["Ziying Song","Lei Yang","Shaoqing Xu","Lin Liu","Dongyang Xu","Caiyan Jia","Feiyang Jia","Li Wang"],"pdf_url":"https://arxiv.org/pdf/2403.11848v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06194v2","updated":"2024-04-10T04:01:43Z","published":"2024-04-09T10:27:22Z","title":"Exploring the Potential of Large Foundation Models for Open-Vocabulary\n HOI Detection","summary":" Open-vocabulary human-object interaction (HOI) detection, which is concerned\nwith the problem of detecting novel HOIs guided by natural language, is crucial\nfor understanding human-centric scenes. However, prior zero-shot HOI detectors\noften employ the same levels of feature maps to model HOIs with varying\ndistances, leading to suboptimal performance in scenes containing human-object\npairs with a wide range of distances. In addition, these detectors primarily\nrely on category names and overlook the rich contextual information that\nlanguage can provide, which is essential for capturing open vocabulary concepts\nthat are typically rare and not well-represented by category names alone. In\nthis paper, we introduce a novel end-to-end open vocabulary HOI detection\nframework with conditional multi-level decoding and fine-grained semantic\nenhancement (CMD-SE), harnessing the potential of Visual-Language Models\n(VLMs). 
Specifically, we propose to model human-object pairs with different\ndistances with different levels of feature maps by incorporating a soft\nconstraint during the bipartite matching process. Furthermore, by leveraging\nlarge language models (LLMs) such as GPT models, we exploit their extensive\nworld knowledge to generate descriptions of human body part states for various\ninteractions. Then we integrate the generalizable and fine-grained semantics of\nhuman body parts to improve interaction recognition. Experimental results on\ntwo datasets, SWIG-HOI and HICO-DET, demonstrate that our proposed method\nachieves state-of-the-art results in open vocabulary HOI detection. The code\nand models are available at https://github.com/ltttpku/CMD-SE-release.\n","authors":["Ting Lei","Shaofeng Yin","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06194v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06715v1","updated":"2024-04-10T03:54:53Z","published":"2024-04-10T03:54:53Z","title":"Sparse Points to Dense Clouds: Enhancing 3D Detection with Limited LiDAR\n Data","summary":" 3D detection is a critical task that enables machines to identify and locate\nobjects in three-dimensional space. It has a broad range of applications in\nseveral fields, including autonomous driving, robotics and augmented reality.\nMonocular 3D detection is attractive as it requires only a single camera,\nhowever, it lacks the accuracy and robustness required for real world\napplications. High resolution LiDAR on the other hand, can be expensive and\nlead to interference problems in heavy traffic given their active\ntransmissions. We propose a balanced approach that combines the advantages of\nmonocular and point cloud-based 3D detection. Our method requires only a small\nnumber of 3D points, that can be obtained from a low-cost, low-resolution\nsensor. Specifically, we use only 512 points, which is just 1% of a full LiDAR\nframe in the KITTI dataset. Our method reconstructs a complete 3D point cloud\nfrom this limited 3D information combined with a single image. The\nreconstructed 3D point cloud and corresponding image can be used by any\nmulti-modal off-the-shelf detector for 3D object detection. By using the\nproposed network architecture with an off-the-shelf multi-modal 3D detector,\nthe accuracy of 3D detection improves by 20% compared to the state-of-the-art\nmonocular detection methods and 6% to 9% compare to the baseline multi-modal\nmethods on KITTI and JackRabbot datasets.\n","authors":["Aakash Kumar","Chen Chen","Ajmal Mian","Neils Lobo","Mubarak Shah"],"pdf_url":"https://arxiv.org/pdf/2404.06715v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01929v2","updated":"2024-04-10T03:36:33Z","published":"2024-04-02T13:23:21Z","title":"Towards Enhanced Analysis of Lung Cancer Lesions in EBUS-TBNA -- A\n Semi-Supervised Video Object Detection Method","summary":" This study aims to establish a computer-aided diagnostic system for lung\nlesions using bronchoscope endobronchial ultrasound (EBUS) to assist physicians\nin identifying lesion areas. During EBUS-transbronchial needle aspiration\n(EBUS-TBNA) procedures, physicians rely on grayscale ultrasound images to\ndetermine the location of lesions. However, these images often contain\nsignificant noise and can be influenced by surrounding tissues or blood\nvessels, making interpretation challenging. 
Previous research has lacked the\napplication of object detection models to EBUS-TBNA, and there has been no\nwell-defined solution for annotating the EBUS-TBNA dataset. In related studies\non ultrasound images, although models have been successful in capturing target\nregions for their respective tasks, their training and predictions have been\nbased on two-dimensional images, limiting their ability to leverage temporal\nfeatures for improved predictions. This study introduces a three-dimensional\nimage-based object detection model. It utilizes an attention mechanism to\ncapture temporal correlations and we will implements a filtering mechanism to\nselect relevant information from previous frames. Subsequently, a\nteacher-student model training approach is employed to optimize the model\nfurther, leveraging unlabeled data. To mitigate the impact of poor-quality\npseudo-labels on the student model, we will add a special Gaussian Mixture\nModel (GMM) to ensure the quality of pseudo-labels.\n","authors":["Jyun-An Lin","Yun-Chien Cheng","Ching-Kai Lin"],"pdf_url":"https://arxiv.org/pdf/2404.01929v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06080v2","updated":"2024-04-10T03:35:35Z","published":"2024-04-09T07:39:21Z","title":"Using Few-Shot Learning to Classify Primary Lung Cancer and Other\n Malignancy with Lung Metastasis in Cytological Imaging via Endobronchial\n Ultrasound Procedures","summary":" This study aims to establish a computer-aided diagnosis system for\nendobronchial ultrasound (EBUS) surgery to assist physicians in the preliminary\ndiagnosis of metastatic cancer. This involves arranging immediate examinations\nfor other sites of metastatic cancer after EBUS surgery, eliminating the need\nto wait for reports, thereby shortening the waiting time by more than half and\nenabling patients to detect other cancers earlier, allowing for early planning\nand implementation of treatment plans. Unlike previous studies on cell image\nclassification, which have abundant datasets for training, this study must also\nbe able to make effective classifications despite the limited amount of case\ndata for lung metastatic cancer. In the realm of small data set classification\nmethods, Few-shot learning (FSL) has become mainstream in recent years. Through\nits ability to train on small datasets and its strong generalization\ncapabilities, FSL shows potential in this task of lung metastatic cell image\nclassification. This study will adopt the approach of Few-shot learning,\nreferencing existing proposed models, and designing a model architecture for\nclassifying lung metastases cell images. Batch Spectral Regularization (BSR)\nwill be incorporated as a loss update parameter, and the Finetune method of PMF\nwill be modified. In terms of test results, the addition of BSR and the\nmodified Finetune method further increases the accuracy by 8.89% to 65.60%,\noutperforming other FSL methods. 
This study confirms that FSL is superior to\nsupervised and transfer learning in classifying metastatic cancer and\ndemonstrates that using BSR as a loss function and modifying Finetune can\nenhance the model's capabilities.\n","authors":["Ching-Kai Lin","Di-Chun Wei","Yun-Chien Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.06080v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06710v1","updated":"2024-04-10T03:31:32Z","published":"2024-04-10T03:31:32Z","title":"SpikeNVS: Enhancing Novel View Synthesis from Blurry Images via Spike\n Camera","summary":" One of the most critical factors in achieving sharp Novel View Synthesis\n(NVS) using neural field methods like Neural Radiance Fields (NeRF) and 3D\nGaussian Splatting (3DGS) is the quality of the training images. However,\nConventional RGB cameras are susceptible to motion blur. In contrast,\nneuromorphic cameras like event and spike cameras inherently capture more\ncomprehensive temporal information, which can provide a sharp representation of\nthe scene as additional training data. Recent methods have explored the\nintegration of event cameras to improve the quality of NVS. The event-RGB\napproaches have some limitations, such as high training costs and the inability\nto work effectively in the background. Instead, our study introduces a new\nmethod that uses the spike camera to overcome these limitations. By considering\ntexture reconstruction from spike streams as ground truth, we design the\nTexture from Spike (TfS) loss. Since the spike camera relies on temporal\nintegration instead of temporal differentiation used by event cameras, our\nproposed TfS loss maintains manageable training costs. It handles foreground\nobjects with backgrounds simultaneously. We also provide a real-world dataset\ncaptured with our spike-RGB camera system to facilitate future research\nendeavors. We conduct extensive experiments using synthetic and real-world\ndatasets to demonstrate that our design can enhance novel view synthesis across\nNeRF and 3DGS. The code and dataset will be made available for public access.\n","authors":["Gaole Dai","Zhenyu Wang","Qinwen Xu","Wen Cheng","Ming Lu","Boxing Shi","Shanghang Zhang","Tiejun Huang"],"pdf_url":"https://arxiv.org/pdf/2404.06710v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06038v2","updated":"2024-04-10T03:27:04Z","published":"2023-07-12T09:33:21Z","title":"Pyramid Deep Fusion Network for Two-Hand Reconstruction from RGB-D\n Images","summary":" Accurately recovering the dense 3D mesh of both hands from monocular images\nposes considerable challenges due to occlusions and projection ambiguity. Most\nof the existing methods extract features from color images to estimate the\nroot-aligned hand meshes, which neglect the crucial depth and scale information\nin the real world. Given the noisy sensor measurements with limited resolution,\ndepth-based methods predict 3D keypoints rather than a dense mesh. These\nlimitations motivate us to take advantage of these two complementary inputs to\nacquire dense hand meshes on a real-world scale. In this work, we propose an\nend-to-end framework for recovering dense meshes for both hands, which employ\nsingle-view RGB-D image pairs as input. The primary challenge lies in\neffectively utilizing two different input modalities to mitigate the blurring\neffects in RGB images and noises in depth images. 
Instead of directly treating\ndepth maps as additional channels for RGB images, we encode the depth\ninformation into the unordered point cloud to preserve more geometric details.\nSpecifically, our framework employs ResNet50 and PointNet++ to derive features\nfrom RGB and point cloud, respectively. Additionally, we introduce a novel\npyramid deep fusion network (PDFNet) to aggregate features at different scales,\nwhich demonstrates superior efficacy compared to previous fusion strategies.\nFurthermore, we employ a GCN-based decoder to process the fused features and\nrecover the corresponding 3D pose and dense mesh. Through comprehensive\nablation experiments, we have not only demonstrated the effectiveness of our\nproposed fusion algorithm but also outperformed the state-of-the-art approaches\non publicly available datasets. To reproduce the results, we will make our\nsource code and models publicly available at\n{https://github.com/zijinxuxu/PDFNet}.\n","authors":["Jinwei Ren","Jianke Zhu"],"pdf_url":"https://arxiv.org/pdf/2307.06038v2.pdf","comment":"Accepted by TCSVT"},{"id":"http://arxiv.org/abs/2404.06704v1","updated":"2024-04-10T03:20:33Z","published":"2024-04-10T03:20:33Z","title":"Convolution-based Probability Gradient Loss for Semantic Segmentation","summary":" In this paper, we introduce a novel Convolution-based Probability Gradient\n(CPG) loss for semantic segmentation. It employs convolution kernels similar to\nthe Sobel operator, capable of computing the gradient of pixel intensity in an\nimage. This enables the computation of gradients for both ground-truth and\npredicted category-wise probabilities. It enhances network performance by\nmaximizing the similarity between these two probability gradients. Moreover, to\nspecifically enhance accuracy near the object's boundary, we extract the object\nboundary based on the ground-truth probability gradient and exclusively apply\nthe CPG loss to pixels belonging to boundaries. CPG loss proves to be highly\nconvenient and effective. It establishes pixel relationships through\nconvolution, calculating errors from a distinct dimension compared to\npixel-wise loss functions such as cross-entropy loss. We conduct qualitative\nand quantitative analyses to evaluate the impact of the CPG loss on three\nwell-established networks (DeepLabv3-Resnet50, HRNetV2-OCR, and\nLRASPP_MobileNet_V3_Large) across three standard segmentation datasets\n(Cityscapes, COCO-Stuff, ADE20K). Our extensive experimental results\nconsistently and significantly demonstrate that the CPG loss enhances the mean\nIntersection over Union.\n","authors":["Guohang Shan","Shuangcheng Jia"],"pdf_url":"https://arxiv.org/pdf/2404.06704v1.pdf","comment":"12 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.06700v1","updated":"2024-04-10T03:11:10Z","published":"2024-04-10T03:11:10Z","title":"Scaling Multi-Camera 3D Object Detection through Weak-to-Strong\n Eliciting","summary":" The emergence of Multi-Camera 3D Object Detection (MC3D-Det), facilitated by\nbird's-eye view (BEV) representation, signifies a notable progression in 3D\nobject detection. Scaling MC3D-Det training effectively accommodates varied\ncamera parameters and urban landscapes, paving the way for the MC3D-Det\nfoundation model. However, the multi-view fusion stage of the MC3D-Det method\nrelies on the ill-posed monocular perception during training rather than\nsurround refinement ability, leading to what we term \"surround refinement\ndegradation\". 
To this end, our study presents a weak-to-strong eliciting\nframework aimed at enhancing surround refinement while maintaining robust\nmonocular perception. Specifically, our framework employs weakly tuned experts\ntrained on distinct subsets, and each is inherently biased toward specific\ncamera configurations and scenarios. These biased experts can learn the\nperception of monocular degeneration, which can help the multi-view fusion\nstage to enhance surround refinement abilities. Moreover, a composite\ndistillation strategy is proposed to integrate the universal knowledge of 2D\nfoundation models and task-specific information. Finally, for MC3D-Det joint\ntraining, the elaborate dataset merge strategy is designed to solve the problem\nof inconsistent camera numbers and camera parameters. We set up a multiple\ndataset joint training benchmark for MC3D-Det and adequately evaluated existing\nmethods. Further, we demonstrate the proposed framework brings a generalized\nand significant boost over multiple baselines. Our code is at\n\\url{https://github.com/EnVision-Research/Scale-BEV}.\n","authors":["Hao Lu","Jiaqi Tang","Xinli Xu","Xu Cao","Yunpeng Zhang","Guoqing Wang","Dalong Du","Hao Chen","Yingcong Chen"],"pdf_url":"https://arxiv.org/pdf/2404.06700v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05645v2","updated":"2024-04-10T03:05:04Z","published":"2023-09-11T17:37:08Z","title":"CitDet: A Benchmark Dataset for Citrus Fruit Detection","summary":" In this letter, we present a new dataset to advance the state of the art in\ndetecting citrus fruit and accurately estimate yield on trees affected by the\nHuanglongbing (HLB) disease in orchard environments via imaging. Despite the\nfact that significant progress has been made in solving the fruit detection\nproblem, the lack of publicly available datasets has complicated direct\ncomparison of results. For instance, citrus detection has long been of interest\nto the agricultural research community, yet there is an absence of work,\nparticularly involving public datasets of citrus affected by HLB. To address\nthis issue, we enhance state-of-the-art object detection methods for use in\ntypical orchard settings. Concretely, we provide high-resolution images of\ncitrus trees located in an area known to be highly affected by HLB, along with\nhigh-quality bounding box annotations of citrus fruit. Fruit on both the trees\nand the ground are labeled to allow for identification of fruit location, which\ncontributes to advancements in yield estimation and potential measure of HLB\nimpact via fruit drop. The dataset consists of over 32,000 bounding box\nannotations for fruit instances contained in 579 high-resolution images. In\nsummary, our contributions are the following: (i) we introduce a novel dataset\nalong with baseline performance benchmarks on multiple contemporary object\ndetection algorithms, (ii) we show the ability to accurately capture fruit\nlocation on tree or on ground, and finally (ii) we present a correlation of our\nresults with yield estimations.\n","authors":["Jordan A. James","Heather K. Manching","Matthew R. Mattia","Kim D. Bowman","Amanda M. Hulse-Kemp","William J. 
Beksi"],"pdf_url":"https://arxiv.org/pdf/2309.05645v2.pdf","comment":"Submitted to IEEE Robotics and Automation Letters (RA-L)"},{"id":"http://arxiv.org/abs/2404.06693v1","updated":"2024-04-10T02:47:05Z","published":"2024-04-10T02:47:05Z","title":"Binomial Self-compensation for Motion Error in Dynamic 3D Scanning","summary":" Phase shifting profilometry (PSP) is favored in high-precision 3D scanning\ndue to its high accuracy, robustness, and pixel-wise property. However, a\nfundamental assumption of PSP that the object should remain static is violated\nin dynamic measurement, making PSP susceptible to object moving, resulting in\nripple-like errors in the point clouds. We propose a pixel-wise and frame-wise\nloopable binomial self-compensation (BSC) algorithm to effectively and flexibly\neliminate motion error in the four-step PSP. Our mathematical model\ndemonstrates that by summing successive motion-affected phase frames weighted\nby binomial coefficients, motion error exponentially diminishes as the binomial\norder increases, accomplishing automatic error compensation through the\nmotion-affected phase sequence, without the assistance of any intermediate\nvariable. Extensive experiments show that our BSC outperforms the existing\nmethods in reducing motion error, while achieving a depth map frame rate equal\nto the camera's acquisition rate (90 fps), enabling high-accuracy 3D\nreconstruction with a quasi-single-shot frame rate.\n","authors":["Geyou Zhang","Ce Zhu","Kai Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06693v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06692v1","updated":"2024-04-10T02:40:17Z","published":"2024-04-10T02:40:17Z","title":"Perception-Oriented Video Frame Interpolation via Asymmetric Blending","summary":" Previous methods for Video Frame Interpolation (VFI) have encountered\nchallenges, notably the manifestation of blur and ghosting effects. These\nissues can be traced back to two pivotal factors: unavoidable motion errors and\nmisalignment in supervision. In practice, motion estimates often prove to be\nerror-prone, resulting in misaligned features. Furthermore, the reconstruction\nloss tends to bring blurry results, particularly in misaligned regions. To\nmitigate these challenges, we propose a new paradigm called PerVFI\n(Perception-oriented Video Frame Interpolation). Our approach incorporates an\nAsymmetric Synergistic Blending module (ASB) that utilizes features from both\nsides to synergistically blend intermediate features. One reference frame\nemphasizes primary content, while the other contributes complementary\ninformation. To impose a stringent constraint on the blending process, we\nintroduce a self-learned sparse quasi-binary mask which effectively mitigates\nghosting and blur artifacts in the output. Additionally, we employ a\nnormalizing flow-based generator and utilize the negative log-likelihood loss\nto learn the conditional distribution of the output, which further facilitates\nthe generation of clear and fine details. Experimental results validate the\nsuperiority of PerVFI, demonstrating significant improvements in perceptual\nquality compared to existing methods. 
Codes are available at\n\\url{https://github.com/mulns/PerVFI}\n","authors":["Guangyang Wu","Xin Tao","Changlin Li","Wenyi Wang","Xiaohong Liu","Qingqing Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.06692v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2210.16101v2","updated":"2024-04-10T02:33:57Z","published":"2022-10-27T13:24:08Z","title":"A Generic Shared Attention Mechanism for Various Backbone Neural\n Networks","summary":" The self-attention mechanism has emerged as a critical component for\nimproving the performance of various backbone neural networks. However, current\nmainstream approaches individually incorporate newly designed self-attention\nmodules (SAMs) into each layer of the network for granted without fully\nexploiting their parameters' potential. This leads to suboptimal performance\nand increased parameter consumption as the network depth increases. To improve\nthis paradigm, in this paper, we first present a counterintuitive but inherent\nphenomenon: SAMs tend to produce strongly correlated attention maps across\ndifferent layers, with an average Pearson correlation coefficient of up to\n0.85. Inspired by this inherent observation, we propose Dense-and-Implicit\nAttention (DIA), which directly shares SAMs across layers and employs a long\nshort-term memory module to calibrate and bridge the highly correlated\nattention maps of different layers, thus improving the parameter utilization\nefficiency of SAMs. This design of DIA is also consistent with the neural\nnetwork's dynamical system perspective. Through extensive experiments, we\ndemonstrate that our simple yet effective DIA can consistently enhance various\nnetwork backbones, including ResNet, Transformer, and UNet, across tasks such\nas image classification, object detection, and image generation using diffusion\nmodels.\n","authors":["Zhongzhan Huang","Senwei Liang","Mingfu Liang","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2210.16101v2.pdf","comment":"Work in progress. arXiv admin note: text overlap with\n arXiv:1905.10671"},{"id":"http://arxiv.org/abs/2404.06493v2","updated":"2024-04-10T02:24:58Z","published":"2024-04-09T17:48:52Z","title":"Flying with Photons: Rendering Novel Views of Propagating Light","summary":" We present an imaging and neural rendering technique that seeks to synthesize\nvideos of light propagating through a scene from novel, moving camera\nviewpoints. Our approach relies on a new ultrafast imaging setup to capture a\nfirst-of-its kind, multi-viewpoint video dataset with picosecond-level temporal\nresolution. Combined with this dataset, we introduce an efficient neural volume\nrendering framework based on the transient field. This field is defined as a\nmapping from a 3D point and 2D direction to a high-dimensional, discrete-time\nsignal that represents time-varying radiance at ultrafast timescales. Rendering\nwith transient fields naturally accounts for effects due to the finite speed of\nlight, including viewpoint-dependent appearance changes caused by light\npropagation delays to the camera. We render a range of complex effects,\nincluding scattering, specular reflection, refraction, and diffraction.\nAdditionally, we demonstrate removing viewpoint-dependent propagation delays\nusing a time warping procedure, rendering of relativistic effects, and video\nsynthesis of direct and global components of light transport.\n","authors":["Anagh Malik","Noah Juravsky","Ryan Po","Gordon Wetzstein","Kiriakos N. Kutulakos","David B. 
Lindell"],"pdf_url":"https://arxiv.org/pdf/2404.06493v2.pdf","comment":"Project page: https://anaghmalik.com/FlyingWithPhotons/"},{"id":"http://arxiv.org/abs/2404.06507v2","updated":"2024-04-10T02:23:09Z","published":"2024-04-09T17:55:41Z","title":"Reconstructing Hand-Held Objects in 3D","summary":" Objects manipulated by the hand (i.e., manipulanda) are particularly\nchallenging to reconstruct from in-the-wild RGB images or videos. Not only does\nthe hand occlude much of the object, but also the object is often only visible\nin a small number of image pixels. At the same time, two strong anchors emerge\nin this setting: (1) estimated 3D hands help disambiguate the location and\nscale of the object, and (2) the set of manipulanda is small relative to all\npossible objects. With these insights in mind, we present a scalable paradigm\nfor handheld object reconstruction that builds on recent breakthroughs in large\nlanguage/vision models and 3D object datasets. Our model, MCC-Hand-Object\n(MCC-HO), jointly reconstructs hand and object geometry given a single RGB\nimage and inferred 3D hand as inputs. Subsequently, we use GPT-4(V) to retrieve\na 3D object model that matches the object in the image and rigidly align the\nmodel to the network-inferred geometry; we call this alignment\nRetrieval-Augmented Reconstruction (RAR). Experiments demonstrate that MCC-HO\nachieves state-of-the-art performance on lab and Internet datasets, and we show\nhow RAR can be used to automatically obtain 3D labels for in-the-wild images of\nhand-object interactions.\n","authors":["Jane Wu","Georgios Pavlakos","Georgia Gkioxari","Jitendra Malik"],"pdf_url":"https://arxiv.org/pdf/2404.06507v2.pdf","comment":"Project page: https://janehwu.github.io/mcc-ho"},{"id":"http://arxiv.org/abs/2311.10568v2","updated":"2024-04-10T02:19:19Z","published":"2023-11-17T15:08:15Z","title":"Phase Guided Light Field for Spatial-Depth High Resolution 3D Imaging","summary":" On 3D imaging, light field cameras typically are of single shot, and however,\nthey heavily suffer from low spatial resolution and depth accuracy. In this\npaper, by employing an optical projector to project a group of single\nhigh-frequency phase-shifted sinusoid patterns, we propose a phase guided light\nfield algorithm to significantly improve both the spatial and depth resolutions\nfor off-the-shelf light field cameras. First, for correcting the axial\naberrations caused by the main lens of our light field camera, we propose a\ndeformed cone model to calibrate our structured light field system. Second,\nover wrapped phases computed from patterned images, we propose a stereo\nmatching algorithm, i.e. phase guided sum of absolute difference, to robustly\nobtain the correspondence for each pair of neighbored two lenslets. Finally, by\nintroducing a virtual camera according to the basic geometrical optics of light\nfield imaging, we propose a reorganization strategy to reconstruct 3D point\nclouds with spatial-depth high resolution. 
Experimental results show that,\ncompared with the state-of-the-art active light field methods, the proposed\nreconstructs 3D point clouds with a spatial resolution of 1280$\\times$720 with\nfactors 10$\\times$ increased, while maintaining the same high depth resolution\nand needing merely a single group of high-frequency patterns.\n","authors":["Geyou Zhang","Ce Zhu","Kai Liu","Yipeng Liu"],"pdf_url":"https://arxiv.org/pdf/2311.10568v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06479v2","updated":"2024-04-10T02:12:27Z","published":"2024-04-09T17:30:18Z","title":"Text-Based Reasoning About Vector Graphics","summary":" While large multimodal models excel in broad vision-language benchmarks, they\noften struggle with tasks requiring precise perception of low-level visual\ndetails, such as comparing line lengths or solving simple mazes. In particular,\nthis failure mode persists in question-answering tasks about vector graphics --\nimages composed purely of 2D objects and shapes. To address this challenge, we\npropose the Visually Descriptive Language Model (VDLM), which performs\ntext-based reasoning about vector graphics. VDLM leverages Scalable Vector\nGraphics (SVG) for a more precise visual description and first uses an\noff-the-shelf raster-to-SVG algorithm for encoding. Since existing language\nmodels cannot understand raw SVGs in a zero-shot setting, VDLM then bridges SVG\nwith pretrained language models through a newly introduced intermediate\nsymbolic representation, Primal Visual Description (PVD), comprising primitive\nattributes (e.g., shape, position, measurement) with their corresponding\npredicted values. PVD is task-agnostic and represents visual primitives that\nare universal across all vector graphics. It can be learned with procedurally\ngenerated (SVG, PVD) pairs and also enables the direct use of LLMs for\ngeneralization to complex reasoning tasks. By casting an image to a text-based\nrepresentation, we can leverage the power of language models to learn alignment\nfrom SVG to visual primitives and generalize to unseen question-answering\ntasks. Empirical results show that VDLM achieves stronger zero-shot performance\ncompared to state-of-the-art LMMs, such as GPT-4V, in various low-level\nmultimodal perception and reasoning tasks on vector graphics. We additionally\npresent extensive analyses on VDLM's performance, demonstrating that our\nframework offers better interpretability due to its disentangled perception and\nreasoning processes. Project page: https://mikewangwzhl.github.io/VDLM/\n","authors":["Zhenhailong Wang","Joy Hsu","Xingyao Wang","Kuan-Hao Huang","Manling Li","Jiajun Wu","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2404.06479v2.pdf","comment":"Project page: https://mikewangwzhl.github.io/VDLM/"},{"id":"http://arxiv.org/abs/2404.06683v1","updated":"2024-04-10T02:03:14Z","published":"2024-04-10T02:03:14Z","title":"Unsupervised Visible-Infrared ReID via Pseudo-label Correction and\n Modality-level Alignment","summary":" Unsupervised visible-infrared person re-identification (UVI-ReID) has\nrecently gained great attention due to its potential for enhancing human\ndetection in diverse environments without labeling. Previous methods utilize\nintra-modality clustering and cross-modality feature matching to achieve\nUVI-ReID. 
However, there exist two challenges: 1) noisy pseudo labels might be\ngenerated in the clustering process, and 2) the cross-modality feature\nalignment via matching the marginal distribution of visible and infrared\nmodalities may misalign the different identities from two modalities. In this\npaper, we first conduct a theoretic analysis where an interpretable\ngeneralization upper bound is introduced. Based on the analysis, we then\npropose a novel unsupervised cross-modality person re-identification framework\n(PRAISE). Specifically, to address the first challenge, we propose a\npseudo-label correction strategy that utilizes a Beta Mixture Model to predict\nthe probability of mis-clustering based network's memory effect and rectifies\nthe correspondence by adding a perceptual term to contrastive learning. Next,\nwe introduce a modality-level alignment strategy that generates paired\nvisible-infrared latent features and reduces the modality gap by aligning the\nlabeling function of visible and infrared features to learn identity\ndiscriminative and modality-invariant features. Experimental results on two\nbenchmark datasets demonstrate that our method achieves state-of-the-art\nperformance than the unsupervised visible-ReID methods.\n","authors":["Yexin Liu","Weiming Zhang","Athanasios V. Vasilakos","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06683v1.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2404.02065v2","updated":"2024-04-10T01:53:17Z","published":"2024-04-02T16:06:20Z","title":"Multi-Level Label Correction by Distilling Proximate Patterns for\n Semi-supervised Semantic Segmentation","summary":" Semi-supervised semantic segmentation relieves the reliance on large-scale\nlabeled data by leveraging unlabeled data. Recent semi-supervised semantic\nsegmentation approaches mainly resort to pseudo-labeling methods to exploit\nunlabeled data. However, unreliable pseudo-labeling can undermine the\nsemi-supervision processes. In this paper, we propose an algorithm called\nMulti-Level Label Correction (MLLC), which aims to use graph neural networks to\ncapture structural relationships in Semantic-Level Graphs (SLGs) and\nClass-Level Graphs (CLGs) to rectify erroneous pseudo-labels. Specifically,\nSLGs represent semantic affinities between pairs of pixel features, and CLGs\ndescribe classification consistencies between pairs of pixel labels. With the\nsupport of proximate pattern information from graphs, MLLC can rectify\nincorrectly predicted pseudo-labels and can facilitate discriminative feature\nrepresentations. We design an end-to-end network to train and perform this\neffective label corrections mechanism. Experiments demonstrate that MLLC can\nsignificantly improve supervised baselines and outperforms state-of-the-art\napproaches in different scenarios on Cityscapes and PASCAL VOC 2012 datasets.\nSpecifically, MLLC improves the supervised baseline by at least 5% and 2% with\nDeepLabV2 and DeepLabV3+ respectively under different partition protocols.\n","authors":["Hui Xiao","Yuting Hong","Li Dong","Diqun Yan","Jiayan Zhuang","Junjie Xiong","Dongtai Liang","Chengbin Peng"],"pdf_url":"https://arxiv.org/pdf/2404.02065v2.pdf","comment":"12 pages, 8 figures. 
IEEE Transactions on Multimedia, 2024"},{"id":"http://arxiv.org/abs/2301.04218v4","updated":"2024-04-10T01:11:15Z","published":"2023-01-10T21:50:26Z","title":"Leveraging Diffusion For Strong and High Quality Face Morphing Attacks","summary":" Face morphing attacks seek to deceive a Face Recognition (FR) system by\npresenting a morphed image consisting of the biometric qualities from two\ndifferent identities with the aim of triggering a false acceptance with one of\nthe two identities, thereby presenting a significant threat to biometric\nsystems. The success of a morphing attack is dependent on the ability of the\nmorphed image to represent the biometric characteristics of both identities\nthat were used to create the image. We present a novel morphing attack that\nuses a Diffusion-based architecture to improve the visual fidelity of the image\nand the ability of the morphing attack to represent characteristics from both\nidentities. We demonstrate the effectiveness of the proposed attack by\nevaluating its visual fidelity via the Frechet Inception Distance (FID). Also,\nextensive experiments are conducted to measure the vulnerability of FR systems\nto the proposed attack. The ability of a morphing attack detector to detect the\nproposed attack is measured and compared against two state-of-the-art GAN-based\nmorphing attacks along with two Landmark-based attacks. Additionally, a novel\nmetric to measure the relative strength between different morphing attacks is\nintroduced and evaluated.\n","authors":["Zander W. Blasingame","Chen Liu"],"pdf_url":"https://arxiv.org/pdf/2301.04218v4.pdf","comment":"Diffusion Morphs (DiM) paper. Accepted in IEEE TBIOM"},{"id":"http://arxiv.org/abs/2404.05215v2","updated":"2024-04-10T00:49:11Z","published":"2024-04-08T06:07:32Z","title":"Spatio-Temporal Attention and Gaussian Processes for Personalized Video\n Gaze Estimation","summary":" Gaze is an essential prompt for analyzing human behavior and attention.\nRecently, there has been an increasing interest in determining gaze direction\nfrom facial videos. However, video gaze estimation faces significant\nchallenges, such as understanding the dynamic evolution of gaze in video\nsequences, dealing with static backgrounds, and adapting to variations in\nillumination. To address these challenges, we propose a simple and novel deep\nlearning model designed to estimate gaze from videos, incorporating a\nspecialized attention module. Our method employs a spatial attention mechanism\nthat tracks spatial dynamics within videos. This technique enables accurate\ngaze direction prediction through a temporal sequence model, adeptly\ntransforming spatial observations into temporal insights, thereby significantly\nimproving gaze estimation accuracy. Additionally, our approach integrates\nGaussian processes to include individual-specific traits, facilitating the\npersonalization of our model with just a few labeled samples. Experimental\nresults confirm the efficacy of the proposed approach, demonstrating its\nsuccess in both within-dataset and cross-dataset settings. Specifically, our\nproposed approach achieves state-of-the-art performance on the Gaze360 dataset,\nimproving by $2.5^\\circ$ without personalization. Further, by personalizing the\nmodel with just three samples, we achieved an additional improvement of\n$0.8^\\circ$. 
The code and pre-trained models are available at\n\\url{https://github.com/jswati31/stage}.\n","authors":["Swati Jindal","Mohit Yadav","Roberto Manduchi"],"pdf_url":"https://arxiv.org/pdf/2404.05215v2.pdf","comment":"Accepted at CVPR 2024 Gaze workshop"},{"id":"http://arxiv.org/abs/2404.06666v1","updated":"2024-04-10T00:26:08Z","published":"2024-04-10T00:26:08Z","title":"SafeGen: Mitigating Unsafe Content Generation in Text-to-Image Models","summary":" Text-to-image (T2I) models, such as Stable Diffusion, have exhibited\nremarkable performance in generating high-quality images from text descriptions\nin recent years. However, text-to-image models may be tricked into generating\nnot-safe-for-work (NSFW) content, particularly in sexual scenarios. Existing\ncountermeasures mostly focus on filtering inappropriate inputs and outputs, or\nsuppressing improper text embeddings, which can block explicit NSFW-related\ncontent (e.g., naked or sexy) but may still be vulnerable to adversarial\nprompts inputs that appear innocent but are ill-intended. In this paper, we\npresent SafeGen, a framework to mitigate unsafe content generation by\ntext-to-image models in a text-agnostic manner. The key idea is to eliminate\nunsafe visual representations from the model regardless of the text input. In\nthis way, the text-to-image model is resistant to adversarial prompts since\nunsafe visual representations are obstructed from within. Extensive experiments\nconducted on four datasets demonstrate SafeGen's effectiveness in mitigating\nunsafe content generation while preserving the high-fidelity of benign images.\nSafeGen outperforms eight state-of-the-art baseline methods and achieves 99.1%\nsexual content removal performance. Furthermore, our constructed benchmark of\nadversarial prompts provides a basis for future development and evaluation of\nanti-NSFW-generation methods.\n","authors":["Xinfeng Li","Yuchen Yang","Jiangyi Deng","Chen Yan","Yanjiao Chen","Xiaoyu Ji","Wenyuan Xu"],"pdf_url":"https://arxiv.org/pdf/2404.06666v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06665v1","updated":"2024-04-10T00:25:09Z","published":"2024-04-10T00:25:09Z","title":"Deep Generative Data Assimilation in Multimodal Setting","summary":" Robust integration of physical knowledge and data is key to improve\ncomputational simulations, such as Earth system models. Data assimilation is\ncrucial for achieving this goal because it provides a systematic framework to\ncalibrate model outputs with observations, which can include remote sensing\nimagery and ground station measurements, with uncertainty quantification.\nConventional methods, including Kalman filters and variational approaches,\ninherently rely on simplifying linear and Gaussian assumptions, and can be\ncomputationally expensive. Nevertheless, with the rapid adoption of data-driven\nmethods in many areas of computational sciences, we see the potential of\nemulating traditional data assimilation with deep learning, especially\ngenerative models. In particular, the diffusion-based probabilistic framework\nhas large overlaps with data assimilation principles: both allows for\nconditional generation of samples with a Bayesian inverse framework. These\nmodels have shown remarkable success in text-conditioned image generation or\nimage-controlled video synthesis. Likewise, one can frame data assimilation as\nobservation-conditioned state calibration. In this work, we propose SLAMS:\nScore-based Latent Assimilation in Multimodal Setting. 
Specifically, we\nassimilate in-situ weather station data and ex-situ satellite imagery to\ncalibrate the vertical temperature profiles, globally. Through extensive\nablation, we demonstrate that SLAMS is robust even in low-resolution, noisy,\nand sparse data settings. To our knowledge, our work is the first to apply deep\ngenerative framework for multimodal data assimilation using real-world\ndatasets; an important step for building robust computational simulators,\nincluding the next-generation Earth system models. Our code is available at:\nhttps://github.com/yongquan-qu/SLAMS\n","authors":["Yongquan Qu","Juan Nathaniel","Shuolin Li","Pierre Gentine"],"pdf_url":"https://arxiv.org/pdf/2404.06665v1.pdf","comment":"Accepted to CVPR2024 EarthVision"},{"id":"http://arxiv.org/abs/2404.06663v1","updated":"2024-04-10T00:11:03Z","published":"2024-04-10T00:11:03Z","title":"Multi-modal Document Presentation Attack Detection With Forensics Trace\n Disentanglement","summary":" Document Presentation Attack Detection (DPAD) is an important measure in\nprotecting the authenticity of a document image. However, recent DPAD methods\ndemand additional resources, such as manual effort in collecting additional\ndata or knowing the parameters of acquisition devices. This work proposes a\nDPAD method based on multi-modal disentangled traces (MMDT) without the above\ndrawbacks. We first disentangle the recaptured traces by a self-supervised\ndisentanglement and synthesis network to enhance the generalization capacity in\ndocument images with different contents and layouts. Then, unlike the existing\nDPAD approaches that rely only on data in the RGB domain, we propose to\nexplicitly employ the disentangled recaptured traces as new modalities in the\ntransformer backbone through adaptive multi-modal adapters to fuse RGB/trace\nfeatures efficiently. Visualization of the disentangled traces confirms the\neffectiveness of the proposed method in different document contents. Extensive\nexperiments on three benchmark datasets demonstrate the superiority of our MMDT\nmethod on representing forensic traces of recapturing distortion.\n","authors":["Changsheng Chen","Yongyi Deng","Liangwei Lin","Zitong Yu","Zhimao Lai"],"pdf_url":"https://arxiv.org/pdf/2404.06663v1.pdf","comment":"Accepted to ICME 2024"},{"id":"http://arxiv.org/abs/2404.06661v1","updated":"2024-04-10T00:05:55Z","published":"2024-04-10T00:05:55Z","title":"Efficient Denoising using Score Embedding in Score-based Diffusion\n Models","summary":" It is well known that training a denoising score-based diffusion models\nrequires tens of thousands of epochs and a substantial number of image data to\ntrain the model. In this paper, we propose to increase the efficiency in\ntraining score-based diffusion models. Our method allows us to decrease the\nnumber of epochs needed to train the diffusion model. We accomplish this by\nsolving the log-density Fokker-Planck (FP) Equation numerically to compute the\nscore \\textit{before} training. The pre-computed score is embedded into the\nimage to encourage faster training under slice Wasserstein distance.\nConsequently, it also allows us to decrease the number of images we need to\ntrain the neural network to learn an accurate score. We demonstrate through our\nnumerical experiments the improved performance of our proposed method compared\nto standard score-based diffusion models. Our proposed method achieves a\nsimilar quality to the standard method meaningfully faster.\n","authors":["Andrew S. Na","William Gao","Justin W. 
L. Wan"],"pdf_url":"https://arxiv.org/pdf/2404.06661v1.pdf","comment":null}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 0000000000000000000000000000000000000000..7f5166c7afa0cda370aafaf91ba8d66cdeff74e5 GIT binary patch literal 15086 zcmeHO33yaRwyq9g34stc4G>5efuJa|8H50X3WB1d4n)8Z5JVvktB7$yKtv!Kb`lau zdP!$Nc9B&OSrq|+r=TLJ;K~kR2@vF;|J<9Kba%RAH(}4!8Q=syhFvI(6#Q zsX{4}Dx;b;Q+$T2oQ6t8Dy7213w{SH^#k7p^C{m4`z!S>3p8dKR#E*)@?QIEpg)}c zg{r6iyXi3T|KFt>>TsDWWe%JEGeI;m)t_ z#K0v35xKK9{I2==#4YqlUA%3Xh?MkH#4d|P#d8&Xs_&s!ylR8}jrLpHV^;bsWZSYa zH!TUx_B8jZuJHA{>W61Pj6tR~6PcNrNKa44y2nHqn=Z;*^b+JZFPz5nh)ES%qX=#z&q(>zlfC!?&@YSrplEEcre-mb8` zuXubZFMZU1X@6u`GYT;qc;sp5g4h(J-Ri$pM?zXcp{_ZWm%PAnhXUA$>^4n&yX>&2hmV*Ry0tB zDc2*Jr;1N^DfmP%o?7)3+HY@<`rp+@r%jzP%QCA_hD^#Z(KZo(JM=fG&)C8vq&l}j zJwF&~<0nw3PebMBv;9AHx_+HHM>2k2y$bcquTUQ>g6h^CsupeGV1<^;S|Zt!?1a7Z z#?J81^LLBW9d_fL={n^r&=JYsxAQ)IUfZ*@bwK9T6E8jvRhMmd?FO}(eVmu4AjvwHnm*c+SZaI_{G2dio}E$k=@I4b)RlA>{TDjCf@P9^3fXP4&AYX4kyP&|*&u zL_Z&m!0N<4WeU`#OS-PO!zb88j|}~hyr;2|GQa;071I}ibplcL)3QG6j4NJuzfD_B zC=*%^D*iPcyDJ`}Kd)TT$K}u=sD1mO_U@%EJplFd&rlaHy4N$2?^n)?N2!-$3x0El zpcL=UvTj#WH?}W2Bm5luU4G~0LM>eiHE+x9X!aoTcDDYxjsAu_3X1y$Bwy|VK(IpqYlX$aVwJaeLK?Nm$>FoH>BT1SA+z+ zU=vI)(6%2wtjj0%MZP(b^zV%uI__S*bQz3sFxr#nAAdmI&v6?@p6=F4Uu9a$c0#M_ zYms0O{AbSSdL;Y>%a9?ohv$p;r=yM;d1*uX{*gzhC!9-CPjph+lrr*t+6=DYwBtv~ zyR_+Lw$R}Ly?yB);gOO8)vups_cUD>9g)5^F#gq3Fz(vLe!d?nI$Cc_+LU_I&i?)M zIch^KE+zVltlyC|I^G%I@#DH)3(vSXsLPkV$8N|bbwuZ+4Uu2klyA~U=gyHYb#inm z@gHOTMt<~hZ2FpM@D?7T<1$AF4AA)*-@JVaMzK}WhO}jjts%pUoNtelKmDVdPWxH2 zUPby@A3Nh09x~3tJ0qiLUVDpO%84zIy3&TL?uk4TCquO+|J<8Kuls0W!BE?_7q^=R zR>LM4G8ykZJsq(+)^#i|_{CpsPVA>jf&X*X4WnPYb(?4W24BGk+NDH$2J`E zf`8mZuHWQ;fpoJ;{E)k7hv&^N8NXnQkB3QdfAN5etrc7{H)-GHn^uNpi|M>0e#!TL zUf<(*vuE`rpX~9(?-?@G**>`PqAzOd-d)F5zrMZ>JLy9R#yRq)UYk01*0I&Dt@{+N_~~bu_)WvlvL5G&ri-7^ z->MF^<`&@J!8Sr^LszWytV3LjOg($**cvs`*CSW_T%%0(R| z7fJph+p5D@i1_zn8yvAoUifk!XnKY+uH-m5_PtS7-tn7OM)r*E%1GHa#zNgmoALcE z#4q!>Kk2^KMLx2D%Xms3%l^vv?dd6HJdB}Qx1PEh0yX;DK9ZoqOLJHETi*8l?M+!q*#o zC6zI-cj$nK1`W|ZyFL7`-F)mM@LV85j)p*DxN;%mZkENXqy&csb zsD_E}O+J!}{T|s1i|rJAB99{pc8M?U{DQvC+q9AQaBsmro~7V_${$@ebz$tS zD0VAxxY*^fnl6;TP##r}S4F+U^$_|)D9TLDq~u-ur`m{J5sTMs zuVdouiO8@OoM~_@pI-BHF}c0L>wncBZML_?6w4IMOrMFU?NJb4z)d@OeCL^Ns63wI}D zBv}ESDELfpe#mbj`F_{^oZh>Z^W}G7ZeV`Y?soZMN5fs)bg~^4FI2?vWy3K&V>+50 zU`*>4B=PI|ujbte-Xj>la6GD=q@VASBzQbE(Mou3Mu)rQ&jISBtLywuzSE(YL* zRWHzQDO&Hv*NR_EzfbO-(Ql3ZI2xH2{XUVbt8zbPa`t3YJ<0PO*Cc+F#GZd3NgVut zNOB${@er3J{oZq9PuOfW&DRq@LnzyljUgWmDclQ1?vKPg+dRz&)b3@`ACx*V>tj&< zQh{G_jjc<}=nbyZ(YJA*)|_Wd4{~4Ik$L+3y{kb@W`7DE$|U3Y$hJq3Zlm8!pKa`- zv1q-oHJX3j0-ZkZ?7)E-Ue)=q) z1McS8?eE9wOY)5BEYBN$`Hivk&!HwUHvc$vb{y}ht!;-idz!|3=!&7Jc7pgyNTLgZ zL&}654a0gZHLP z#fT3_pz0|%<5&U~4Z|;Ccy3ZZ)RDUXWm zuW1r%$T_63N0z;(oDoczz`f8=o@319zR0eVoPCet-frwzJsvA%1%sSn_Upy_AU<-J z`Se6X?t8y0>UX)zFl-nU@1AMxy7s@^n~*a*Gj(PbecRHl3 z)RAxUQWpboZ1o5#BsAxUFp)gfC+3&x=I=7_IiV()^N6pLIfV24e(_kM$kW2W1{+TyObG z%18SuI2`rMz##ABo2)s!CwhC^NWA&#Ye>jRK*XU4w{bZGSA|Oz&~HsYEykEkKg?pY zXufJvMiN@@Z_T@=EZKu=$l!rI`}>%4?++b|p?{Y+CP#nP?M%$mP|pP*d{r3U&v{>q zi@lfm9`4_JKPsK8gz6`1fO`u_9Kqm!&w+bjW;{vaQU+P+dv)2-r6{&=nx(CzzLj}D zrv-UL^G+<+sG)xM6 z?TJFFEmf0C{C}YwOAfki>*iPzQOx^KY$zohMj$PFW zCBJEte(!S2disF0r>^Ov+jX9@#tC1!$1G3B{B^EFJGawD(vNmcn=A5eJ=^~ChPFSD z`yJZd2HtPb^0MEMZ)+A3XVDr_*beuF%KQ@h`>O7b$(~E@NRv#Gm$SfJ`dkb8=(^#^ zpZemTUj}!jMxoMTuBTTxc8|>VvOmu-_V5+=E%E4@&Z01YYUJsU+fLnv2}>u?uI6Cm+L8KM zAc2-+N1Mj9EDb0eKE% zBWGqVy3+V)V + + + + MyArxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Robotics 33 + +
+
+
+ + ☆ Reward Learning from Suboptimal Demonstrations with Applications in + Surgical Electrocautery + + +
+ Automating robotic surgery via learning from demonstration (LfD) techniques +is extremely challenging. This is because surgical tasks often involve +sequential decision-making processes with complex interactions of physical +objects and have low tolerance for mistakes. Prior works assume that all +demonstrations are fully observable and optimal, which might not be practical +in the real world. This paper introduces a sample-efficient method that learns +a robust reward function from a limited amount of ranked suboptimal +demonstrations consisting of partial-view point cloud observations. The method +then learns a policy by optimizing the learned reward function using +reinforcement learning (RL). We show that using a learned reward function to +obtain a policy is more robust than pure imitation learning. We apply our +approach on a physical surgical electrocautery task and demonstrate that our +method can perform well even when the provided demonstrations are suboptimal +and the observations are high-dimensional point clouds. + +
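The abstract does not spell out the reward-learning objective, but a common way to learn a reward from ranked suboptimal demonstrations is a pairwise, Bradley-Terry style ranking loss over trajectory returns (as in T-REX). The sketch below is a minimal PyTorch illustration under that assumption; the network size, observation dimension, and data are placeholders, not the paper's architecture.

```python
import torch
import torch.nn as nn

class RewardNet(nn.Module):
    """Maps an observation (e.g. a flattened point-cloud feature) to a scalar reward."""
    def __init__(self, obs_dim: int, hidden: int = 128):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 1),
        )

    def forward(self, obs):            # obs: (T, obs_dim)
        return self.net(obs).sum()     # per-trajectory return

def ranking_loss(reward_net, traj_worse, traj_better):
    """Bradley-Terry style loss: the better-ranked trajectory should receive a higher return."""
    r_w = reward_net(traj_worse)
    r_b = reward_net(traj_better)
    logits = torch.stack([r_w, r_b]).unsqueeze(0)      # shape (1, 2)
    return nn.functional.cross_entropy(logits, torch.tensor([1]))

# Usage with random stand-ins for two ranked demonstrations.
obs_dim = 64
net = RewardNet(obs_dim)
opt = torch.optim.Adam(net.parameters(), lr=1e-4)
worse, better = torch.randn(50, obs_dim), torch.randn(60, obs_dim)
loss = ranking_loss(net, worse, better)
opt.zero_grad(); loss.backward(); opt.step()
```

The learned reward would then be handed to an off-the-shelf RL algorithm, as the abstract describes.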
+
+ comment: In proceedings of the International Symposium on Medical Robotics + (ISMR) 2024. Equal contribution from two first authors +
+
+
+
+
+ + ☆ Using Neural Networks to Model Hysteretic Kinematics in Tendon-Actuated + Continuum Robots + + +
+ The ability to accurately model mechanical hysteretic behavior in +tendon-actuated continuum robots using deep learning approaches is a growing +area of interest. In this paper, we investigate the hysteretic response of two +types of tendon-actuated continuum robots and, ultimately, compare three types +of neural network modeling approaches with both forward and inverse kinematic +mappings: feedforward neural network (FNN), FNN with a history input buffer, +and long short-term memory (LSTM) network. We seek to determine which model +best captures temporal dependent behavior. We find that, depending on the +robot's design, choosing different kinematic inputs can alter whether +hysteresis is exhibited by the system. Furthermore, we present the results of +the model fittings, revealing that, in contrast to the standard FNN, both FNN +with a history input buffer and the LSTM model exhibit the capacity to model +historical dependence with comparable performance in capturing rate-dependent +hysteresis. + +
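As a concrete reference point for the sequence-model variant, here is a minimal PyTorch sketch of an LSTM forward-kinematics regressor that maps a history of tendon displacements to the current tip position. The number of tendons, hidden size, and output dimension are illustrative assumptions, not the paper's configuration.

```python
import torch
import torch.nn as nn

class HysteresisLSTM(nn.Module):
    """Sequence model mapping a history of tendon displacements to the current tip position."""
    def __init__(self, n_tendons: int = 4, hidden: int = 64, out_dim: int = 3):
        super().__init__()
        self.lstm = nn.LSTM(input_size=n_tendons, hidden_size=hidden, batch_first=True)
        self.head = nn.Linear(hidden, out_dim)

    def forward(self, tendon_seq):          # (batch, T, n_tendons)
        out, _ = self.lstm(tendon_seq)
        return self.head(out[:, -1])        # predict the pose at the last time step

model = HysteresisLSTM()
batch = torch.randn(8, 100, 4)              # 8 loading/unloading sequences, 100 steps each
pred_xyz = model(batch)                      # (8, 3)
```

The FNN-with-history variant mentioned in the abstract would instead flatten the same window into a single input vector.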
+
+ comment: 7 pages, 8 figures, conference +
+
+
+
+
+ + ☆ CBFKIT: A Control Barrier Function Toolbox for Robotics Applications + + +
+ This paper introduces CBFKit, a Python/ROS toolbox for safe robotics planning +and control under uncertainty. The toolbox provides a general framework for +designing control barrier functions for mobility systems within both +deterministic and stochastic environments. It can be connected to the ROS +open-source robotics middleware, allowing for the setup of multi-robot +applications, encoding of environments and maps, and integrations with +predictive motion planning algorithms. Additionally, it offers multiple CBF +variations and algorithms for robot control. The CBFKit is demonstrated on the +Toyota Human Support Robot (HSR) in both simulation and in physical +experiments. + +
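CBFKit's own API is not reproduced here; the following is a minimal numpy sketch of the standard control-barrier-function safety-filter idea the toolbox builds on, for a single-integrator robot and one circular obstacle, where the one-constraint QP has a closed-form solution. Function and variable names are illustrative.

```python
import numpy as np

def cbf_filter(x, u_nom, x_obs, radius, alpha=1.0):
    """Minimally modify u_nom so that the single-integrator state x stays outside a
    circular obstacle, enforcing dh/dt + alpha * h >= 0 with h = ||x - x_obs||^2 - r^2."""
    h = np.dot(x - x_obs, x - x_obs) - radius**2
    grad_h = 2.0 * (x - x_obs)
    slack = grad_h @ u_nom + alpha * h        # constraint: grad_h . u >= -alpha * h
    if slack >= 0.0:
        return u_nom                          # nominal command is already safe
    # Project u_nom onto the boundary of the safe half-space (closed-form 1-constraint QP).
    return u_nom - slack * grad_h / (grad_h @ grad_h)

x = np.array([1.0, 0.2])           # robot position
u_nominal = np.array([-1.0, 0.0])  # nominal velocity command heading toward the obstacle
u_safe = cbf_filter(x, u_nominal, x_obs=np.array([0.0, 0.0]), radius=0.5)
```

For richer dynamics or stochastic settings, the constraint would be posed as a QP with a generic solver, which is the kind of machinery a toolbox like CBFKit packages up.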
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ Wild Visual Navigation: Fast Traversability Learning via Pre-Trained + Models and Online Self-Supervision + + +
+ Natural environments such as forests and grasslands are challenging for robotic navigation because high grass, twigs, or bushes are falsely perceived as rigid obstacles. In this work, we present Wild Visual Navigation (WVN), an online self-supervised learning system for visual traversability estimation. The system continuously adapts from a short human demonstration in the field, using only onboard sensing and computing. One of the key ideas to achieve this is the use of high-dimensional features from pre-trained self-supervised models, which implicitly encode semantic information that massively simplifies the learning task. Further, an online supervision-generation scheme enables concurrent training and inference of the learned model in the wild. We demonstrate our approach through diverse real-world deployments in forests, parks, and grasslands. Our system is able to bootstrap traversable terrain segmentation in less than 5 min of in-field training time, enabling the robot to navigate complex, previously unseen outdoor terrains. Code: https://bit.ly/498b0CV - Project page: https://bit.ly/3M6nMHH
+
+ comment: Extended version of arXiv:2305.08510 +
+
+
+
+
+ + ☆ LaPlaSS: Latent Space Planning for Stochastic Systems + + +
+ Autonomous mobile agents often operate in hazardous environments, necessitating an awareness of safety. These agents can have non-linear, stochastic dynamics that must be considered during planning to guarantee bounded risk. Most state-of-the-art methods require closed-form dynamics to verify plan correctness and safety; however, modern robotic systems often have dynamics that are learned from data. Thus, there is a need to perform efficient trajectory planning with guarantees on risk for agents without known dynamics models. We propose a "generate-and-test" approach to risk-bounded planning in which a planner generates a candidate trajectory using an approximate linear dynamics model and a validator assesses the risk of the trajectory, computing additional safety constraints for the planner if the candidate does not satisfy the desired risk bound. To acquire the approximate model, we use a variational autoencoder (VAE) to learn a latent linear dynamics model and encode the planning problem into the latent space to generate the candidate trajectory. The VAE also serves to sample trajectories around the candidate for use in the validator. We demonstrate that our algorithm, LaPlaSS, is able to generate trajectory plans with bounded risk for a real-world agent with learned dynamics and is an order of magnitude more efficient than the state of the art.
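A toy sketch of the generate-and-test loop described above, with numpy stand-ins: a naive latent-space "planner" that rolls out a linear model, and a Monte-Carlo validator that estimates risk by perturbing the candidate. The actual LaPlaSS planner, VAE, and constraint-generation step are substantially more sophisticated; everything named here is a placeholder.

```python
import numpy as np

def plan_candidate(z0, z_goal, A, B, horizon=20):
    """Greedy latent-space rollout with the linear model z' = A z + B u
    (a crude stand-in for a proper trajectory optimizer)."""
    traj, z = [z0], z0.copy()
    for _ in range(horizon):
        u, *_ = np.linalg.lstsq(B, z_goal - A @ z, rcond=None)
        z = A @ z + B @ u
        traj.append(z)
    return np.array(traj)

def estimate_risk(traj, sample_fn, is_unsafe, n_samples=200):
    """Monte-Carlo risk: fraction of perturbed trajectories that enter the unsafe set."""
    hits = sum(is_unsafe(sample_fn(traj)) for _ in range(n_samples))
    return hits / n_samples

rng = np.random.default_rng(0)
A, B = 0.95 * np.eye(2), np.eye(2)                          # toy latent linear dynamics
candidate = plan_candidate(np.zeros(2), np.array([2.0, 1.0]), A, B)
risk = estimate_risk(
    candidate,
    sample_fn=lambda t: t + rng.normal(scale=0.05, size=t.shape),
    is_unsafe=lambda t: bool(np.any(np.linalg.norm(t - np.array([1.0, 0.5]), axis=1) < 0.1)),
)
replan_needed = risk > 0.05   # the real validator would also emit new safety constraints
```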
+
+
+
+
+ + ☆ Deep Reinforcement Learning for Mobile Robot Path Planning + + +
+ Path planning is an important problem with applications in many areas, such as video games and robotics. This paper proposes a novel Deep Reinforcement Learning (DRL) based path planning method for a mobile robot. We design the DRL-based algorithms, including the reward functions and parameter optimization, to avoid time-consuming manual work in a 2D environment. We also design a two-way search hybrid A* algorithm to improve the quality of local path planning. We then transfer the designed algorithm to a simple embedded environment to test its computational load when running on a mobile robot. Experiments show that, when deployed on a robot platform, the DRL-based algorithm in this article achieves better planning results while consuming fewer computing resources.
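The paper's exact reward terms are not given in the abstract; below is a hedged sketch of the kind of shaped reward commonly used for DRL path planning (progress toward the goal, collision penalty, step cost, arrival bonus). All constants and names are illustrative.

```python
def path_planning_reward(prev_dist, new_dist, collided, reached,
                         progress_gain=5.0, step_penalty=0.01):
    """Illustrative shaped reward: reward progress toward the goal, penalise
    collisions and wasted steps, and give a terminal bonus on arrival."""
    if collided:
        return -10.0
    if reached:
        return 10.0
    progress = prev_dist - new_dist   # positive when the robot moved closer to the goal
    return progress_gain * progress - step_penalty

# Example: the robot moved from 3.2 m to 3.0 m from the goal without colliding.
r = path_planning_reward(prev_dist=3.2, new_dist=3.0, collided=False, reached=False)
```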
+
+
+
+
+ + ☆ Robotic Learning for Adaptive Informative Path Planning + + +
+ Adaptive informative path planning (AIPP) is important to many robotics +applications, enabling mobile robots to efficiently collect useful data about +initially unknown environments. In addition, learning-based methods are +increasingly used in robotics to enhance adaptability, versatility, and +robustness across diverse and complex tasks. Our survey explores research on +applying robotic learning to AIPP, bridging the gap between these two research +fields. We begin by providing a unified mathematical framework for general AIPP +problems. Next, we establish two complementary taxonomies of current work from +the perspectives of (i) learning algorithms and (ii) robotic applications. We +explore synergies, recent trends, and highlight the benefits of learning-based +methods in AIPP frameworks. Finally, we discuss key challenges and promising +future directions to enable more generally applicable and robust robotic +data-gathering systems through learning. We provide a comprehensive catalogue +of papers reviewed in our survey, including publicly available repositories, to +facilitate future studies in the field. + +
+
+ comment: 22 pages, 1 figure +
+
+
+
+
+ + Gaussian-LIC: Photo-realistic LiDAR-Inertial-Camera SLAM with 3D + Gaussian Splatting IROS 2024 + + +
+ We present a real-time LiDAR-Inertial-Camera SLAM system with 3D Gaussian +Splatting as the mapping backend. Leveraging robust pose estimates from our +LiDAR-Inertial-Camera odometry, Coco-LIC, an incremental photo-realistic +mapping system is proposed in this paper. We initialize 3D Gaussians from +colorized LiDAR points and optimize them using differentiable rendering powered +by 3D Gaussian Splatting. Meticulously designed strategies are employed to +incrementally expand the Gaussian map and adaptively control its density, +ensuring high-quality mapping with real-time capability. Experiments conducted +in diverse scenarios demonstrate the superior performance of our method +compared to existing radiance-field-based SLAM systems. + +
+
+ comment: Submitted to IROS 2024 +
+
+
+
+
+ + ☆ Vision-Language Model-based Physical Reasoning for Robot Liquid + Perception IROS 2024 + + +
+ There is a growing interest in applying large language models (LLMs) in +robotic tasks, due to their remarkable reasoning ability and extensive +knowledge learned from vast training corpora. Grounding LLMs in the physical +world remains an open challenge as they can only process textual input. Recent +advancements in large vision-language models (LVLMs) have enabled a more +comprehensive understanding of the physical world by incorporating visual +input, which provides richer contextual information than language alone. In +this work, we proposed a novel paradigm that leveraged GPT-4V(ision), the +state-of-the-art LVLM by OpenAI, to enable embodied agents to perceive liquid +objects via image-based environmental feedback. Specifically, we exploited the +physical understanding of GPT-4V to interpret the visual representation (e.g., +time-series plot) of non-visual feedback (e.g., F/T sensor data), indirectly +enabling multimodal perception beyond vision and language using images as +proxies. We evaluated our method using 10 common household liquids with +containers of various geometry and material. Without any training or +fine-tuning, we demonstrated that our method can enable the robot to indirectly +perceive the physical response of liquids and estimate their viscosity. We also +showed that by jointly reasoning over the visual and physical attributes +learned through interactions, our method could recognize liquid objects in the +absence of strong visual cues (e.g., container labels with legible text or +symbols), increasing the accuracy from 69.0% -- achieved by the best-performing +vision-only variant -- to 86.0%. + +
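One concrete piece of the pipeline described above is turning non-visual feedback (e.g. F/T readings) into an image that a vision-language model can consume. The sketch below covers only that proxy step with matplotlib; the prompt construction and the GPT-4V call itself are omitted, and all names are illustrative.

```python
import io
import base64
import numpy as np
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

def ft_series_to_image_b64(force_z, dt=0.01):
    """Render a force/torque time series as a plot image (base64 PNG), so it can be sent
    to a vision-language model as an image-based proxy for non-visual sensor feedback."""
    t = np.arange(len(force_z)) * dt
    fig, ax = plt.subplots(figsize=(4, 3))
    ax.plot(t, force_z)
    ax.set_xlabel("time [s]")
    ax.set_ylabel("F_z [N]")
    buf = io.BytesIO()
    fig.savefig(buf, format="png")
    plt.close(fig)
    return base64.b64encode(buf.getvalue()).decode()

img_b64 = ft_series_to_image_b64(np.sin(np.linspace(0, 10, 500)))  # fake F/T signal
# img_b64 would then be attached to a prompt asking the model about the liquid's behaviour.
```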
+
+ comment: 8 pages, 6 figures, submitted to IROS 2024 +
+
+
+
+
+ + ☆ Sound Matters: Auditory Detectability of Mobile Robots + + +
+ Mobile robots are increasingly being used in noisy environments for social purposes, e.g. to provide support in healthcare or public spaces. Since these robots also operate beyond human sight, the question arises as to how different robot types, ambient noise, or cognitive engagement impact the detection of the robots by their sound. To address this research gap, we conducted a user study measuring auditory detection distances for a wheeled robot (Turtlebot 2i) and a quadruped robot (Unitree Go 1), which emit different consequential sounds when moving. We also manipulated background noise levels and participants' engagement in a secondary task during the study. Our results showed that the quadruped robot's sound was detected significantly better (i.e., at a larger distance) than the wheeled robot's, which demonstrates that the movement mechanism has a meaningful impact on auditory detectability. The detectability of both robots diminished significantly as background noise increased, but even in high background noise, participants detected the quadruped robot at a significantly larger distance. Engagement in a secondary task had hardly any impact. In essence, these findings highlight the critical role of the distinctive auditory characteristics of different robots in enabling smooth, human-centered navigation of mobile robots in noisy environments.
+
+
+
+
+ + ☆ Beyond Gait: Learning Knee Angle for Seamless Prosthesis Control in + Multiple Scenarios + + +
+ Deep learning models have become a powerful tool in knee angle estimation for +lower limb prostheses, owing to their adaptability across various gait phases +and locomotion modes. Current methods utilize Multi-Layer Perceptrons (MLP), +Long-Short Term Memory Networks (LSTM), and Convolutional Neural Networks +(CNN), predominantly analyzing motion information from the thigh. Contrary to +these approaches, our study introduces a holistic perspective by integrating +whole-body movements as inputs. We propose a transformer-based probabilistic +framework, termed the Angle Estimation Probabilistic Model (AEPM), that offers +precise angle estimations across extensive scenarios beyond walking. AEPM +achieves an overall RMSE of 6.70 degrees, with an RMSE of 3.45 degrees in +walking scenarios. Compared to the state of the art, AEPM has improved the +prediction accuracy for walking by 11.31%. Our method can achieve seamless +adaptation between different locomotion modes. Also, this model can be utilized +to analyze the synergy between the knee and other joints. We reveal that the +whole body movement has valuable information for knee movement, which can +provide insights into designing sensors for prostheses. The code is available +at https://github.com/penway/Beyond-Gait-AEPM. + +
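For orientation only, here is a minimal PyTorch sketch of a transformer encoder that regresses a knee angle from a window of whole-body joint positions; it is deterministic and uses assumed dimensions, so it is not the AEPM probabilistic architecture described above.

```python
import torch
import torch.nn as nn

class KneeAngleEstimator(nn.Module):
    """Illustrative transformer regressor from whole-body joint motion to knee angle."""
    def __init__(self, n_joints: int = 17, d_model: int = 64, n_layers: int = 2):
        super().__init__()
        self.embed = nn.Linear(n_joints * 3, d_model)
        layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=4, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers=n_layers)
        self.head = nn.Linear(d_model, 1)

    def forward(self, motion):                        # (batch, T, n_joints * 3)
        feats = self.encoder(self.embed(motion))
        return self.head(feats[:, -1]).squeeze(-1)    # knee angle at the last frame

model = KneeAngleEstimator()
angles = model(torch.randn(4, 60, 17 * 3))            # 4 sequences of 60 frames each
```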
+
+ comment: 8 pages, 6 figures, This work has been submitted to the IEEE-RAL for + possible publication +
+
+
+
+
+ + ☆ Toward Holistic Planning and Control Optimization for Dual-Arm + Rearrangement + + +
+ Long-horizon task and motion planning (TAMP) is notoriously difficult to solve, let alone optimally, due to the tight coupling between the interleaved (discrete) task and (continuous) motion planning phases, where each phase on its own is frequently an NP-hard or even PSPACE-hard computational challenge. In this study, we tackle the even more challenging goal of jointly optimizing task and motion plans for a real dual-arm system in which the two arms operate in close vicinity to solve highly constrained tabletop multi-object rearrangement problems. To that end, we construct a tightly integrated planning and control optimization pipeline, the Makespan-Optimized Dual-Arm Planner (MODAP), which combines novel sampling techniques for task planning with state-of-the-art trajectory optimization techniques. Compared to the previous state of the art, MODAP produces task and motion plans that better coordinate a dual-arm system, delivering significantly shorter execution times while simultaneously ensuring that the resulting time-parameterized trajectory conforms to specified acceleration and jerk limits.
+
+ comment: First three authors made equal contributions to this study +
+
+
+
+
+ + ☆ Designing Fluid-Exuding Cartilage for Biomimetic Robots Mimicking Human + Joint Lubrication Function + + +
+ The human joint is an open-type joint composed of bones, cartilage, +ligaments, synovial fluid, and joint capsule, having advantages of flexibility +and impact resistance. However, replicating this structure in robots introduces +friction challenges due to the absence of bearings. To address this, our study +focuses on mimicking the fluid-exuding function of human cartilage. We employ a +rubber-based 3D printing technique combined with absorbent materials to create +a versatile and easily designed cartilage sheet for biomimetic robots. We +evaluate both the fluid-exuding function and friction coefficient of the +fabricated flat cartilage sheet. Furthermore, we practically create a piece of +curved cartilage and an open-type biomimetic ball joint in combination with +bones, ligaments, synovial fluid, and joint capsule to demonstrate the utility +of the proposed cartilage sheet in the construction of such joints. + +
+
+ comment: Accepted at RoboSoft2024 +
+
+
+
+
+ + ☆ Enhancing Safety in Mixed Traffic: Learning-Based Modeling and Efficient + Control of Autonomous and Human-Driven Vehicles + + +
+ With the increasing presence of autonomous vehicles (AVs) on public roads, +developing robust control strategies to navigate the uncertainty of +human-driven vehicles (HVs) is crucial. This paper introduces an advanced +method for modeling HV behavior, combining a first-principles model with +Gaussian process (GP) learning to enhance velocity prediction accuracy and +provide a measurable uncertainty. We validated this innovative HV model using +real-world data from field experiments and applied it to develop a GP-enhanced +model predictive control (GP-MPC) strategy. This strategy aims to improve +safety in mixed vehicle platoons by integrating uncertainty assessment into +distance constraints. Comparative simulation studies with a conventional model +predictive control (MPC) approach demonstrated that our GP-MPC strategy ensures +more reliable safe distancing and fosters efficient vehicular dynamics, +achieving notably higher speeds within the platoon. By incorporating a sparse +GP technique in HV modeling and adopting a dynamic GP prediction within the MPC +framework, we significantly reduced the computation time of GP-MPC, marking it +only 4.6% higher than that of the conventional MPC. This represents a +substantial improvement, making the process about 100 times faster than our +preliminary work without these approximations. Our findings underscore the +effectiveness of learning-based HV modeling in enhancing both safety and +operational efficiency in mixed-traffic environments, paving the way for more +harmonious AV-HV interactions. + +
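A hedged sketch of the GP part of the idea: fit a Gaussian process to the residual between the measured human-driven-vehicle velocity and a first-principles prediction, then use the predictive standard deviation to tighten a distance constraint. The feature choice, kernel, and constraint form are assumptions for illustration, using scikit-learn.

```python
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

# Toy stand-in data: features could be (own speed, gap, leader speed); the target is the
# residual between the measured HV velocity and a first-principles (car-following) model.
rng = np.random.default_rng(0)
X = rng.uniform([5, 10, 5], [20, 50, 20], size=(200, 3))
residual = 0.3 * np.sin(0.2 * X[:, 1]) + rng.normal(scale=0.05, size=200)

gp = GaussianProcessRegressor(kernel=RBF(length_scale=[5.0, 10.0, 5.0]) + WhiteKernel(),
                              normalize_y=True)
gp.fit(X, residual)

x_query = np.array([[12.0, 25.0, 11.0]])
mean, std = gp.predict(x_query, return_std=True)

# The MPC distance constraint can then be tightened by a multiple of the predicted std,
# e.g. d_safe = d_nominal + 2 * std, so higher uncertainty enforces a larger gap.
d_nominal = 10.0
d_safe = d_nominal + 2.0 * std[0]
```

A sparse GP and a dynamic prediction horizon, as the abstract notes, are what keep this tractable inside a real-time MPC loop.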
+
+ comment: in IEEE Transactions on Intelligent Transportation Systems, 2024 +
+
+
+
+
+ + ☆ A Data Efficient Framework for Learning Local Heuristics + + +
+ With the advent of machine learning, there have been several recent attempts +to learn effective and generalizable heuristics. Local Heuristic A* (LoHA*) is +one recent method that instead of learning the entire heuristic estimate, +learns a "local" residual heuristic that estimates the cost to escape a region +(Veerapaneni et al 2023). LoHA*, like other supervised learning methods, +collects a dataset of target values by querying an oracle on many planning +problems (in this case, local planning problems). This data collection process +can become slow as the size of the local region increases or if the domain +requires expensive collision checks. Our main insight is that when an A* search +solves a start-goal planning problem it inherently ends up solving multiple +local planning problems. We exploit this observation to propose an efficient +data collection framework that does <1/10th the amount of work (measured by +expansions) to collect the same amount of data in comparison to baselines. This +idea also enables us to run LoHA* in an online manner where we can iteratively +collect data and improve our model while solving relevant start-goal tasks. We +demonstrate the performance of our data collection and online framework on a 4D +$(x, y, \theta, v)$ navigation domain. + +
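The core data-efficiency insight (one search yields many labels) can be illustrated with a toy grid example: a single backward Dijkstra from the goal produces an optimal cost-to-go target for every expanded cell, instead of querying an oracle once per training state. Note that this simplification labels global cost-to-go, whereas LoHA* learns local escape costs; the grid and names are placeholders.

```python
import heapq

def dijkstra_costs(grid, goal):
    """One backward Dijkstra from the goal yields the optimal cost-to-go for every free
    cell, i.e. many (state, target) training pairs from a single search."""
    rows, cols = len(grid), len(grid[0])
    dist = {goal: 0.0}
    pq = [(0.0, goal)]
    while pq:
        d, (r, c) = heapq.heappop(pq)
        if d > dist.get((r, c), float("inf")):
            continue
        for dr, dc in ((1, 0), (-1, 0), (0, 1), (0, -1)):
            nr, nc = r + dr, c + dc
            if 0 <= nr < rows and 0 <= nc < cols and grid[nr][nc] == 0:
                nd = d + 1.0
                if nd < dist.get((nr, nc), float("inf")):
                    dist[(nr, nc)] = nd
                    heapq.heappush(pq, (nd, (nr, nc)))
    return dist  # {cell: optimal cost-to-go}, usable as supervised targets

grid = [[0, 0, 0],
        [0, 1, 0],
        [0, 0, 0]]          # 1 = obstacle
targets = dijkstra_costs(grid, goal=(2, 2))
```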
+
+ comment: Accepted in the 17th International Symposium on Combinatorial Search + (SoCS 2024) +
+
+
+
+
+ + ☆ Fast and Accurate Relative Motion Tracking for Two Industrial Robots + + +
+ Industrial robotic applications such as spraying, welding, and additive +manufacturing frequently require fast, accurate, and uniform motion along a 3D +spatial curve. To increase process throughput, some manufacturers propose a +dual-robot setup to overcome the speed limitation of a single robot. Industrial +robot motion is programmed through waypoints connected by motion primitives +(Cartesian linear and circular paths and linear joint paths at constant +Cartesian speed). The actual robot motion is affected by the blending between +these motion primitives and the pose of the robot (an outstretched/close to +singularity pose tends to have larger path-tracking errors). Choosing the +waypoints and the speed along each motion segment to achieve the performance +requirement is challenging. At present, there is no automated solution, and +laborious manual tuning by robot experts is needed to approach the desired +performance. In this paper, we present a systematic three-step approach to +designing and programming a dual-robot system to optimize system performance. +The first step is to select the relative placement between the two robots based +on the specified relative motion path. The second step is to select the +relative waypoints and the motion primitives. The final step is to update the +waypoints iteratively based on the actual relative motion. Waypoint iteration +is first executed in simulation and then completed using the actual robots. For +performance measures, we use the mean path speed subject to the relative +position and orientation constraints and the path speed uniformity constraint. +We have demonstrated the effectiveness of this method with ABB and FANUC robots +on two challenging test curves. The performance improvement over the current +industrial practice baseline is over 300%. Compared to the optimized single-arm +case that we have previously reported, the improvement is over 14%. + +
+
+
+
+
+ + ♻ ☆ Deep Learning for Inertial Sensor Alignment + + +
+ Accurate alignment of a fixed mobile device equipped with inertial sensors +inside a moving vehicle is important for navigation, activity recognition, and +other applications. Accurate estimation of the device mounting angle is +required to rotate the inertial measurement from the sensor frame to the moving +platform frame to standardize measurements and improve the performance of the +target task. In this work, a data-driven approach using deep neural networks +(DNNs) is proposed to learn the yaw mounting angle of a smartphone equipped +with an inertial measurement unit (IMU) and strapped to a car. The proposed +model uses only the accelerometer and gyroscope readings from an IMU as input +and, in contrast to existing solutions, does not require global position inputs +from global navigation satellite systems (GNSS). To train the model in a +supervised manner, IMU data is collected for training and validation with the +sensor mounted at a known yaw mounting angle, and a range of ground truth +labels is generated by applying a random rotation in a bounded range to the +measurements. The trained model is tested on data with real rotations showing +similar performance as with synthetic rotations. The trained model is deployed +on an Android device and evaluated in real-time to test the accuracy of the +estimated yaw mounting angle. The model is shown to find the mounting angle at +an accuracy of 8 degrees within 5 seconds, and 4 degrees within 27 seconds. An +experiment is conducted to compare the proposed model with an existing +off-the-shelf solution. + +
+
+ comment: 9 Pages, Preprint. Accepted IEEE +
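The label-generation strategy described above (applying a random, bounded rotation to measurements recorded at a known mounting angle) can be sketched directly. The snippet below is an illustrative NumPy version under that reading of the abstract; the window length and angle range are placeholders.

```python
import numpy as np

def make_training_example(acc, gyr, max_angle_deg=30.0, rng=None):
    """acc, gyr: (N, 3) IMU windows recorded at a known mounting angle.
    Returns yaw-rotated copies and the applied angle as the regression label."""
    rng = np.random.default_rng() if rng is None else rng
    yaw = np.deg2rad(rng.uniform(-max_angle_deg, max_angle_deg))
    c, s = np.cos(yaw), np.sin(yaw)
    R = np.array([[c, -s, 0.0],
                  [s,  c, 0.0],
                  [0.0, 0.0, 1.0]])   # rotation about the vertical (yaw) axis
    return acc @ R.T, gyr @ R.T, yaw

acc = np.random.randn(200, 3)   # stand-in for a real accelerometer window
gyr = np.random.randn(200, 3)   # stand-in for a real gyroscope window
acc_rot, gyr_rot, label = make_training_example(acc, gyr)
print("synthetic yaw label (rad):", label)
```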
+
+
+
+
+ + ♻ ☆ Femtosecond laser fabricated nitinol living hinges for millimeter-sized + robots RA-L + + +
+ Nitinol is a smart material that can be used as an actuator, a sensor, or a +structural element, and has the potential to significantly enhance the +capabilities of microrobots. Femtosecond laser technology can be used to +process nitinol while avoiding heat-affected zones (HAZ), thus retaining +superelastic properties. In this work, we manufacture living hinges of +arbitrary cross-sections from nitinol using a femtosecond laser micromachining +process. We first determined the laser cutting parameters, 4.1 J cm^-2 fluence +with 5 passes for 5 um ablation, by varying the laser power level and number of +passes. Next, we modeled the hinges using an analytical model as well as +an Abaqus finite element model, and showed the accuracy of the models +by comparing their predictions with the torque measured from eight different hinges, four with a +rectangular cross-section and four with an arc cross-section. Finally, we +manufactured three prototype miniature devices to illustrate the usefulness of +these nitinol hinges: a sample spherical 5-bar mechanism, a Sarrus linkage, and +a piezoelectrically actuated robotic wing mechanism. + 
+
+ comment: 7 pages, 4 figures, submitted to IEEE RA-L +
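For the analytical hinge model mentioned above, a standard small-deflection beam approximation treats a rectangular living hinge as a torsional spring with stiffness E*I/L, where I = w*t^3/12. The sketch below uses that textbook relation, not necessarily the paper's exact model (superelastic nitinol is strongly nonlinear at larger strains), and the modulus value is only a placeholder.

```python
import math

def rectangular_hinge_torque(theta_rad, width_m, thickness_m, length_m, E_pa=70e9):
    """Small-deflection torque of a rectangular-cross-section living hinge,
    modeled as a torsional spring: tau = (E * I / L) * theta with I = w * t^3 / 12.
    E_pa is a placeholder modulus; real superelastic nitinol is nonlinear."""
    I = width_m * thickness_m ** 3 / 12.0
    return E_pa * I / length_m * theta_rad

# Example: a 1 mm wide, 50 um thick, 2 mm long hinge bent by 10 degrees.
tau = rectangular_hinge_torque(math.radians(10), 1e-3, 50e-6, 2e-3)
print(f"approximate restoring torque: {tau * 1e6:.1f} uN*m")
```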
+
+
+
+
+ + ♻ ☆ Optimizing Base Placement of Surgical Robot: Kinematics Data-Driven + Approach by Analyzing Working Pattern + + +
+ In robot-assisted minimally invasive surgery (RAMIS), optimal placement of +the surgical robot base is crucial for successful surgery. Improper placement +can hinder performance because of manipulator limitations and inaccessible +workspaces. Conventional base placement relies on the experience of trained +medical staff. This study proposes a novel method for determining the optimal +base pose based on the surgeon's working pattern. The proposed method analyzes +recorded end-effector poses using a machine learning-based clustering technique +to identify key positions and orientations preferred by the surgeon. We +introduce two scoring metrics to address the joint limit and singularity +issues: joint margin and manipulability scores. We then train a multi-layer +perceptron regressor to predict the optimal base pose based on these scores. +Evaluation in a simulated environment using the da Vinci Research Kit shows +unique base pose score maps for four volunteers, highlighting the individuality +of the working patterns. Results comparing with 20,000 randomly selected base +poses suggest that the score obtained using the proposed method is 28.2% higher +than that obtained by random base placement. These results emphasize the need +for operator-specific optimization during base placement in RAMIS. + +
+
+ comment: 8 pages, 7 figures, 2 tables +
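The two scoring metrics named above have common textbook counterparts: a joint-margin score based on the normalized distance to the nearest joint limit, and Yoshikawa's manipulability measure sqrt(det(J J^T)). The sketch below illustrates both on a toy planar arm; the exact weighting and the da Vinci kinematics used in the paper are not reproduced here.

```python
import numpy as np

def joint_margin_score(q, q_min, q_max):
    """Normalized distance of each joint to its nearest limit, worst joint taken:
    1.0 means the joint sits mid-range, 0.0 means it is at a limit."""
    q, q_min, q_max = map(np.asarray, (q, q_min, q_max))
    margin = np.minimum(q - q_min, q_max - q) / ((q_max - q_min) / 2.0)
    return float(np.clip(margin, 0.0, 1.0).min())

def manipulability_score(jacobian):
    """Yoshikawa's manipulability sqrt(det(J J^T)); it drops to zero near singularities."""
    J = np.asarray(jacobian)
    return float(np.sqrt(max(np.linalg.det(J @ J.T), 0.0)))

# Toy 2-link planar arm standing in for the surgical manipulator.
def planar_2link_jacobian(q, l1=0.3, l2=0.25):
    s1, c1 = np.sin(q[0]), np.cos(q[0])
    s12, c12 = np.sin(q[0] + q[1]), np.cos(q[0] + q[1])
    return np.array([[-l1 * s1 - l2 * s12, -l2 * s12],
                     [ l1 * c1 + l2 * c12,  l2 * c12]])

q = np.array([0.4, 1.1])
print("joint margin:", joint_margin_score(q, [-2.5, -2.5], [2.5, 2.5]))
print("manipulability:", manipulability_score(planar_2link_jacobian(q)))
```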
+
+
+
+
+ + ♻ ☆ GLiDR: Topologically Regularized Graph Generative Network for Sparse + LiDAR Point Clouds CVPR + + +
+ Sparse LiDAR point clouds cause severe loss of detail of static structures +and reduce the density of static points available for navigation. Reduced +density can be detrimental to navigation under several scenarios. We observe +that despite high sparsity, in most cases, the global topology of LiDAR +outlining the static structures can be inferred. We utilize this property to +obtain a backbone skeleton of a LiDAR scan in the form of a single connected +component that is a proxy to its global topology. We utilize the backbone to +augment new points along static structures to overcome sparsity. Newly +introduced points could correspond to existing static structures or to static +points that were earlier obstructed by dynamic objects. To the best of our +knowledge, we are the first to use such a strategy for sparse LiDAR point +clouds. Existing solutions close to our approach fail to identify and preserve +the global static LiDAR topology and generate sub-optimal points. We propose +GLiDR, a Graph Generative network that is topologically regularized using +0-dimensional Persistent Homology ($\mathcal{PH}$) constraints. This enables +GLiDR to introduce newer static points along a topologically consistent global +static LiDAR backbone. GLiDR generates precise static points using $32\times$ +sparser dynamic scans and performs better than the baselines across three +datasets. GLiDR generates a valuable byproduct - an accurate binary +segmentation mask of static and dynamic objects that are helpful for navigation +planning and safety in constrained environments. The newly introduced static +points allow GLiDR to outperform LiDAR-based navigation using SLAM in several +settings. Source code is available at +$\texttt{https://github.com/GLiDR-CVPR2024/GLiDR}$. + +
+
+ comment: IEEE / CVF Computer Vision and Pattern Recognition Conference (CVPR) +
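For context on the 0-dimensional persistent homology constraint mentioned above: in a Vietoris-Rips filtration, every point is born at scale 0 and a connected component dies exactly at the minimum-spanning-tree edge length that merges it. The sketch below computes that diagram with Kruskal's algorithm and union-find; it is a plain, non-differentiable illustration, not GLiDR's regularizer.

```python
import numpy as np

def zero_dim_persistence(points):
    """0-dim persistence of a Vietoris-Rips filtration: every point is born at 0,
    and a component dies at the MST edge length that merges it (Kruskal + union-find)."""
    pts = np.asarray(points, dtype=float)
    n = len(pts)
    d = np.linalg.norm(pts[:, None, :] - pts[None, :, :], axis=-1)
    edges = sorted((d[i, j], i, j) for i in range(n) for j in range(i + 1, n))

    parent = list(range(n))
    def find(x):
        while parent[x] != x:
            parent[x] = parent[parent[x]]   # path halving
            x = parent[x]
        return x

    deaths = []
    for w, i, j in edges:
        ri, rj = find(i), find(j)
        if ri != rj:
            parent[ri] = rj
            deaths.append(w)                # one 0-dim feature dies at this scale
        if len(deaths) == n - 1:            # a single connected component remains
            break
    return [(0.0, w) for w in deaths]       # (birth, death) pairs; one class lives forever

pts = np.random.rand(50, 3)                 # stand-in for a sparse LiDAR scan
diagram = zero_dim_persistence(pts)
print(len(diagram), "finite 0-dim features; max death =", max(w for _, w in diagram))
```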
+
+
+
+
+ + ♻ ☆ Understanding Video Transformers via Universal Concept Discovery CVPR 2024 + + +
+ This paper studies the problem of concept-based interpretability of +transformer representations for videos. Concretely, we seek to explain the +decision-making process of video transformers based on high-level, +spatiotemporal concepts that are automatically discovered. Prior research on +concept-based interpretability has concentrated solely on image-level tasks. +Comparatively, video models deal with the added temporal dimension, increasing +complexity and posing challenges in identifying dynamic concepts over time. In +this work, we systematically address these challenges by introducing the first +Video Transformer Concept Discovery (VTCD) algorithm. To this end, we propose +an efficient approach for unsupervised identification of units of video +transformer representations - concepts - and for ranking their importance to the +output of a model. The resulting concepts are highly interpretable, revealing +spatio-temporal reasoning mechanisms and object-centric representations in +unstructured video models. Performing this analysis jointly over a diverse set +of supervised and self-supervised representations, we discover that some of +these mechanisms are universal in video transformers. Finally, we show that VTCD +can be used for fine-grained action recognition and video object segmentation. + 
+
+ comment: CVPR 2024 (Highlight) +
+
+
+
+
+ + ♻ ☆ Exploring the Influence of Driving Context on Lateral Driving Style + Preferences: A Simulator-Based Study + + +
+ Technological advancements focus on developing comfortable and acceptable +driving characteristics in autonomous vehicles. Present driving functions +predominantly possess predefined parameters, and there is no universally +accepted driving style for autonomous vehicles. While driving may be +technically safe and the likelihood of road accidents is reduced, passengers +may still feel insecure due to a mismatch in driving styles between the human +and the autonomous system. Incorporating driving style preferences into +automated vehicles enhances acceptance, reduces uncertainty, and poses the +opportunity to expedite their adoption. Despite the increased research focus on +driving styles, there remains a need for comprehensive studies investigating +how variations in the driving context impact the assessment of automated +driving functions. Therefore, this work evaluates lateral driving style +preferences for autonomous vehicles on rural roads, considering different +weather and traffic situations. A controlled study was conducted with a variety +of German participants utilizing a high-fidelity driving simulator. The +subjects experienced four different driving styles, including mimicking of +their own driving behavior under two weather conditions. A notable preference +for a more passive driving style became evident based on statistical analyses +of participants' responses during and after the drives. This study could not +confirm the hypothesis that subjects prefer to be driven by mimicking their own +driving behavior. Furthermore, the study illustrated that weather conditions +and oncoming traffic substantially influence the perceived comfort during +autonomous rides. The gathered dataset is openly accessible at +https://www.kaggle.com/datasets/jhaselberger/idcld-subject-study-on-driving-style-preferences. + +
+
+ comment: 19 pages, 5 figures; This work has been submitted to the IEEE for + possible publication. Copyright may be transferred without notice, after + which this version may no longer be accessible +
+
+
+
+
+ + ♻ ☆ Prediction Horizon Requirements for Automated Driving: Optimizing + Safety, Comfort, and Efficiency + + +
+ Predicting the movement of other road users is beneficial for improving +automated vehicle (AV) performance. However, the relationship between the time +horizon associated with these predictions and AV performance remains unclear. +Despite the existence of numerous trajectory prediction algorithms, no studies +have been conducted on how varying prediction lengths affect AV safety and +other vehicle performance metrics, resulting in undefined horizon requirements +for prediction methods. Our study addresses this gap by examining the effects +of different prediction horizons on AV performance, focusing on safety, +comfort, and efficiency. Through multiple experiments using a state-of-the-art, +risk-based predictive trajectory planner, we simulated predictions with +horizons up to 20 seconds. Based on our simulations, we propose a framework for +specifying the minimum required and optimal prediction horizons based on +specific AV performance criteria and application needs. Our results indicate +that a horizon of 1.6 seconds is required to prevent collisions with crossing +pedestrians, horizons of 7-8 seconds yield the best efficiency, and horizons up +to 15 seconds improve passenger comfort. We conclude that prediction horizon +requirements are application-dependent, and recommend aiming for a prediction +horizon of 11.8 seconds as a general guideline for applications involving +crossing pedestrians. + +
+
+ comment: Submitted to IEEE Intelligent Vehicles Symposium. 9 pages. 10 + figures. 6 tables +
+
+
+
+
+ + ♻ ☆ Local Observability of VINS and LINS + + +
+ This work analyzes the unobservable directions of the nonlinear models of the Vision-aided Inertial +Navigation System (VINS) and the Lidar-aided Inertial Navigation System (LINS). Under the assumption that there exist two features observed by +the camera without occlusion, the unobservable directions of VINS are the global +translations and the global rotation about the gravity vector. The +unobservable directions of LINS are the same as those of VINS, while only one feature needs +to be observed. In addition, a constraint used in Observability-Constrained VINS (OC-VINS) +is proved. + 
+
+
+
+
+ + ♻ ☆ Towards a Safe Real-Time Motion Planning Framework for Autonomous + Driving Systems: An MPPI Approach + + +
+ Planning safe trajectories in Autonomous Driving Systems (ADS) is a complex +problem to solve in real time. The main challenge arises +from the various conditions and constraints imposed by road geometry, semantics +and traffic rules, as well as the presence of dynamic agents. Recently, Model +Predictive Path Integral (MPPI) has been shown to be an effective framework for +optimal motion planning and control in robot navigation in unstructured and +highly uncertain environments. In this paper, we formulate the motion planning +problem in ADS as a nonlinear stochastic dynamic optimization problem that can +be solved using an MPPI strategy. The main technical contribution of this work +is a method to handle obstacles safely within the MPPI formulation. In this +method, obstacles are approximated by circles that can be easily integrated +into the MPPI cost formulation while considering safety margins. The proposed +MPPI framework has been efficiently implemented in our autonomous vehicle and +experimentally validated using three different primitive scenarios. +Experimental results show that the generated trajectories are safe and feasible and +perfectly achieve the planning objective. The video results as well as the +open-source implementation are available at: +https://gitlab.uni.lu/360lab-public/mppi + 
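MPPI itself is a general sampling-based scheme: perturb a nominal control sequence, roll out the samples through the dynamics, and re-weight the perturbations with softmax(-cost/lambda). The sketch below applies that recipe to a 2D point mass with a circular-obstacle penalty of the kind described above; the dynamics, cost weights, and margins are placeholders rather than the paper's vehicle model, and the control-cost term of full MPPI is omitted for brevity.

```python
import numpy as np

def mppi_step(x0, U, dynamics, cost, K=256, sigma=0.4, lam=1.0,
              rng=np.random.default_rng(0)):
    """One MPPI update: sample K perturbed control sequences around the nominal U,
    roll them out, and average the perturbations with softmax(-cost / lam) weights."""
    T = U.shape[0]
    noise = rng.normal(scale=sigma, size=(K, T, U.shape[1]))
    costs = np.zeros(K)
    for k in range(K):
        x = x0.copy()
        for t in range(T):
            x = dynamics(x, U[t] + noise[k, t])
            costs[k] += cost(x)
    w = np.exp(-(costs - costs.min()) / lam)
    w /= w.sum()
    return U + np.tensordot(w, noise, axes=1)   # weighted average of perturbations

# Placeholder setup: 2D point mass heading to a goal while avoiding one circle.
goal, obstacle, radius, margin = np.array([5.0, 0.0]), np.array([2.5, 0.1]), 0.8, 0.3

def dynamics(x, u, dt=0.1):
    return x + dt * u                           # x = [px, py], u = velocity command

def cost(x):
    c = np.sum((x - goal) ** 2)
    gap = np.linalg.norm(x - obstacle) - (radius + margin)
    return c + (1e3 * gap ** 2 if gap < 0.0 else 0.0)   # penalize entering the circle

U = np.zeros((30, 2))
for _ in range(20):
    U = mppi_step(np.zeros(2), U, dynamics, cost)
print("first planned velocity command:", U[0])
```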
+
+
+
+
+ + ♻ ☆ Multi S-Graphs: An Efficient Distributed Semantic-Relational + Collaborative SLAM RA-L + + +
+ Collaborative Simultaneous Localization and Mapping (CSLAM) is critical to +enable multiple robots to operate in complex environments. Most CSLAM +techniques rely on raw sensor measurement or low-level features such as +keyframe descriptors, which can lead to wrong loop closures due to the lack of +deep understanding of the environment. Moreover, the exchange of these +measurements and low-level features among the robots requires the transmission +of a significant amount of data, which limits the scalability of the system. To +overcome these limitations, we present Multi S-Graphs, a decentralized CSLAM +system that utilizes high-level semantic-relational information embedded in the +four-layered hierarchical and optimizable situational graphs for cooperative +map generation and localization in structured environments while minimizing the +information exchanged between the robots. To support this, we present a novel +room-based descriptor which, along with its connected walls, is used to perform +inter-robot loop closures, addressing the challenges of multi-robot kidnapped +problem initialization. Multiple experiments in simulated and real environments +validate the improvement in accuracy and robustness of the proposed approach +while reducing the amount of data exchanged between robots compared to other +state-of-the-art approaches. + Software available within a docker image: +https://github.com/snt-arg/multi_s_graphs_docker + +
+
+ comment: 8 pages paper presented to IEEE RA-L +
+
+
+
+
+ + ♻ ☆ Hysteresis Compensation of Flexible Continuum Manipulator using RGBD + Sensing and Temporal Convolutional Network + + +
+ Flexible continuum manipulators are valued for minimally invasive surgery, +offering access to confined spaces through nonlinear paths. However, +cable-driven manipulators face control difficulties due to hysteresis from +cabling effects such as friction, elongation, and coupling. These effects are +difficult to model due to their nonlinearity, and the difficulties become even more +evident when dealing with long, coupled, multi-segmented manipulators. This +paper proposes a data-driven approach based on Deep Neural Networks (DNN) to +capture these nonlinear and previous-state-dependent characteristics of cable +actuation. We collect physical joint configurations corresponding to commanded joint +configurations using RGBD sensing and 7 fiducial markers to model the +hysteresis of the proposed manipulator. Results of a study comparing the +estimation performance of four DNN models show that the Temporal Convolution +Network (TCN) demonstrates the highest predictive capability. Leveraging +trained TCNs, we build a control algorithm to compensate for hysteresis. +Tracking tests in task space using unseen trajectories show that the proposed +control algorithm reduces the average position and orientation error by 61.39% +(from 13.7 mm to 5.29 mm) and 64.04% (from 31.17{\deg} to 11.21{\deg}), +respectively. This result implies that the proposed calibrated controller +effectively reaches the desired configurations by estimating the hysteresis of +the manipulator. Applying this method in real surgical scenarios has the +potential to enhance control precision and improve surgical performance. + 
+
+ comment: 8 pages, 11 figures, 5 tables +
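The Temporal Convolutional Network referenced above is, in its common generic form, a stack of dilated causal 1D convolutions with residual connections. The PyTorch sketch below shows such a block mapping a history of commanded joint values to an estimate of the physical ones; it is a generic TCN, not the authors' exact architecture or hyperparameters.

```python
import torch
import torch.nn as nn

class CausalBlock(nn.Module):
    """Dilated causal Conv1d plus a residual connection (generic TCN building block)."""
    def __init__(self, channels, kernel_size=3, dilation=1):
        super().__init__()
        self.pad = (kernel_size - 1) * dilation          # padding that preserves causality
        self.conv = nn.Conv1d(channels, channels, kernel_size,
                              padding=self.pad, dilation=dilation)
        self.act = nn.ReLU()

    def forward(self, x):                                 # x: (batch, channels, time)
        y = self.conv(x)[..., :x.shape[-1]]               # chomp the trailing padding
        return self.act(y) + x

class TinyTCN(nn.Module):
    """Maps a history of commanded joint values to an estimate of the physical ones."""
    def __init__(self, n_joints, hidden=32, n_blocks=4):
        super().__init__()
        self.inp = nn.Conv1d(n_joints, hidden, 1)
        self.blocks = nn.Sequential(*[CausalBlock(hidden, dilation=2 ** i)
                                      for i in range(n_blocks)])
        self.out = nn.Conv1d(hidden, n_joints, 1)

    def forward(self, cmd_history):                       # (batch, n_joints, time)
        return self.out(self.blocks(self.inp(cmd_history)))

model = TinyTCN(n_joints=4)
est = model(torch.randn(8, 4, 64))                        # 64-step command history
print(est.shape)                                          # torch.Size([8, 4, 64])
```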
+
+
+
+
+ + ♻ ☆ Ultra-Range Gesture Recognition using a Web-Camera in Human-Robot + Interaction + + +
+ Hand gestures play a significant role in human interactions where non-verbal +intentions, thoughts and commands are conveyed. In Human-Robot Interaction +(HRI), hand gestures offer a similar and efficient medium for conveying clear +and rapid directives to a robotic agent. However, state-of-the-art vision-based +methods for gesture recognition have been shown to be effective only up to a +user-camera distance of seven meters. Such a short distance range limits +practical HRI with, for example, service robots, search and rescue robots and +drones. In this work, we address the Ultra-Range Gesture Recognition (URGR) +problem by aiming for a recognition distance of up to 25 meters and in the +context of HRI. We propose the URGR framework, a novel deep-learning pipeline that uses +solely a simple RGB camera. Gesture inference is based on a single image. +First, a novel super-resolution model termed High-Quality Network (HQ-Net) uses +a set of self-attention and convolutional layers to enhance the low-resolution +image of the user. Then, we propose a novel URGR classifier termed Graph Vision +Transformer (GViT) which takes the enhanced image as input. GViT combines the +benefits of a Graph Convolutional Network (GCN) and a modified Vision +Transformer (ViT). Evaluation of the proposed framework over diverse test data +yields a high recognition rate of 98.1%. The framework has also exhibited +superior performance compared to human recognition at ultra-range distances. +With the framework, we analyze and demonstrate the performance of an autonomous +quadruped robot directed by human gestures in complex ultra-range indoor and +outdoor environments, achieving a 96% recognition rate on average. + 
+
+ comment: Engineering Applications of Artificial Intelligence, In press +
+
+
+
+
+ + ♻ ☆ Discovering Closed-Loop Failures of Vision-Based Controllers via + Reachability Analysis + + +
+ Machine learning driven image-based controllers allow robotic systems to take +intelligent actions based on the visual feedback from their environment. +Understanding when these controllers might lead to system safety violations is +important for their integration in safety-critical applications and engineering +corrective safety measures for the system. Existing methods leverage +simulation-based testing (or falsification) to find the failures of +vision-based controllers, i.e., the visual inputs that lead to closed-loop +safety violations. However, these techniques do not scale well to the scenarios +involving high-dimensional and complex visual inputs, such as RGB images. In +this work, we cast the problem of finding closed-loop vision failures as a +Hamilton-Jacobi (HJ) reachability problem. Our approach blends simulation-based +analysis with HJ reachability methods to compute an approximation of the +backward reachable tube (BRT) of the system, i.e., the set of unsafe states for +the system under vision-based controllers. Utilizing the BRT, we can tractably +and systematically find the system states and corresponding visual inputs that +lead to closed-loop failures. These visual inputs can be subsequently analyzed +to find the input characteristics that might have caused the failure. Besides +its scalability to high-dimensional visual inputs, an explicit computation of +BRT allows the proposed approach to capture non-trivial system failures that +are difficult to expose via random simulations. We demonstrate our framework on +two case studies involving an RGB image-based neural network controller for (a) +autonomous indoor navigation, and (b) autonomous aircraft taxiing. + +
+
+
+
+
+ + ♻ ☆ ReFeree: Radar-based efficient global descriptor using a Feature and + Free space for Place Recognition + + +
+ Radar is highlighted for robust sensing capabilities in adverse weather +conditions (e.g. dense fog, heavy rain, or snowfall). In addition, Radar can +cover wide areas and penetrate small particles. Despite these advantages, +Radar-based place recognition remains in the early stages compared to other +sensors due to its unique characteristics such as low resolution and +significant noise. In this paper, we propose a Radar-based place recognition method +utilizing a descriptor called ReFeree, built from a feature and free space. Unlike +traditional methods, we aggressively summarize the Radar image into a compact descriptor. Despite being +lightweight, the descriptor retains semi-metric information and is also outstanding from +the perspective of place recognition performance. For concrete validation, we +test a single-session setting on the MulRan dataset and multi-session settings on the +Oxford Offroad Radar, Oxford Radar RobotCar, and Boreas datasets. + 
+
+ comment: 5 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Sensor-based Multi-Robot Coverage Control with Spatial Separation in + Unstructured Environments + + +
+ Multi-robot systems have increasingly become instrumental in tackling search +and coverage problems. However, the challenge of optimizing task efficiency +without compromising task success still persists, particularly in expansive, +unstructured environments with dense obstacles. + This paper presents an innovative, decentralized Voronoi-based approach for +search and coverage to reactively navigate these complexities while maintaining +safety. + This approach leverages the active sensing capabilities of multi-robot +systems to supplement GIS (Geographic Information System) data, offering a more +comprehensive and real-time understanding of the environment. Based on point +cloud data, which is inherently non-convex and unstructured, this method +efficiently generates collision-free Voronoi regions using only local sensing +information through spatial decomposition and spherical mirroring techniques. + Then, a deadlock-aware guided map, integrated with a gradient-optimized, +centroidal Voronoi-based coverage control policy, is constructed to improve +efficiency by avoiding exhaustive searches and local sensing pitfalls. + The effectiveness of our algorithm has been validated through extensive +numerical simulations in high-fidelity environments, demonstrating significant +improvements in task success rate, coverage ratio, and task execution time +compared with other approaches. + 
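The centroidal Voronoi coverage policy referenced above reduces, in its textbook form, to a Lloyd-style update: each robot claims the free-space samples closest to it (its Voronoi cell) and moves toward the centroid of that cell. The sketch below shows only that baseline step; the deadlock-aware guidance, spherical mirroring, and collision handling from the paper are not modeled.

```python
import numpy as np

def lloyd_coverage_step(robots, free_points, gain=0.3):
    """One Lloyd update: assign each free-space sample to the nearest robot
    (discrete Voronoi cells) and move every robot toward its cell centroid."""
    robots = np.asarray(robots, dtype=float)
    pts = np.asarray(free_points, dtype=float)
    owner = np.argmin(np.linalg.norm(pts[:, None, :] - robots[None, :, :], axis=-1), axis=1)
    new_robots = robots.copy()
    for i in range(len(robots)):
        cell = pts[owner == i]
        if len(cell):
            new_robots[i] += gain * (cell.mean(axis=0) - robots[i])
    return new_robots

rng = np.random.default_rng(1)
free_points = rng.uniform(0, 10, size=(2000, 2))   # stand-in for sensed free space
robots = rng.uniform(0, 2, size=(4, 2))            # robots start bunched in one corner
for _ in range(50):
    robots = lloyd_coverage_step(robots, free_points)
print(robots)                                       # robots end up spread over the area
```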
+
+
+
+
+ + ♻ ☆ Verification of Neural Reachable Tubes via Scenario Optimization and + Conformal Prediction + + +
+ Learning-based approaches for controlling safety-critical systems are rapidly +growing in popularity; thus, it is important to assure their performance and +safety. Hamilton-Jacobi (HJ) reachability analysis is a popular formal +verification tool for providing such guarantees, since it can handle general +nonlinear system dynamics, bounded adversarial system disturbances, and state +and input constraints. However, its computational and memory complexity scales +exponentially with the state dimension, making it intractable for large-scale +systems. To overcome this challenge, neural approaches, such as DeepReach, have +been used to synthesize reachable tubes and safety controllers for +high-dimensional systems. However, verifying these neural reachable tubes +remains challenging. In this work, we propose two verification methods, based +on robust scenario optimization and conformal prediction, to provide +probabilistic safety guarantees for neural reachable tubes. Our methods allow a +direct trade-off between resilience to outlier errors in the neural tube, which +are inevitable in a learning-based approach, and the strength of the +probabilistic safety guarantee. Furthermore, we show that split conformal +prediction, a widely used method in the machine learning community for +uncertainty quantification, reduces to a scenario-based approach, making the +two methods equivalent not only for verification of neural reachable tubes but +also more generally. To our knowledge, our proof is the first in the literature +to show a strong relationship between conformal prediction and scenario +optimization. Finally, we propose an outlier-adjusted verification approach +that uses the error distribution in neural reachable tubes to recover greater +safe volumes. We demonstrate the efficacy of the proposed approaches for the +high-dimensional problems of multi-vehicle collision avoidance and rocket +landing with no-go zones. + +
+
+ comment: Accepted to 6th Annual Learning for Dynamics & Control Conference. + arXiv admin note: text overlap with arXiv:2209.12336 +
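Split conformal prediction, which the paper relates to scenario optimization, boils down to a quantile of nonconformity scores on a held-out calibration set. The sketch below computes that correction for a learned safety value function and uses it to conservatively certify states; the value-threshold convention and the synthetic error model are assumptions, and the value function here is a stand-in rather than DeepReach.

```python
import numpy as np

def conformal_correction(errors, alpha=0.05):
    """Split conformal bound: with probability >= 1 - alpha over the calibration draw,
    a fresh error falls below the ceil((n + 1) * (1 - alpha)) / n empirical quantile."""
    e = np.sort(np.asarray(errors))
    n = len(e)
    k = int(np.ceil((n + 1) * (1 - alpha)))
    if k > n:
        return np.inf                         # not enough calibration samples
    return e[k - 1]

# Stand-in for |learned value - reference value| on calibration states
# (in practice the reference values come from expensive rollouts or verification).
rng = np.random.default_rng(0)
calib_errors = np.abs(rng.normal(scale=0.02, size=2000))
delta = conformal_correction(calib_errors, alpha=0.05)

def certified_safe(learned_value):
    """Declare a state safe only if its learned value clears the conformal margin."""
    return learned_value > delta

print("conformal margin:", delta, "| value 0.05 certified safe?", certified_safe(0.05))
```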
+
+
+
+
+ + ♻ ☆ CitDet: A Benchmark Dataset for Citrus Fruit Detection RA-L + + +
+ In this letter, we present a new dataset to advance the state of the art in +detecting citrus fruit and accurately estimating yield on trees affected by the +Huanglongbing (HLB) disease in orchard environments via imaging. Despite the +fact that significant progress has been made in solving the fruit detection +problem, the lack of publicly available datasets has complicated direct +comparison of results. For instance, citrus detection has long been of interest +to the agricultural research community, yet there is an absence of work, +particularly involving public datasets of citrus affected by HLB. To address +this issue, we enhance state-of-the-art object detection methods for use in +typical orchard settings. Concretely, we provide high-resolution images of +citrus trees located in an area known to be highly affected by HLB, along with +high-quality bounding box annotations of citrus fruit. Fruit on both the trees +and the ground are labeled to allow for identification of fruit location, which +contributes to advancements in yield estimation and a potential measure of HLB +impact via fruit drop. The dataset consists of over 32,000 bounding box +annotations for fruit instances contained in 579 high-resolution images. In +summary, our contributions are the following: (i) we introduce a novel dataset +along with baseline performance benchmarks on multiple contemporary object +detection algorithms, (ii) we show the ability to accurately capture fruit +location on the tree or on the ground, and finally (iii) we present a correlation of our +results with yield estimations. + 
+
+ comment: Submitted to IEEE Robotics and Automation Letters (RA-L) +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 119 + +
+
+
+ + ☆ GoodDrag: Towards Good Practices for Drag Editing with Diffusion Models + + +
+ In this paper, we introduce GoodDrag, a novel approach to improve the +stability and image quality of drag editing. Unlike existing methods that +struggle with accumulated perturbations and often result in distortions, +GoodDrag introduces an AlDD framework that alternates between drag and +denoising operations within the diffusion process, effectively improving the +fidelity of the result. We also propose an information-preserving motion +supervision operation that maintains the original features of the starting +point for precise manipulation and artifact reduction. In addition, we +contribute to the benchmarking of drag editing by introducing a new dataset, +Drag100, and developing dedicated quality assessment metrics, Dragging Accuracy +Index and Gemini Score, utilizing Large Multimodal Models. Extensive +experiments demonstrate that the proposed GoodDrag compares favorably against +the state-of-the-art approaches both qualitatively and quantitatively. The +project page is https://gooddrag.github.io. + +
+
+
+
+
+ + ☆ BRAVE: Broadening the visual encoding of vision-language models + + +
+ Vision-language models (VLMs) are typically composed of a vision encoder, +e.g. CLIP, and a language model (LM) that interprets the encoded features to +solve downstream tasks. Despite remarkable progress, VLMs are subject to +several shortcomings due to the limited capabilities of vision encoders, e.g. +"blindness" to certain image features, visual hallucination, etc. To address +these issues, we study broadening the visual encoding capabilities of VLMs. We +first comprehensively benchmark several vision encoders with different +inductive biases for solving VLM tasks. We observe that there is no single +encoding configuration that consistently achieves top performance across +different tasks, and encoders with different biases can perform surprisingly +similarly. Motivated by this, we introduce a method, named BRAVE, that +consolidates features from multiple frozen encoders into a more versatile +representation that can be directly fed as the input to a frozen LM. BRAVE +achieves state-of-the-art performance on a broad range of captioning and VQA +benchmarks and significantly reduces the aforementioned issues of VLMs, while +requiring a smaller number of trainable parameters than existing methods and +having a more compressed representation. Our results highlight the potential of +incorporating different visual biases for a more broad and contextualized +visual understanding of VLMs. + +
+
+ comment: Project page at https://brave-vlms.epfl.ch/ +
+
+
+
+
+ + ☆ UMBRAE: Unified Multimodal Decoding of Brain Signals + + +
+ We address prevailing challenges in brain-powered research, departing +from the observation that existing methods hardly recover accurate spatial +information and require subject-specific models. To address these challenges, +we propose UMBRAE, a unified multimodal decoding framework for brain signals. First, to +extract instance-level conceptual and spatial details from neural signals, we +introduce an efficient universal brain encoder for multimodal-brain alignment +and recover object descriptions at multiple levels of granularity from a +subsequent multimodal large language model (MLLM). Second, we introduce a +cross-subject training strategy mapping subject-specific features to a common +feature space. This allows a model to be trained on multiple subjects without +extra resources, even yielding superior results compared to subject-specific +models. Further, we demonstrate that this supports weakly-supervised adaptation to +new subjects, with only a fraction of the total training data. Experiments +demonstrate that UMBRAE not only achieves superior results in the newly +introduced tasks but also outperforms existing methods in well-established tasks. To +assess our method, we construct and share with the community a comprehensive +brain understanding benchmark, BrainHub. Our code and benchmark are available at +https://weihaox.github.io/UMBRAE. + 
+
+ comment: Project Page: https://weihaox.github.io/UMBRAE +
+
+
+
+
+ + ☆ RealmDreamer: Text-Driven 3D Scene Generation with Inpainting and Depth + Diffusion + + +
+ We introduce RealmDreamer, a technique for generation of general +forward-facing 3D scenes from text descriptions. Our technique optimizes a 3D +Gaussian Splatting representation to match complex text prompts. We initialize +these splats by utilizing the state-of-the-art text-to-image generators, +lifting their samples into 3D, and computing the occlusion volume. We then +optimize this representation across multiple views as a 3D inpainting task with +image-conditional diffusion models. To learn correct geometric structure, we +incorporate a depth diffusion model by conditioning on the samples from the +inpainting model, giving rich geometric structure. Finally, we finetune the +model using sharpened samples from image generators. Notably, our technique +does not require video or multi-view data and can synthesize a variety of +high-quality 3D scenes in different styles, consisting of multiple objects. Its +generality additionally allows 3D synthesis from a single image. + +
+
+ comment: Project Page: https://realmdreamer.github.io/ +
+
+
+
+
+ + ☆ InstantMesh: Efficient 3D Mesh Generation from a Single Image with + Sparse-view Large Reconstruction Models + + +
+ We present InstantMesh, a feed-forward framework for instant 3D mesh +generation from a single image, featuring state-of-the-art generation quality +and significant training scalability. By synergizing the strengths of an +off-the-shelf multiview diffusion model and a sparse-view reconstruction model +based on the LRM architecture, InstantMesh is able to create diverse 3D assets +within 10 seconds. To enhance the training efficiency and exploit more +geometric supervisions, e.g, depths and normals, we integrate a differentiable +iso-surface extraction module into our framework and directly optimize on the +mesh representation. Experimental results on public datasets demonstrate that +InstantMesh significantly outperforms other latest image-to-3D baselines, both +qualitatively and quantitatively. We release all the code, weights, and demo of +InstantMesh, with the intention that it can make substantial contributions to +the community of 3D generative AI and empower both researchers and content +creators. + +
+
+ comment: Technical report. Project: https://github.com/TencentARC/InstantMesh +
+
+
+
+
+ + ☆ GCV-Turbo: End-to-end Acceleration of GNN-based Computer Vision Tasks on + FPGA + + +
+ Graph neural networks (GNNs) have recently empowered various novel computer +vision (CV) tasks. In GNN-based CV tasks, a combination of CNN layers and GNN +layers or only GNN layers are employed. This paper introduces GCV-Turbo, a +domain-specific accelerator on FPGA for end-to-end acceleration of GNN-based CV +tasks. GCV-Turbo consists of two key components: (1) a \emph{novel} hardware +architecture optimized for the computation kernels in both CNNs and GNNs using +the same set of computation resources. (2) a PyTorch-compatible compiler that +takes a user-defined model as input, performs end-to-end optimization for the +computation graph of a given GNN-based CV task, and produces optimized code for +hardware execution. The hardware architecture and the compiler work +synergistically to support a variety of GNN-based CV tasks. We implement +GCV-Turbo on a state-of-the-art FPGA and evaluate its performance across six +representative GNN-based CV tasks with diverse input data modalities (e.g., +image, human skeleton, point cloud). Compared with state-of-the-art CPU (GPU) +implementations, GCV-Turbo achieves an average latency reduction of +$68.4\times$ ($4.1\times$) on these six GNN-based CV tasks. Moreover, GCV-Turbo +supports the execution of the standalone CNNs or GNNs, achieving performance +comparable to that of state-of-the-art CNN (GNN) accelerators for widely used +CNN-only (GNN-only) models. + +
+
+
+
+
+ + ☆ Move Anything with Layered Scene Diffusion CVPR 2024 + + +
+ Diffusion models generate images with an unprecedented level of quality, but +how can we freely rearrange image layouts? Recent works generate controllable +scenes via learning spatially disentangled latent codes, but these methods do +not apply to diffusion models due to their fixed forward process. In this work, +we propose SceneDiffusion to optimize a layered scene representation during the +diffusion sampling process. Our key insight is that spatial disentanglement can +be obtained by jointly denoising scene renderings at different spatial layouts. +Our generated scenes support a wide range of spatial editing operations, +including moving, resizing, cloning, and layer-wise appearance editing +operations, including object restyling and replacing. Moreover, a scene can be +generated conditioned on a reference image, thus enabling object moving for +in-the-wild images. Notably, this approach is training-free, compatible with +general text-to-image diffusion models, and responsive in less than a second. + +
+
+ comment: CVPR 2024 camera-ready +
+
+
+
+
+ + ☆ Self-supervised Monocular Depth Estimation on Water Scenes via Specular + Reflection Prior + + +
+ Monocular depth estimation from a single image is an ill-posed problem for +computer vision due to the lack of reliable cues that can serve as prior knowledge. +Besides the inter-frame supervision, namely stereo and adjacent frames, +extensive prior information is available in the same frame. Reflections from +specular surfaces, informative intra-frame priors, enable us to reformulate the +ill-posed depth estimation task as a multi-view synthesis. This paper proposes +the first self-supervised framework for deep-learning depth estimation on water scenes +via intra-frame priors, known as reflection supervision and geometrical +constraints. In the first stage, a water segmentation network is applied to +separate the reflection components from the entire image. Next, we construct a +self-supervised framework to predict the target appearance from reflections, +perceived as other perspectives. The photometric re-projection error, +incorporating SmoothL1 and a novel photometric adaptive SSIM, is formulated to +optimize pose and depth estimation by aligning the transformed virtual depths +and source ones. As a supplement, the water surface is determined from real and +virtual camera positions, which complements the depth of the water area. +Furthermore, to alleviate the need for laborious ground-truth annotations, we +introduce a large-scale water reflection scene (WRS) dataset rendered from +Unreal Engine 4. Extensive experiments on the WRS dataset prove the feasibility +of the proposed method compared to state-of-the-art depth estimation +techniques. + 
+
+ comment: 16 pages, 8 figures +
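The photometric re-projection error described above pairs a structural-similarity term with a robust per-pixel difference. A common way to write such a loss, SSIM combined with SmoothL1 under a fixed weight (the paper uses an adaptive SSIM variant, which is not reproduced here), is sketched below in PyTorch; the window size and weight alpha are placeholders.

```python
import torch
import torch.nn.functional as F

def ssim_map(x, y, C1=0.01 ** 2, C2=0.03 ** 2):
    """Simplified per-pixel SSIM using 3x3 average-pooled local statistics."""
    mu_x, mu_y = F.avg_pool2d(x, 3, 1, 1), F.avg_pool2d(y, 3, 1, 1)
    var_x = F.avg_pool2d(x * x, 3, 1, 1) - mu_x ** 2
    var_y = F.avg_pool2d(y * y, 3, 1, 1) - mu_y ** 2
    cov = F.avg_pool2d(x * y, 3, 1, 1) - mu_x * mu_y
    num = (2 * mu_x * mu_y + C1) * (2 * cov + C2)
    den = (mu_x ** 2 + mu_y ** 2 + C1) * (var_x + var_y + C2)
    return num / den

def photometric_loss(warped, target, alpha=0.85):
    """alpha-weighted mix of a (1 - SSIM)/2 term and a SmoothL1 term."""
    ssim_term = torch.clamp((1.0 - ssim_map(warped, target)) / 2.0, 0.0, 1.0)
    l1_term = F.smooth_l1_loss(warped, target, reduction='none')
    return (alpha * ssim_term + (1.0 - alpha) * l1_term).mean()

warped = torch.rand(2, 3, 64, 64)   # image synthesized from the reflection view
target = torch.rand(2, 3, 64, 64)   # observed image
print(photometric_loss(warped, target).item())
```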
+
+
+
+
+ + ☆ Unified Language-driven Zero-shot Domain Adaptation CVPR 2024 + + +
+ This paper introduces Unified Language-driven Zero-shot Domain Adaptation +(ULDA), a novel task setting that enables a single model to adapt to diverse +target domains without explicit domain-ID knowledge. We identify the +constraints in the existing language-driven zero-shot domain adaptation task, +particularly the requirement for domain IDs and domain-specific models, which +may restrict flexibility and scalability. To overcome these issues, we propose +a new framework for ULDA, consisting of Hierarchical Context Alignment (HCA), +Domain Consistent Representation Learning (DCRL), and Text-Driven Rectifier +(TDR). These components work synergistically to align simulated features with +target text across multiple visual levels, retain semantic correlations between +different regional representations, and rectify biases between simulated and +real target visual features, respectively. Our extensive empirical evaluations +demonstrate that this framework achieves competitive performance in both +settings, surpassing even the model that requires domain-ID, showcasing its +superiority and generalization ability. The proposed method is not only +effective but also maintains practicality and efficiency, as it does not +introduce additional computational costs during inference. Our project page is +https://senqiaoyang.com/project/ULDA . + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Lost in Translation: Modern Neural Networks Still Struggle With Small + Realistic Image Transformations + + +
+ Deep neural networks that achieve remarkable performance in image +classification have previously been shown to be easily fooled by tiny +transformations such as a one-pixel translation of the input image. In order to +address this problem, two approaches have been proposed in recent years. The +first approach suggests using huge datasets together with data augmentation in +the hope that a highly varied training set will teach the network to +be invariant. The second approach suggests using architectural modifications +based on sampling theory to deal explicitly with image translations. In this +paper, we show that these approaches still fall short in robustly handling +'natural' image translations that simulate a subtle change in camera +orientation. Our findings reveal that a mere one-pixel translation can result +in a significant change in the predicted image representation for approximately +40% of the test images in state-of-the-art models (e.g. open-CLIP trained on +LAION-2B or DINO-v2), while models that are explicitly constructed to be +robust to cyclic translations can still be fooled by 1-pixel realistic +(non-cyclic) translations 11% of the time. We present Robust Inference by Crop +Selection: a simple method that can be proven to achieve any desired level of +consistency, although with a modest tradeoff in the model's accuracy. +Importantly, we demonstrate how employing this method reduces the ability to +fool state-of-the-art models with a 1-pixel translation to less than 5% while +suffering from only a 1% drop in classification accuracy. Additionally, we show +that our method can be easily adjusted to deal with circular shifts as well. In +that case, we achieve 100% robustness to integer shifts with state-of-the-art +accuracy, and with no need for any further training. + 
+
+ comment: 14 pages, 6 appendices, 17 figures +
+
+
+
+
+ + ☆ Measuring proximity to standard planes during fetal brain ultrasound + scanning + + +
+ This paper introduces a novel pipeline designed to bring ultrasound (US) +plane pose estimation closer to clinical use for more effective navigation to +the standard planes (SPs) in the fetal brain. We propose a semi-supervised +segmentation model utilizing both labeled SPs and unlabeled 3D US volume +slices. Our model enables reliable segmentation across a diverse set of fetal +brain images. Furthermore, the model incorporates a classification mechanism to +identify the fetal brain precisely. Our model not only filters out frames +lacking the brain but also generates masks for those containing it, enhancing +the relevance of plane pose regression in clinical settings. We focus on fetal +brain navigation from 2D ultrasound (US) video analysis and combine this model +with a US plane pose regression network to provide sensorless proximity +detection to SPs and non-SPs planes; we emphasize the importance of proximity +detection to SPs for guiding sonographers, offering a substantial advantage +over traditional methods by allowing earlier and more precise adjustments +during scanning. We demonstrate the practical applicability of our approach +through validation on real fetal scan videos obtained from sonographers of +varying expertise levels. Our findings demonstrate the potential of our +approach to complement existing fetal US technologies and advance prenatal +diagnostic practices. + +
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ☆ Driver Attention Tracking and Analysis + + +
+ We propose a novel method to estimate a driver's points-of-gaze using a pair +of ordinary cameras mounted on the windshield and dashboard of a car. This is a +challenging problem due to the dynamics of traffic environments with 3D scenes +of unknown depths. This problem is further complicated by the volatile distance +between the driver and the camera system. To tackle these challenges, we +develop a novel convolutional network that simultaneously analyzes the image of +the scene and the image of the driver's face. This network has a camera +calibration module that can compute an embedding vector that represents the +spatial configuration between the driver and the camera system. This +calibration module improves the overall network's performance, which can be +jointly trained end to end. + We also address the lack of annotated data for training and evaluation by +introducing a large-scale driving dataset with point-of-gaze annotations. This +is an in situ dataset of real driving sessions in an urban city, containing +synchronized images of the driving scene as well as the face and gaze of the +driver. Experiments on this dataset show that the proposed method outperforms +various baseline methods, having the mean prediction error of 29.69 pixels, +which is relatively small compared to the $1280{\times}720$ resolution of the +scene camera. + +
+
+
+
+
+ + ☆ Unfolding ADMM for Enhanced Subspace Clustering of Hyperspectral Images + + +
+ Deep subspace clustering methods are now prominent in clustering, typically +using fully connected networks and a self-representation loss function. +However, these methods often struggle with overfitting and lack +interpretability. In this paper, we explore an alternative clustering approach +based on deep unfolding. By unfolding iterative optimization methods into +neural networks, this approach offers enhanced interpretability and reliability +compared to data-driven deep learning methods, and greater adaptability and +generalization than model-based approaches. Hence, unfolding has become widely +used in inverse imaging problems, such as image restoration, reconstruction, +and super-resolution, but has not been sufficiently explored yet in the context +of clustering. In this work, we introduce an innovative clustering architecture +for hyperspectral images (HSI) by unfolding an iterative solver based on the +Alternating Direction Method of Multipliers (ADMM) for sparse subspace +clustering. To our knowledge, this is the first attempt to apply unfolding ADMM +for computing the self-representation matrix in subspace clustering. Moreover, +our approach captures well the structural characteristics of HSI data by +employing the K nearest neighbors algorithm as part of a structure preservation +module. Experimental evaluation of three established HSI datasets shows clearly +the potential of the unfolding approach in HSI clustering and even demonstrates +superior performance compared to state-of-the-art techniques. + +
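Unfolding turns the iterations of a classical solver into network layers whose step sizes and thresholds become learnable. For sparse subspace clustering the underlying solver is ADMM applied to min_C 0.5*||X - XC||_F^2 + lambda*||C||_1; one iteration of that solver, which an unfolded layer would mimic, is sketched below (the zero-diagonal constraint and the paper's structure-preservation module are omitted).

```python
import numpy as np

def soft_threshold(A, tau):
    return np.sign(A) * np.maximum(np.abs(A) - tau, 0.0)

def admm_self_representation(X, lam=0.1, rho=1.0, n_iter=50):
    """Classical ADMM for min_C 0.5*||X - XC||_F^2 + lam*||C||_1 (self-representation).
    An unfolded network would run a few such iterations with learned lam and rho."""
    d, n = X.shape
    G = X.T @ X
    inv = np.linalg.inv(G + rho * np.eye(n))      # precomputable; reused every iteration
    C = np.zeros((n, n))
    Z = np.zeros((n, n))
    U = np.zeros((n, n))
    for _ in range(n_iter):
        C = inv @ (G + rho * (Z - U))             # quadratic subproblem
        Z = soft_threshold(C + U, lam / rho)      # sparsity-inducing proximal step
        U = U + C - Z                             # dual update
    return Z                                      # sparse self-representation matrix

X = np.random.randn(30, 100)                      # columns = toy pixels (stand-in for HSI data)
C = admm_self_representation(X)
affinity = np.abs(C) + np.abs(C.T)                # would feed spectral clustering afterwards
print(affinity.shape, "nonzero fraction:", float((np.abs(C) > 1e-3).mean()))
```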
+
+
+
+
+ + ☆ Wild Visual Navigation: Fast Traversability Learning via Pre-Trained + Models and Online Self-Supervision + + +
+ Natural environments such as forests and grasslands are challenging for +robotic navigation because high grass, twigs, or bushes can be falsely perceived as rigid obstacles. In this work, we present Wild Visual Navigation (WVN), +an online self-supervised learning system for visual traversability estimation. +The system is able to continuously adapt from a short human demonstration in +the field, using only onboard sensing and computing. One of the key ideas to +achieve this is the use of high-dimensional features from pre-trained +self-supervised models, which implicitly encode semantic information that +massively simplifies the learning task. Further, the development of an online +supervision-generation scheme enables concurrent training and inference of +the learned model in the wild. We demonstrate our approach through diverse +real-world deployments in forests, parks, and grasslands. Our system is able to +bootstrap the traversable terrain segmentation in less than 5 min of in-field +training time, enabling the robot to navigate in complex, previously unseen +outdoor terrains. Code: https://bit.ly/498b0CV - Project +page: https://bit.ly/3M6nMHH + 
+
+ comment: Extended version of arXiv:2305.08510 +
+
+
+
+
+ + ☆ 3DMambaComplete: Exploring Structured State Space Model for Point Cloud + Completion + + +
+ Point cloud completion aims to generate a complete and high-fidelity point +cloud from an initially incomplete and low-quality input. A prevalent strategy +involves leveraging Transformer-based models to encode global features and +facilitate the reconstruction process. However, the adoption of pooling +operations to obtain global feature representations often results in the loss +of local details within the point cloud. Moreover, the attention mechanism +inherent in Transformers introduces additional computational complexity, +rendering it challenging to handle long sequences effectively. To address these +issues, we propose 3DMambaComplete, a point cloud completion network built on +the novel Mamba framework. It comprises three modules: HyperPoint Generation +encodes point cloud features using Mamba's selection mechanism and predicts a +set of Hyperpoints. A specific offset is estimated, and the down-sampled points +become HyperPoints. The HyperPoint Spread module disperses these HyperPoints +across different spatial locations to avoid concentration. Finally, a +deformation method transforms the 2D mesh representation of HyperPoints into a +fine-grained 3D structure for point cloud reconstruction. Extensive experiments +conducted on various established benchmarks demonstrate that 3DMambaComplete +surpasses state-of-the-art point cloud completion methods, as confirmed by +qualitative and quantitative analyses. + +
+
+ comment: 10 pages, 8 figures, 7 tables +
+
+
+
+
+ + ☆ Learning Priors for Non Rigid SfM from Casual Videos + + +
+ We tackle the long-standing challenge of reconstructing 3D structures and +camera positions from videos. The problem is particularly hard when objects are +transformed in a non-rigid way. Current approaches to this problem make +unrealistic assumptions or require a long optimization time. + We present TracksTo4D, a novel deep learning-based approach that enables +inferring 3D structure and camera positions from dynamic content originating +from in-the-wild videos using a single feed-forward pass on a sparse point +track matrix. To achieve this, we leverage recent advances in 2D point tracking +and design an equivariant neural architecture tailored for directly processing +2D point tracks by leveraging their symmetries. TracksTo4D is trained on a +dataset of in-the-wild videos utilizing only the 2D point tracks extracted from +the videos, without any 3D supervision. Our experiments demonstrate that +TracksTo4D generalizes well to unseen videos of unseen semantic categories at +inference time, producing equivalent results to state-of-the-art methods while +significantly reducing the runtime compared to other baselines. + +
+
+
+
+
+ + ☆ MoCap-to-Visual Domain Adaptation for Efficient Human Mesh Estimation + from 2D Keypoints CVPR + + +
+ This paper presents Key2Mesh, a model that takes a set of 2D human pose +keypoints as input and estimates the corresponding body mesh. Since this +process does not involve any visual (i.e. RGB image) data, the model can be +trained on large-scale motion capture (MoCap) datasets, thereby overcoming the +scarcity of image datasets with 3D labels. To enable the model's application on +RGB images, we first run an off-the-shelf 2D pose estimator to obtain the 2D +keypoints, and then feed these 2D keypoints to Key2Mesh. To improve the +performance of our model on RGB images, we apply an adversarial domain +adaptation (DA) method to bridge the gap between the MoCap and visual domains. +Crucially, our DA method does not require 3D labels for visual data, which +enables adaptation to target sets without the need for costly labels. We +evaluate Key2Mesh for the task of estimating 3D human meshes from 2D keypoints, +in the absence of RGB and mesh label pairs. Our results on widely used H3.6M +and 3DPW datasets show that Key2Mesh sets the new state-of-the-art by +outperforming other models in PA-MPJPE for both datasets, and in MPJPE and PVE +for the 3DPW dataset. Thanks to our model's simple architecture, it operates at +least 12x faster than the prior state-of-the-art model, LGD. Additional +qualitative samples and code are available on the project website: +https://key2mesh.github.io/. + +
+
+ comment: accepted to CVPRW 2024 +
+
+
+
+
+ + ☆ VLLMs Provide Better Context for Emotion Understanding Through Common + Sense Reasoning + + +
+ Recognising emotions in context involves identifying the apparent emotions of +an individual, taking into account contextual cues from the surrounding scene. +Previous approaches to this task have involved the design of explicit +scene-encoding architectures or the incorporation of external scene-related +information, such as captions. However, these methods often utilise limited +contextual information or rely on intricate training pipelines. In this work, +we leverage the groundbreaking capabilities of Vision-and-Large-Language Models +(VLLMs) to enhance in-context emotion classification without introducing +complexity to the training process in a two-stage approach. In the first stage, +we propose prompting VLLMs to generate descriptions in natural language of the +subject's apparent emotion relative to the visual context. In the second stage, +the descriptions are used as contextual information and, along with the image +input, are used to train a transformer-based architecture that fuses text and +visual features before the final classification task. Our experimental results +show that the text and image features have complementary information, and our +fused architecture significantly outperforms the individual modalities without +any complex training methods. We evaluate our approach on three different +datasets, namely, EMOTIC, CAER-S, and BoLD, and achieve state-of-the-art or +comparable accuracy across all datasets and metrics compared to much more +complex approaches. The code will be made publicly available on github: +https://github.com/NickyFot/EmoCommonSense.git + +
+
+ comment: A. Xenos, N. Foteinopoulou and I. Ntinou contributed equally to this + work; 14 pages, 5 figures +
+
+
+
+
+ + ☆ Implicit Multi-Spectral Transformer: An Lightweight and Effective + Visible to Infrared Image Translation Model + + +
+ In the field of computer vision, visible light images often exhibit low +contrast in low-light conditions, presenting a significant challenge. While +infrared imagery provides a potential solution, its utilization entails high +costs and practical limitations. Recent advancements in deep learning, +particularly the deployment of Generative Adversarial Networks (GANs), have +facilitated the transformation of visible light images to infrared images. +However, these methods often experience unstable training phases and may +produce suboptimal outputs. To address these issues, we propose a novel +end-to-end Transformer-based model that efficiently converts visible light +images into high-fidelity infrared images. Initially, the Texture Mapping +Module and Color Perception Adapter collaborate to extract texture and color +features from the visible light image. The Dynamic Fusion Aggregation Module +subsequently integrates these features. Finally, the transformation into an +infrared image is refined through the synergistic action of the Color +Perception Adapter and the Enhanced Perception Attention mechanism. +Comprehensive benchmarking experiments confirm that our model outperforms +existing methods, producing infrared images of markedly superior quality, both +qualitatively and quantitatively. Furthermore, the proposed model enables more +effective downstream applications for infrared images than other methods. + +
+
+ comment: Accepted by IJCNN 2024 +
+
+
+
+
+ + ☆ Identification of Fine-grained Systematic Errors via Controlled Scene + Generation + + +
+ Many safety-critical applications, especially in autonomous driving, require +reliable object detectors. They can be very effectively assisted by a method to +search for and identify potential failures and systematic errors before these +detectors are deployed. Systematic errors are characterized by combinations of +attributes such as object location, scale, orientation, and color, as well as +the composition of their respective backgrounds. To identify them, one must +rely on something other than real images from a test set because they do not +account for very rare but possible combinations of attributes. To overcome this +limitation, we propose a pipeline for generating realistic synthetic scenes +with fine-grained control, allowing the creation of complex scenes with +multiple objects. Our approach, BEV2EGO, allows for a realistic generation of +the complete scene with road-contingent control that maps 2D bird's-eye view +(BEV) scene configurations to a first-person view (EGO). In addition, we +propose a benchmark for controlled scene generation to select the most +appropriate generative outpainting model for BEV2EGO. We further use it to +perform a systematic analysis of multiple state-of-the-art object detection +models and discover differences between them. + +
+
+
+
+
+ + ☆ An Evidential-enhanced Tri-Branch Consistency Learning Method for + Semi-supervised Medical Image Segmentation + + +
+ Semi-supervised segmentation presents a promising approach for large-scale
+medical image analysis, effectively reducing annotation burdens while achieving
+comparable performance. This methodology holds substantial potential for streamlining
+the segmentation process and enhancing its feasibility within clinical settings for
+translational investigations. While cross-supervised training, based on distinct
+co-training sub-networks, has become a prevalent paradigm for this task, addressing
+critical issues such as prediction disagreement and label-noise suppression requires
+further attention and progress in cross-supervised training. In this paper, we
+introduce an Evidential Tri-Branch Consistency learning framework (ETC-Net) for
+semi-supervised medical image segmentation. ETC-Net employs three branches: an
+evidential conservative branch, an evidential progressive branch, and an evidential
+fusion branch. The first two branches exhibit complementary characteristics, allowing
+them to address prediction diversity and enhance training stability. We also integrate
+uncertainty estimation from the evidential learning into cross-supervised training,
+mitigating the negative impact of erroneous supervision signals. Additionally, the
+evidential fusion branch capitalizes on the complementary attributes of the first two
+branches and leverages an evidence-based Dempster-Shafer fusion strategy, supervised by
+more reliable and accurate pseudo-labels of unlabeled data. Extensive experiments
+conducted on LA, Pancreas-CT, and ACDC datasets demonstrate that ETC-Net surpasses
+other state-of-the-art methods for semi-supervised segmentation. The code will be made
+available in the near future at https://github.com/Medsemiseg.
+
+
+
+
+
+ + ☆ ORacle: Large Vision-Language Models for Knowledge-Guided Holistic OR + Domain Modeling + + +
+ Every day, countless surgeries are performed worldwide, each within the
+distinct settings of operating rooms (ORs) that vary not only in their setups but also
+in the personnel, tools, and equipment used. This inherent diversity poses a
+substantial challenge for achieving a holistic understanding of the OR, as it requires
+models to generalize beyond their initial training datasets. To reduce this gap, we
+introduce ORacle, an advanced vision-language model designed for holistic OR domain
+modeling, which incorporates multi-view and temporal capabilities and can leverage
+external knowledge during inference, enabling it to adapt to previously unseen surgical
+scenarios. This capability is further enhanced by our novel data augmentation
+framework, which significantly diversifies the training dataset, ensuring ORacle's
+proficiency in applying the provided knowledge effectively. In rigorous testing on
+scene graph generation and downstream tasks on the 4D-OR dataset, ORacle not only
+demonstrates state-of-the-art performance but does so while requiring less data than
+existing models. Furthermore, its adaptability is displayed through its ability to
+interpret unseen views, actions, and appearances of tools and equipment. This
+demonstrates ORacle's potential to significantly enhance the scalability and
+affordability of OR domain modeling and opens a pathway for future advancements in
+surgical data science. We will release our code and data upon acceptance.
+
+
+ comment: 11 pages, 3 figures, 7 tables +
+
+
+
+
+ + ☆ Diffusion-based inpainting of incomplete Euclidean distance matrices of + trajectories generated by a fractional Brownian motion + + +
+ Fractional Brownian trajectories (fBm) feature both randomness and strong
+scale-free correlations, challenging generative models to reproduce the intrinsic
+memory characterizing the underlying process. Here we test a diffusion probabilistic
+model on a specific dataset of corrupted images corresponding to incomplete Euclidean
+distance matrices of fBm at various memory exponents $H$. Our dataset implies
+uniqueness of the data imputation in the regime of low missing ratio, where the
+remaining partial graph is rigid, providing the ground truth for the inpainting. We
+find that the conditional diffusion generation stably reproduces the statistics of
+missing fBm-distributed distances for different values of the $H$ exponent.
+Furthermore, while diffusion models have been recently shown to remember samples from
+the training database, we show that diffusion-based inpainting behaves qualitatively
+differently from database search as the database size increases. Finally, we apply our
+fBm-trained diffusion model with $H=1/3$ for completion of chromosome distance matrices
+obtained in single-cell microscopy experiments, showing its superiority over the
+standard bioinformatics algorithms. Our source code is available on GitHub at
+https://github.com/alobashev/diffusion_fbm.
+
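For readers who want a feel for the kind of training example described here, a small sketch that builds an fBm trajectory, its Euclidean distance matrix, and a low-ratio corruption mask; the paper's exact dataset construction, image encoding, and masking scheme are not reproduced, and the Cholesky-based simulator is simply one standard way to sample fBm.

import numpy as np

def fbm_trajectory(n, H, rng):
    """1D fractional Brownian motion via Cholesky factorization of the
    fractional Gaussian noise covariance (O(n^3), fine for small n)."""
    k = np.arange(n)
    gamma = 0.5 * (np.abs(k + 1) ** (2 * H) + np.abs(k - 1) ** (2 * H) - 2 * np.abs(k) ** (2 * H))
    cov = gamma[np.abs(k[:, None] - k[None, :])]
    increments = np.linalg.cholesky(cov + 1e-10 * np.eye(n)) @ rng.standard_normal(n)
    return np.cumsum(increments)

rng = np.random.default_rng(0)
H = 1 / 3
traj = np.stack([fbm_trajectory(64, H, rng) for _ in range(3)], axis=1)  # a 3D trajectory

# Euclidean distance matrix and a random low-ratio "corruption" mask.
diff = traj[:, None, :] - traj[None, :, :]
edm = np.sqrt((diff ** 2).sum(-1))
mask = rng.random(edm.shape) < 0.1
mask = np.triu(mask, 1)
mask = mask | mask.T                       # keep the corrupted matrix symmetric
corrupted = np.where(mask, np.nan, edm)
print(edm.shape, np.isnan(corrupted).mean())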
+
+
+
+
+ + ☆ Ray-driven Spectral CT Reconstruction Based on Neural Base-Material + Fields + + +
+ In spectral CT reconstruction, the basis materials decomposition involves
+solving a large-scale nonlinear system of integral equations, which is highly ill-posed
+mathematically. This paper proposes a model that parameterizes the attenuation
+coefficients of the object using a neural field representation, thereby avoiding the
+complex calculations of pixel-driven projection coefficient matrices during the
+discretization process of line integrals. It introduces a lightweight discretization
+method for line integrals based on a ray-driven neural field, enhancing the accuracy of
+the integral approximation during the discretization process. The basis materials are
+represented as continuous vector-valued implicit functions to establish a neural field
+parameterization model for the basis materials. The auto-differentiation framework of
+deep learning is then used to solve the implicit continuous function of the neural
+base-material fields. This method is not limited by the spatial resolution of
+reconstructed images, and the network has compact and regular properties. Experimental
+validation shows that our method performs exceptionally well in addressing the spectral
+CT reconstruction problem. Additionally, it fulfils the requirements for generating
+high-resolution reconstructed images.
+
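A toy illustration of a ray-driven line integral through a coordinate-based field, assuming a plain MLP attenuation field and uniform samples along each ray; the paper's actual discretization scheme, basis-material parameterization, and spectral forward model are not shown here.

import torch
import torch.nn as nn

class AttenuationField(nn.Module):
    """Coordinate MLP mapping (x, y) -> non-negative attenuation coefficient."""
    def __init__(self, hidden=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(2, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 1), nn.Softplus(),
        )

    def forward(self, xy):
        return self.net(xy).squeeze(-1)

def line_integral(field, src, dst, n_samples=128):
    """Approximate the integral of mu along the ray src -> dst with uniform samples."""
    t = torch.linspace(0.0, 1.0, n_samples).unsqueeze(-1)   # (n, 1)
    points = src + t * (dst - src)                           # (n, 2)
    seg_len = (dst - src).norm().item() / (n_samples - 1)
    mu = field(points)
    return torch.trapezoid(mu, dx=seg_len)

field = AttenuationField()
proj = line_integral(field, torch.tensor([-1.0, 0.0]), torch.tensor([1.0, 0.3]))
proj.backward()   # gradients flow back to the field parameters via auto-differentiation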
+
+ comment: 14 pages,16 figures +
+
+
+
+
+ + ☆ Accurate Tennis Court Line Detection on Amateur Recorded Matches + + +
+ Typically, tennis court line detection is done by running +Hough-Line-Detection to find straight lines in the image, and then computing a +transformation matrix from the detected lines to create the final court +structure. We propose numerous improvements and enhancements to this algorithm, +including using pretrained State-of-the-Art shadow-removal and object-detection +ML models to make our line-detection more robust. Compared to the original +algorithm, our method can accurately detect lines on amateur, dirty courts. +When combined with a robust ball-tracking system, our method will enable +accurate, automatic refereeing for amateur and professional tennis matches +alike. + +
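The baseline pipeline referred to above can be sketched with standard OpenCV calls; the shadow-removal and object-detection stages the paper adds are omitted, and the image path, Hough thresholds, and pixel correspondences below are placeholders.

import cv2
import numpy as np

img = cv2.imread("court.jpg")                 # placeholder path
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
edges = cv2.Canny(gray, 50, 150)

# Probabilistic Hough transform: candidate court-line segments.
lines = cv2.HoughLinesP(edges, rho=1, theta=np.pi / 180, threshold=80,
                        minLineLength=100, maxLineGap=20)

# Given four corner correspondences (e.g. from intersecting line pairs),
# a homography maps the reference court model onto the image.
court_model_pts = np.float32([[0, 0], [10.97, 0], [10.97, 23.77], [0, 23.77]])  # metres
image_pts = np.float32([[320, 210], [960, 215], [1180, 690], [110, 680]])       # placeholders
H, _ = cv2.findHomography(court_model_pts, image_pts)
projected = cv2.perspectiveTransform(court_model_pts.reshape(-1, 1, 2), H)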
+
+ comment: Accepted to 5th International conference on Image, Video Processing + and Artificial Intelligence +
+
+
+
+
+ + ☆ TrajPRed: Trajectory Prediction with Region-based Relation Learning + + +
+ Forecasting human trajectories in traffic scenes is critical for safety +within mixed or fully autonomous systems. Human future trajectories are driven +by two major stimuli, social interactions, and stochastic goals. Thus, reliable +forecasting needs to capture these two stimuli. Edge-based relation modeling +represents social interactions using pairwise correlations from precise +individual states. Nevertheless, edge-based relations can be vulnerable under +perturbations. To alleviate these issues, we propose a region-based relation +learning paradigm that models social interactions via region-wise dynamics of +joint states, i.e., the changes in the density of crowds. In particular, +region-wise agent joint information is encoded within convolutional feature +grids. Social relations are modeled by relating the temporal changes of local +joint information from a global perspective. We show that region-based +relations are less susceptible to perturbations. In order to account for the +stochastic individual goals, we exploit a conditional variational autoencoder +to realize multi-goal estimation and diverse future prediction. Specifically, +we perform variational inference via the latent distribution, which is +conditioned on the correlation between input states and associated target +goals. Sampling from the latent distribution enables the framework to reliably +capture the stochastic behavior in test data. We integrate multi-goal +estimation and region-based relation learning to model the two stimuli, social +interactions, and stochastic goals, in a prediction framework. We evaluate our +framework on the ETH-UCY dataset and Stanford Drone Dataset (SDD). We show that +the diverse prediction better fits the ground truth when incorporating the +relation module. Our framework outperforms the state-of-the-art models on SDD +by $27.61\%$/$18.20\%$ of ADE/FDE metrics. + +
+
+
+
+
+ + ☆ V-MAD: Video-based Morphing Attack Detection in Operational Scenarios + + +
+ In response to the rising threat of the face morphing attack, this paper +introduces and explores the potential of Video-based Morphing Attack Detection +(V-MAD) systems in real-world operational scenarios. While current morphing +attack detection methods primarily focus on a single or a pair of images, V-MAD +is based on video sequences, exploiting the video streams often acquired by +face verification tools available, for instance, at airport gates. Through this +study, we show for the first time the advantages that the availability of +multiple probe frames can bring to the morphing attack detection task, +especially in scenarios where the quality of probe images is varied and might +be affected, for instance, by pose or illumination variations. Experimental +results on a real operational database demonstrate that video sequences +represent valuable information for increasing the robustness and performance of +morphing attack detection systems. + +
+
+
+
+
+ + ☆ Adversarial purification for no-reference image-quality metrics: + applicability study and new methods + + +
+ Recently, the area of adversarial attacks on image quality metrics has begun
+to be explored, whereas the area of defences remains under-researched. In this study,
+we aim to address this gap and check the transferability of adversarial purification
+defences from image classifiers to IQA methods. In this paper, we apply several
+widespread attacks on IQA models and examine the success of the defences against them.
+The purification methodologies covered different preprocessing techniques, including
+geometrical transformations, compression, denoising, and modern neural network-based
+methods. Also, we address the challenge of assessing the efficacy of a defensive
+methodology by proposing ways to estimate output visual quality and the success of
+neutralizing attacks. Defences were tested against attacks on three IQA metrics --
+Linearity, MetaIQA and SPAQ. The code for attacks and defences is available at: (link
+is hidden for a blind review).
+
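As a concrete example of the preprocessing-style purification defences mentioned, a JPEG re-compression step applied before scoring; the IQA model is abstracted as a callable here, since wrappers for Linearity, MetaIQA, or SPAQ are not part of this sketch.

import io
import numpy as np
from PIL import Image

def jpeg_purify(img: Image.Image, quality: int = 75) -> Image.Image:
    """Re-encode the (possibly adversarially perturbed) image as JPEG to
    suppress high-frequency adversarial noise before it reaches the IQA model."""
    buf = io.BytesIO()
    img.save(buf, format="JPEG", quality=quality)
    buf.seek(0)
    return Image.open(buf).convert("RGB")

def defended_score(iqa_model, img: Image.Image) -> float:
    purified = jpeg_purify(img)
    return float(iqa_model(np.asarray(purified)))   # iqa_model is a stand-in callable

# Usage: defended_score(my_metric, Image.open("attacked.png").convert("RGB"))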
+
+
+
+
+ + ☆ Accelerating Cardiac MRI Reconstruction with CMRatt: An Attention-Driven + Approach + + +
+ Cine cardiac magnetic resonance (CMR) imaging is recognised as the benchmark +modality for the comprehensive assessment of cardiac function. Nevertheless, +the acquisition process of cine CMR is considered as an impediment due to its +prolonged scanning time. One commonly used strategy to expedite the acquisition +process is through k-space undersampling, though it comes with a drawback of +introducing aliasing effects in the reconstructed image. Lately, deep +learning-based methods have shown remarkable results over traditional +approaches in rapidly achieving precise CMR reconstructed images. This study +aims to explore the untapped potential of attention mechanisms incorporated +with a deep learning model within the context of the CMR reconstruction +problem. We are motivated by the fact that attention has proven beneficial in +downstream tasks such as image classification and segmentation, but has not +been systematically analysed in the context of CMR reconstruction. Our primary +goal is to identify the strengths and potential limitations of attention +algorithms when integrated with a convolutional backbone model such as a U-Net. +To achieve this, we benchmark different state-of-the-art spatial and channel +attention mechanisms on the CMRxRecon dataset and quantitatively evaluate the +quality of reconstruction using objective metrics. Furthermore, inspired by the +best performing attention mechanism, we propose a new, simple yet effective, +attention pipeline specifically optimised for the task of cardiac image +reconstruction that outperforms other state-of-the-art attention methods. The +layer and model code will be made publicly available. + +
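For orientation, one widely used channel attention mechanism of the kind benchmarked in such studies is squeeze-and-excitation; the compact PyTorch block below is a generic example, not the paper's proposed attention pipeline.

import torch
import torch.nn as nn

class ChannelAttention(nn.Module):
    """Squeeze-and-excitation style channel attention for a U-Net feature map."""
    def __init__(self, channels: int, reduction: int = 8):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(channels, channels // reduction), nn.ReLU(inplace=True),
            nn.Linear(channels // reduction, channels), nn.Sigmoid(),
        )

    def forward(self, x):                       # x: (B, C, H, W)
        w = x.mean(dim=(2, 3))                  # squeeze: global average pooling
        w = self.fc(w).unsqueeze(-1).unsqueeze(-1)
        return x * w                            # excite: channel-wise reweighting

feat = torch.randn(2, 64, 96, 96)
print(ChannelAttention(64)(feat).shape)         # torch.Size([2, 64, 96, 96])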
+
+ comment: This paper has been submitted for the 32nd European Signal Processing + Conference EUSIPCO 2024 in Lyon +
+
+
+
+
+ + ☆ Efficient and Generic Point Model for Lossless Point Cloud Attribute + Compression + + +
+ The past several years have witnessed the emergence of learned point cloud
+compression (PCC) techniques. However, current learning-based lossless point cloud
+attribute compression (PCAC) methods either suffer from high computational complexity
+or deteriorated compression performance. Moreover, the significant variations in point
+cloud scale and sparsity encountered in real-world applications make developing an
+all-in-one neural model a challenging task. In this paper, we propose PoLoPCAC, an
+efficient and generic lossless PCAC method that achieves high compression efficiency
+and strong generalizability simultaneously. We formulate lossless PCAC as the task of
+inferring explicit distributions of attributes from group-wise autoregressive priors. A
+progressive random grouping strategy is first devised to efficiently resolve the point
+cloud into groups, and then the attributes of each group are modeled sequentially from
+accumulated antecedents. A locality-aware attention mechanism is utilized to exploit
+prior knowledge from context windows in parallel. Since our method directly operates on
+points, it naturally avoids distortion caused by voxelization, and can be executed on
+point clouds with arbitrary scale and density. Experiments show that our method can be
+instantly deployed once trained on a Synthetic 2k-ShapeNet dataset while enjoying
+continuous bit-rate reduction over the latest G-PCCv23 on various datasets (ShapeNet,
+ScanNet, MVUB, 8iVFB). Meanwhile, our method reports shorter coding time than G-PCCv23
+on the majority of sequences with a lightweight model size (2.6MB), which is highly
+attractive for practical applications. Dataset, code and trained model are available at
+https://github.com/I2-Multimedia-Lab/PoLoPCAC.
+
+
+
+
+
+ + ☆ HRVDA: High-Resolution Visual Document Assistant CVPR 2024 + + +
+ Leveraging vast training data, multimodal large language models (MLLMs) have +demonstrated formidable general visual comprehension capabilities and achieved +remarkable performance across various tasks. However, their performance in +visual document understanding still leaves much room for improvement. This +discrepancy is primarily attributed to the fact that visual document +understanding is a fine-grained prediction task. In natural scenes, MLLMs +typically use low-resolution images, leading to a substantial loss of visual +information. Furthermore, general-purpose MLLMs do not excel in handling +document-oriented instructions. In this paper, we propose a High-Resolution +Visual Document Assistant (HRVDA), which bridges the gap between MLLMs and +visual document understanding. This model employs a content filtering mechanism +and an instruction filtering module to separately filter out the +content-agnostic visual tokens and instruction-agnostic visual tokens, thereby +achieving efficient model training and inference for high-resolution images. In +addition, we construct a document-oriented visual instruction tuning dataset +and apply a multi-stage training strategy to enhance the model's document +modeling capabilities. Extensive experiments demonstrate that our model +achieves state-of-the-art performance across multiple document understanding +datasets, while maintaining training efficiency and inference speed comparable +to low-resolution models. + +
+
+ comment: Accepted to CVPR 2024 main conference +
+
+
+
+
+ + ☆ Sparse Global Matching for Video Frame Interpolation with Large Motion CVPR 2024 + + +
+ Large motion poses a critical challenge in Video Frame Interpolation (VFI) +task. Existing methods are often constrained by limited receptive fields, +resulting in sub-optimal performance when handling scenarios with large motion. +In this paper, we introduce a new pipeline for VFI, which can effectively +integrate global-level information to alleviate issues associated with large +motion. Specifically, we first estimate a pair of initial intermediate flows +using a high-resolution feature map for extracting local details. Then, we +incorporate a sparse global matching branch to compensate for flow estimation, +which consists of identifying flaws in initial flows and generating sparse flow +compensation with a global receptive field. Finally, we adaptively merge the +initial flow estimation with global flow compensation, yielding a more accurate +intermediate flow. To evaluate the effectiveness of our method in handling +large motion, we carefully curate a more challenging subset from commonly used +benchmarks. Our method demonstrates the state-of-the-art performance on these +VFI subsets with large motion. + +
+
+ comment: Accepted by CVPR 2024. Project page: https://sgm-vfi.github.io/ +
+
+
+
+
+ + ☆ DreamScene360: Unconstrained Text-to-3D Scene Generation with Panoramic + Gaussian Splatting + + +
+ The increasing demand for virtual reality applications has highlighted the +significance of crafting immersive 3D assets. We present a text-to-3D +360$^{\circ}$ scene generation pipeline that facilitates the creation of +comprehensive 360$^{\circ}$ scenes for in-the-wild environments in a matter of +minutes. Our approach utilizes the generative power of a 2D diffusion model and +prompt self-refinement to create a high-quality and globally coherent panoramic +image. This image acts as a preliminary "flat" (2D) scene representation. +Subsequently, it is lifted into 3D Gaussians, employing splatting techniques to +enable real-time exploration. To produce consistent 3D geometry, our pipeline +constructs a spatially coherent structure by aligning the 2D monocular depth +into a globally optimized point cloud. This point cloud serves as the initial +state for the centroids of 3D Gaussians. In order to address invisible issues +inherent in single-view inputs, we impose semantic and geometric constraints on +both synthesized and input camera views as regularizations. These guide the +optimization of Gaussians, aiding in the reconstruction of unseen regions. In +summary, our method offers a globally consistent 3D scene within a +360$^{\circ}$ perspective, providing an enhanced immersive experience over +existing techniques. Project website at: http://dreamscene360.github.io/ + +
+
+
+
+
+ + ☆ O-TALC: Steps Towards Combating Oversegmentation within Online Action + Segmentation + + +
+ Online temporal action segmentation shows a strong potential to facilitate +many HRI tasks where extended human action sequences must be tracked and +understood in real time. Traditional action segmentation approaches, however, +operate in an offline two stage approach, relying on computationally expensive +video wide features for segmentation, rendering them unsuitable for online HRI +applications. In order to facilitate online action segmentation on a stream of +incoming video data, we introduce two methods for improved training and +inference of backbone action recognition models, allowing them to be deployed +directly for online frame level classification. Firstly, we introduce surround +dense sampling whilst training to facilitate training vs. inference clip +matching and improve segment boundary predictions. Secondly, we introduce an +Online Temporally Aware Label Cleaning (O-TALC) strategy to explicitly reduce +oversegmentation during online inference. As our methods are backbone +invariant, they can be deployed with computationally efficient spatio-temporal +action recognition models capable of operating in real time with a small +segmentation latency. We show our method outperforms similar online action +segmentation work as well as matches the performance of many offline models +with access to full temporal resolution when operating on challenging +fine-grained datasets. + +
+
+ comment: 5 pages, 3 figures. Accepted as a short (unindexed) paper at the + TAHRI conference +
+
+
+
+
+ + ☆ SparseAD: Sparse Query-Centric Paradigm for Efficient End-to-End + Autonomous Driving + + +
+ End-to-End paradigms use a unified framework to implement multi-tasks in an +autonomous driving system. Despite simplicity and clarity, the performance of +end-to-end autonomous driving methods on sub-tasks is still far behind the +single-task methods. Meanwhile, the widely used dense BEV features in previous +end-to-end methods make it costly to extend to more modalities or tasks. In +this paper, we propose a Sparse query-centric paradigm for end-to-end +Autonomous Driving (SparseAD), where the sparse queries completely represent +the whole driving scenario across space, time and tasks without any dense BEV +representation. Concretely, we design a unified sparse architecture for +perception tasks including detection, tracking, and online mapping. Moreover, +we revisit motion prediction and planning, and devise a more justifiable motion +planner framework. On the challenging nuScenes dataset, SparseAD achieves SOTA +full-task performance among end-to-end methods and significantly narrows the +performance gap between end-to-end paradigms and single-task methods. Codes +will be released soon. + +
+
+
+
+
+ + ☆ Research on Detection of Floating Objects in River and Lake Based on AI + Intelligent Image Recognition + + +
+ With the rapid advancement of artificial intelligence technology, AI-enabled
+image recognition has emerged as a potent tool for addressing challenges in traditional
+environmental monitoring. This study focuses on the detection of floating objects in
+river and lake environments, exploring an innovative approach based on deep learning.
+By intricately analyzing the technical pathways for detecting static and dynamic
+features and considering the characteristics of river and lake debris, a comprehensive
+image acquisition and processing workflow has been developed. The study highlights the
+application and performance comparison of three mainstream deep learning models (SSD,
+Faster-RCNN, and YOLOv5) in debris identification. Additionally, a detection system for
+floating objects has been designed and implemented, encompassing both hardware platform
+construction and software framework development. Through rigorous experimental
+validation, the proposed system has demonstrated its ability to significantly enhance
+the accuracy and efficiency of debris detection, thus offering a new technological
+avenue for water quality monitoring in rivers and lakes.
+
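For context, a minimal inference call for one of the compared detectors (YOLOv5) through the public torch.hub interface; the weights choice and image path are placeholders, and the study's own training data and detection system are of course not included.

import torch

# Loads the small pretrained YOLOv5 model from the Ultralytics hub (needs internet and pandas).
model = torch.hub.load("ultralytics/yolov5", "yolov5s", pretrained=True)
results = model("river_frame.jpg")          # accepts a path, URL, PIL image or ndarray
detections = results.pandas().xyxy[0]       # columns: xmin, ymin, xmax, ymax, confidence, class, name
print(detections[["name", "confidence"]].head())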
+
+
+
+
+ + ☆ Fine color guidance in diffusion models and its application to image + compression at extremely low bitrates + + +
+ This study addresses the challenge of controlling, without training or
+fine-tuning, the global color aspect of images generated with a diffusion model. We
+rewrite the guidance equations to ensure that the outputs are closer to a known color
+map, without hindering the quality of the generation. Our method leads to new guidance
+equations. We show that, in the color guidance context, the scaling of the guidance
+should not decrease but should remain high throughout the diffusion process. In a
+second contribution, we apply our guidance in a compression framework, combining
+semantic and general color information about the image to decode the images at low
+cost. We show that our method is effective at improving fidelity and realism of
+compressed images at extremely low bit rates, when compared to other classical or more
+semantic-oriented approaches.
+
+
+ comment: Submitted to IEEE Transactions on Image Processing (TIP) +
+
+
+
+
+ + ☆ RESSCAL3D: Resolution Scalable 3D Semantic Segmentation of Point Clouds + + +
+ While deep learning-based methods have demonstrated outstanding results in +numerous domains, some important functionalities are missing. Resolution +scalability is one of them. In this work, we introduce a novel architecture, +dubbed RESSCAL3D, providing resolution-scalable 3D semantic segmentation of +point clouds. In contrast to existing works, the proposed method does not +require the whole point cloud to be available to start inference. Once a +low-resolution version of the input point cloud is available, first semantic +predictions can be generated in an extremely fast manner. This enables early +decision-making in subsequent processing steps. As additional points become +available, these are processed in parallel. To improve performance, features +from previously computed scales are employed as prior knowledge at the current +scale. Our experiments show that RESSCAL3D is 31-62% faster than the +non-scalable baseline while keeping a limited impact on performance. To the +best of our knowledge, the proposed method is the first to propose a +resolution-scalable approach for 3D semantic segmentation of point clouds based +on deep learning. + +
+
+ comment: Published at 2023 IEEE International Conference on Image Processing + (ICIP) +
+
+
+
+
+ + Monocular 3D lane detection for Autonomous Driving: Recent Achievements, + Challenges, and Outlooks + + +
+ 3D lane detection plays a crucial role in autonomous driving by extracting +structural and traffic information from the road in 3D space to assist the +self-driving car in rational, safe, and comfortable path planning and motion +control. Due to the consideration of sensor costs and the advantages of visual +data in color information, in practical applications, 3D lane detection based +on monocular vision is one of the important research directions in the field of +autonomous driving, which has attracted more and more attention in both +industry and academia. Unfortunately, recent progress in visual perception +seems insufficient to develop completely reliable 3D lane detection algorithms, +which also hinders the development of vision-based fully autonomous +self-driving cars, i.e., achieving level 5 autonomous driving, driving like +human-controlled cars. This is one of the conclusions drawn from this review +paper: there is still a lot of room for improvement and significant +improvements are still needed in the 3D lane detection algorithm for autonomous +driving cars using visual sensors. Motivated by this, this review defines, +analyzes, and reviews the current achievements in the field of 3D lane +detection research, and the vast majority of the current progress relies +heavily on computationally complex deep learning models. In addition, this +review covers the 3D lane detection pipeline, investigates the performance of +state-of-the-art algorithms, analyzes the time complexity of cutting-edge +modeling choices, and highlights the main achievements and limitations of +current research efforts. The survey also includes a comprehensive discussion +of available 3D lane detection datasets and the challenges that researchers +have faced but have not yet resolved. Finally, our work outlines future +research directions and welcomes researchers and practitioners to enter this +exciting field. + +
+
+
+
+
+ + ☆ Multi-Label Continual Learning for the Medical Domain: A Novel Benchmark + + +
+ Multi-label image classification in dynamic environments is a problem that
+poses significant challenges. Previous studies have primarily focused on scenarios such
+as Domain Incremental Learning and Class Incremental Learning, which do not fully
+capture the complexity of real-world applications. In this paper, we study the problem
+of classification of medical imaging in the scenario termed New Instances & New
+Classes, which combines the challenges of both new class arrivals and domain shifts in
+a single framework. Unlike traditional scenarios, it reflects the realistic nature of
+CL in domains such as medical imaging, where updates may introduce both new classes and
+changes in domain characteristics. To address the unique challenges posed by this
+complex scenario, we introduce a novel approach called Pseudo-Label Replay. This method
+aims to mitigate forgetting while adapting to new classes and domain shifts by
+combining the advantages of the Replay and Pseudo-Label methods and solving their
+limitations in the proposed scenario. We evaluate our proposed approach on a
+challenging benchmark consisting of two datasets, seven tasks, and nineteen classes,
+modeling a realistic Continual Learning scenario. Our experimental findings demonstrate
+the effectiveness of Pseudo-Label Replay in addressing the challenges posed by the
+complex scenario proposed. Our method surpasses existing approaches, exhibiting
+superior performance while showing minimal forgetting.
+
+
+
+
+
+ + ☆ UDiFF: Generating Conditional Unsigned Distance Fields with Optimal + Wavelet Diffusion CVPR2024 + + +
+ Diffusion models have shown remarkable results for image generation, editing
+and inpainting. Recent works explore diffusion models for 3D shape generation with
+neural implicit functions, i.e., signed distance function and occupancy function.
+However, they are limited to shapes with closed surfaces, which prevents them from
+generating diverse 3D real-world contents containing open surfaces. In this work, we
+present UDiFF, a 3D diffusion model for unsigned distance fields (UDFs) which is
+capable of generating textured 3D shapes with open surfaces from text conditions or
+unconditionally. Our key idea is to generate UDFs in the spatial-frequency domain with
+an optimal wavelet transformation, which produces a compact representation space for
+UDF generation. Specifically, instead of selecting an appropriate wavelet
+transformation, which requires expensive manual efforts and still leads to large
+information loss, we propose a data-driven approach to learn the optimal wavelet
+transformation for UDFs. We evaluate UDiFF to show our advantages by numerical and
+visual comparisons with the latest methods on widely used benchmarks. Page:
+https://weiqi-zhang.github.io/UDiFF.
+
+
+ comment: To appear at CVPR2024. Project page: + https://weiqi-zhang.github.io/UDiFF +
+
+
+
+
+ + ☆ MoCha-Stereo: Motif Channel Attention Network for Stereo Matching CVPR 2024 + + +
+ Learning-based stereo matching techniques have made significant progress.
+However, existing methods inevitably lose geometrical structure information during the
+feature channel generation process, resulting in edge detail mismatches. In this paper,
+the Motif Channel Attention Stereo Matching Network (MoCha-Stereo) is designed to
+address this problem. We provide the Motif Channel Correlation Volume (MCCV) to
+determine more accurate edge matching costs. MCCV is achieved by projecting motif
+channels, which capture common geometric structures in feature channels, onto feature
+maps and cost volumes. In addition, since edge variations in the feature channels of
+the reconstruction error map also affect detail matching, we propose the Reconstruction
+Error Motif Penalty (REMP) module to further refine the full-resolution disparity
+estimation. REMP integrates the frequency information of typical channel features from
+the reconstruction error. MoCha-Stereo ranks 1st on the KITTI-2015 and KITTI-2012
+Reflective leaderboards. Our structure also shows excellent performance in Multi-View
+Stereo. Code is available at https://github.com/ZYangChen/MoCha-Stereo.
+
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ O2V-Mapping: Online Open-Vocabulary Mapping with Neural Implicit + Representation + + +
+ Online construction of open-ended language scenes is crucial for robotic
+applications, where open-vocabulary interactive scene understanding is required.
+Recently, neural implicit representation has provided a promising direction for online
+interactive mapping. However, implementing open-vocabulary scene understanding
+capability into online neural implicit mapping still faces three challenges: lack of
+local scene updating ability, blurry spatial hierarchical semantic segmentation and
+difficulty in maintaining multi-view consistency. To this end, we propose O2V-mapping,
+which utilizes voxel-based language and geometric features to create an open-vocabulary
+field, thus allowing for local updates during the online training process.
+Additionally, we leverage a foundational model for image segmentation to extract
+language features on object-level entities, achieving clear segmentation boundaries and
+hierarchical semantic features. For the purpose of preserving consistency in 3D object
+properties across different viewpoints, we propose a spatial adaptive voxel adjustment
+mechanism and a multi-view weight selection method. Extensive experiments on
+open-vocabulary object localization and semantic segmentation demonstrate that
+O2V-mapping achieves online construction of language scenes while enhancing accuracy,
+outperforming the previous SOTA method.
+
+
+
+
+
+ + ☆ Tuning-Free Adaptive Style Incorporation for Structure-Consistent + Text-Driven Style Transfer + + +
+ In this work, we target the task of text-driven style transfer in the context
+of text-to-image (T2I) diffusion models. The main challenge is consistent structure
+preservation while enabling effective style transfer effects. The past approaches in
+this field directly concatenate the content and style prompts for a prompt-level style
+injection, leading to unavoidable structure distortions. In this work, we propose a
+novel solution to the text-driven style transfer task, namely, Adaptive Style
+Incorporation (ASI), to achieve fine-grained feature-level style incorporation. It
+consists of the Siamese Cross-Attention (SiCA) to decouple the single-track
+cross-attention into a dual-track structure to obtain separate content and style
+features, and the Adaptive Content-Style Blending (AdaBlending) module to couple the
+content and style information in a structure-consistent manner. Experimentally, our
+method exhibits much better performance in both structure preservation and stylized
+effects.
+
+
+
+
+
+ + ☆ SplatPose & Detect: Pose-Agnostic 3D Anomaly Detection CVPR 2024 + + +
+ Detecting anomalies in images has become a well-explored problem in both +academia and industry. State-of-the-art algorithms are able to detect defects +in increasingly difficult settings and data modalities. However, most current +methods are not suited to address 3D objects captured from differing poses. +While solutions using Neural Radiance Fields (NeRFs) have been proposed, they +suffer from excessive computation requirements, which hinder real-world +usability. For this reason, we propose the novel 3D Gaussian splatting-based +framework SplatPose which, given multi-view images of a 3D object, accurately +estimates the pose of unseen views in a differentiable manner, and detects +anomalies in them. We achieve state-of-the-art results in both training and +inference speed, and detection performance, even when using less training data +than competing methods. We thoroughly evaluate our framework using the recently +proposed Pose-agnostic Anomaly Detection benchmark and its multi-pose anomaly +detection (MAD) data set. + +
+
+ comment: Visual Anomaly and Novelty Detection 2.0 Workshop at CVPR 2024 +
+
+
+
+
+ + ☆ Zero-shot Point Cloud Completion Via 2D Priors + + +
+ 3D point cloud completion is designed to recover complete shapes from
+partially observed point clouds. Conventional completion methods typically depend on
+extensive point cloud data for training, with their effectiveness often constrained to
+object categories similar to those seen during training. In contrast, we propose a
+zero-shot framework aimed at completing partially observed point clouds across any
+unseen categories. Leveraging point rendering via Gaussian Splatting, we develop
+techniques of Point Cloud Colorization and Zero-shot Fractal Completion that utilize 2D
+priors from pre-trained diffusion models to infer missing regions. Experimental results
+on both synthetic and real-world scanned point clouds demonstrate that our approach
+outperforms existing methods in completing a variety of objects without any requirement
+for specific training data.
+
+
+
+
+
+ + ☆ MedRG: Medical Report Grounding with Multi-modal Large Language Model + + +
+ Medical Report Grounding is pivotal in identifying the most relevant regions +in medical images based on a given phrase query, a critical aspect in medical +image analysis and radiological diagnosis. However, prevailing visual grounding +approaches necessitate the manual extraction of key phrases from medical +reports, imposing substantial burdens on both system efficiency and physicians. +In this paper, we introduce a novel framework, Medical Report Grounding +(MedRG), an end-to-end solution for utilizing a multi-modal Large Language +Model to predict key phrase by incorporating a unique token, BOX, into the +vocabulary to serve as an embedding for unlocking detection capabilities. +Subsequently, the vision encoder-decoder jointly decodes the hidden embedding +and the input medical image, generating the corresponding grounding box. The +experimental results validate the effectiveness of MedRG, surpassing the +performance of the existing state-of-the-art medical phrase grounding methods. +This study represents a pioneering exploration of the medical report grounding +task, marking the first-ever endeavor in this domain. + +
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ☆ Urban Architect: Steerable 3D Urban Scene Generation with Layout Prior + + +
+ Text-to-3D generation has achieved remarkable success via large-scale +text-to-image diffusion models. Nevertheless, there is no paradigm for scaling +up the methodology to urban scale. Urban scenes, characterized by numerous +elements, intricate arrangement relationships, and vast scale, present a +formidable barrier to the interpretability of ambiguous textual descriptions +for effective model optimization. In this work, we surmount the limitations by +introducing a compositional 3D layout representation into text-to-3D paradigm, +serving as an additional prior. It comprises a set of semantic primitives with +simple geometric structures and explicit arrangement relationships, +complementing textual descriptions and enabling steerable generation. Upon +this, we propose two modifications -- (1) We introduce Layout-Guided +Variational Score Distillation to address model optimization inadequacies. It +conditions the score distillation sampling process with geometric and semantic +constraints of 3D layouts. (2) To handle the unbounded nature of urban scenes, +we represent 3D scene with a Scalable Hash Grid structure, incrementally +adapting to the growing scale of urban scenes. Extensive experiments +substantiate the capability of our framework to scale text-to-3D generation to +large-scale urban scenes that cover over 1000m driving distance for the first +time. We also present various scene editing demonstrations, showing the powers +of steerable urban scene generation. Website: https://urbanarchitect.github.io. + +
+
+ comment: Project page: https://urbanarchitect.github.io/ +
+
+
+
+
+ + ☆ Efficient and Scalable Chinese Vector Font Generation via Component + Composition + + +
+ Chinese vector font generation is challenging due to the complex structure
+and huge number of Chinese characters. Recent advances remain limited to generating a
+small set of characters with simple structure. In this work, we first observe that most
+Chinese characters can be disassembled into frequently-reused components. Therefore, we
+introduce the first efficient and scalable Chinese vector font generation approach via
+component composition, allowing numerous vector characters to be generated from a small
+set of components. To achieve this, we collect a large-scale dataset that contains over
+90K Chinese characters with their components and layout information. Upon the dataset,
+we propose a simple yet effective framework based on spatial transformer networks (STN)
+and multiple losses tailored to font characteristics to learn the affine transformation
+of the components, which can be directly applied to the Bézier curves, resulting in
+Chinese characters in vector format. Our qualitative and quantitative experiments have
+demonstrated that our method significantly surpasses the state-of-the-art vector font
+generation methods in generating large-scale complex Chinese characters in both font
+generation and zero-shot font extension.
+
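The core operation the framework relies on, applying a predicted affine transform directly to the Bézier control points of a component, is easy to state in code; the transform below is an arbitrary stand-in for an STN prediction, and the control points are toy values.

import numpy as np

def transform_bezier(control_pts: np.ndarray, A: np.ndarray, t: np.ndarray) -> np.ndarray:
    """Affine-transform cubic Bézier control points: p' = A @ p + t.
    Because Bézier curves are affine-invariant, transforming the control
    points transforms the whole curve."""
    return control_pts @ A.T + t

# One cubic segment of a component outline (4 control points, x/y in em units).
segment = np.array([[0.10, 0.20], [0.15, 0.35], [0.30, 0.40], [0.42, 0.38]])
A = np.array([[0.8, 0.05], [-0.05, 0.8]])    # slight scale + shear (stand-in for STN output)
t = np.array([0.30, 0.10])                   # translation within the glyph box
print(transform_bezier(segment, A, t))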
+
+ comment: 15 pages, 23 figures +
+
+
+
+
+ + ☆ Logit Calibration and Feature Contrast for Robust Federated Learning on + Non-IID Data + + +
+ Federated learning (FL) is a privacy-preserving distributed framework for
+collaborative model training on devices in edge networks. However, challenges arise due
+to vulnerability to adversarial examples (AEs) and the non-independent and identically
+distributed (non-IID) nature of data distribution among devices, hindering the
+deployment of adversarially robust and accurate learning models at the edge. While
+adversarial training (AT) is commonly acknowledged as an effective defense strategy
+against adversarial attacks in centralized training, we shed light on the adverse
+effects of directly applying AT in FL that can severely compromise accuracy, especially
+in non-IID challenges. Given this limitation, this paper proposes FatCC, which
+incorporates local logit Calibration and global feature Contrast into the vanilla
+Federated Adversarial Training (FAT) process from both logit and feature perspectives.
+This approach can effectively enhance the federated system's robust accuracy (RA) and
+clean accuracy (CA). First, we propose logit calibration, where the logits are
+calibrated during local adversarial updates, thereby improving adversarial robustness.
+Second, FatCC introduces feature contrast, which involves a global alignment term that
+aligns each local representation with unbiased global features, thus further enhancing
+robustness and accuracy in federated adversarial environments. Extensive experiments
+across multiple datasets demonstrate that FatCC achieves comparable or superior
+performance gains in both CA and RA compared to other baselines.
+
+
+
+
+
+ + ☆ Adapting LLaMA Decoder to Vision Transformer + + +
+ This work examines whether decoder-only Transformers such as LLaMA, which
+were originally designed for large language models (LLMs), can be adapted to the
+computer vision field. We first "LLaMAfy" a standard ViT step-by-step to align with
+LLaMA's architecture, and find that directly applying a causal mask to the
+self-attention brings an attention collapse issue, resulting in the failure of network
+training. We suggest repositioning the class token behind the image tokens with a
+post-sequence class token technique to overcome this challenge, enabling causal
+self-attention to efficiently capture the entire image's information. Additionally, we
+develop a soft mask strategy that gradually introduces a causal mask to the
+self-attention at the onset of training to facilitate the optimization behavior. The
+tailored model, dubbed image LLaMA (iLLaMA), is akin to LLaMA in architecture and
+enables direct supervised learning. Its causal self-attention boosts computational
+efficiency and learns complex representations by elevating attention map ranks. iLLaMA
+rivals the performance of its encoder-only counterparts, achieving 75.1% ImageNet top-1
+accuracy with only 5.7M parameters. Scaling the model to ~310M and pre-training on
+ImageNet-21K further enhances the accuracy to 86.0%. Extensive experiments demonstrate
+iLLaMA's reliable properties: calibration, shape-texture bias, quantization
+compatibility, ADE20K segmentation and CIFAR transfer learning. We hope our study can
+kindle fresh views on visual model design in the wave of LLMs. Pre-trained models and
+codes are available here.
+
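A sketch of how a gradually introduced ("soft") causal mask could be realized as an additive attention bias, under one plausible reading of the abstract; the linear schedule, bias scale, and blending rule are assumptions rather than the authors' exact recipe.

import torch

def soft_causal_bias(seq_len: int, alpha: float, max_bias: float = -10.0) -> torch.Tensor:
    """Additive attention bias interpolating from fully bidirectional (alpha=0)
    to effectively causal (alpha=1); exp(-10) ~ 5e-5, i.e. nearly masked out."""
    causal = torch.triu(torch.ones(seq_len, seq_len), diagonal=1)  # 1 above the diagonal
    return alpha * max_bias * causal

def alpha_schedule(step: int, warmup_steps: int) -> float:
    return min(1.0, step / warmup_steps)   # assumed linear ramp over early training

scores = torch.randn(2, 4, 16, 16)                     # (batch, heads, tokens, tokens)
bias = soft_causal_bias(16, alpha_schedule(step=500, warmup_steps=2000))
attn = torch.softmax(scores + bias, dim=-1)            # bias broadcasts over batch and heads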
+
+ comment: 22 pages, 10 figures +
+
+
+
+
+ + ☆ MonoSelfRecon: Purely Self-Supervised Explicit Generalizable 3D + Reconstruction of Indoor Scenes from Monocular RGB Views + + +
+ Current monocular 3D scene reconstruction (3DR) works are either
+fully-supervised, or not generalizable, or implicit in 3D representation. We propose a
+novel framework, MonoSelfRecon, that for the first time achieves explicit 3D mesh
+reconstruction for generalizable indoor scenes with monocular RGB views by pure
+self-supervision on voxel-SDF (signed distance function). MonoSelfRecon follows an
+Autoencoder-based architecture and decodes voxel-SDF together with a generalizable
+Neural Radiance Field (NeRF), which is used to guide the voxel-SDF in self-supervision.
+We propose novel self-supervised losses, which not only support pure self-supervision,
+but can be used together with supervised signals to further boost supervised training.
+Our experiments show that "MonoSelfRecon" trained in pure self-supervision outperforms
+current best self-supervised indoor depth estimation models and is comparable to 3DR
+models trained with full supervision on depth annotations. MonoSelfRecon is not
+restricted to a specific model design and can be applied to any model with voxel-SDF in
+a purely self-supervised manner.
+
+
+
+
+
+ + ☆ YOLO based Ocean Eddy Localization with AWS SageMaker + + +
+ Ocean eddies play a significant role both on the sea surface and beneath it,
+contributing to the sustainability of marine life dependent on oceanic behaviors.
+Therefore, it is crucial to investigate ocean eddies to monitor changes in the Earth,
+particularly in the oceans, and their impact on climate. This study aims to pinpoint
+ocean eddies using AWS cloud services, specifically SageMaker. The primary objective is
+to detect small-scale (<20km) ocean eddies from satellite remote images and assess the
+feasibility of utilizing SageMaker, which offers tools for deploying AI applications.
+Moreover, this research not only explores the deployment of cloud-based services for
+remote sensing of Earth data but also evaluates several YOLO (You Only Look Once)
+models using single and multi-GPU-based services in the cloud. Furthermore, this study
+underscores the potential of these services, their limitations, challenges related to
+deployment and resource management, and their user-friendliness for Earth science
+projects.
+
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ An Animation-based Augmentation Approach for Action Recognition from + Discontinuous Video + + +
+ The study of action recognition has attracted considerable attention recently
+due to its broad applications in multiple areas. However, the issue of discontinuous
+training video, which not only decreases the performance of action recognition models
+but also complicates the data augmentation process, remains under-explored. In this
+study, we introduce 4A (Action Animation-based Augmentation Approach), an innovative
+data augmentation pipeline that addresses this problem. The main contributions of our
+work include: (1) we investigate the severe performance decrease caused by training
+action recognition models on discontinuous video, and the limitations of existing
+augmentation methods in solving this problem; (2) we propose a novel augmentation
+pipeline, 4A, to address the problem of discontinuous training video while achieving a
+smoother and more natural-looking action representation than the latest data
+augmentation methodology; (3) by employing our data augmentation techniques, we achieve
+the same performance with only 10% of the original real-world training data as with all
+of it, and better performance on in-the-wild videos.
+
+
+
+
+
+ + ☆ Bayesian NeRF: Quantifying Uncertainty with Volume Density in Neural + Radiance Fields + + +
+ We present the Bayesian Neural Radiance Field (NeRF), which explicitly +quantifies uncertainty in geometric volume structures without the need for +additional networks, making it adept for challenging observations and +uncontrolled images. NeRF diverges from traditional geometric methods by +offering an enriched scene representation, rendering color and density in 3D +space from various viewpoints. However, NeRF encounters limitations in relaxing +uncertainties by using geometric structure information, leading to inaccuracies +in interpretation under insufficient real-world observations. Recent research +efforts aimed at addressing this issue have primarily relied on empirical +methods or auxiliary networks. To fundamentally address this issue, we propose +a series of formulational extensions to NeRF. By introducing generalized +approximations and defining density-related uncertainty, our method seamlessly +extends to manage uncertainty not only for RGB but also for depth, without the +need for additional networks or empirical assumptions. In experiments we show +that our method significantly enhances performance on RGB and depth images in +the comprehensive dataset, demonstrating the reliability of the Bayesian NeRF +approach to quantifying uncertainty based on the geometric structure. + +
+
+
+
+
+ + ☆ Sparse Points to Dense Clouds: Enhancing 3D Detection with Limited LiDAR + Data + + +
+ 3D detection is a critical task that enables machines to identify and locate
+objects in three-dimensional space. It has a broad range of applications in several
+fields, including autonomous driving, robotics and augmented reality. Monocular 3D
+detection is attractive as it requires only a single camera; however, it lacks the
+accuracy and robustness required for real-world applications. High-resolution LiDAR, on
+the other hand, can be expensive and lead to interference problems in heavy traffic
+given its active transmissions. We propose a balanced approach that combines the
+advantages of monocular and point cloud-based 3D detection. Our method requires only a
+small number of 3D points that can be obtained from a low-cost, low-resolution sensor.
+Specifically, we use only 512 points, which is just 1% of a full LiDAR frame in the
+KITTI dataset. Our method reconstructs a complete 3D point cloud from this limited 3D
+information combined with a single image. The reconstructed 3D point cloud and
+corresponding image can be used by any multi-modal off-the-shelf detector for 3D object
+detection. By using the proposed network architecture with an off-the-shelf multi-modal
+3D detector, the accuracy of 3D detection improves by 20% compared to the
+state-of-the-art monocular detection methods and by 6% to 9% compared to the baseline
+multi-modal methods on the KITTI and JackRabbot datasets.
+
+
+
+
+
+ + ☆ SpikeNVS: Enhancing Novel View Synthesis from Blurry Images via Spike + Camera + + +
+ One of the most critical factors in achieving sharp Novel View Synthesis
+(NVS) using neural field methods like Neural Radiance Fields (NeRF) and 3D Gaussian
+Splatting (3DGS) is the quality of the training images. However, conventional RGB
+cameras are susceptible to motion blur. In contrast, neuromorphic cameras like event
+and spike cameras inherently capture more comprehensive temporal information, which can
+provide a sharp representation of the scene as additional training data. Recent methods
+have explored the integration of event cameras to improve the quality of NVS. The
+event-RGB approaches have some limitations, such as high training costs and the
+inability to work effectively on background regions. Instead, our study introduces a
+new method that uses the spike camera to overcome these limitations. By considering
+texture reconstruction from spike streams as ground truth, we design the Texture from
+Spike (TfS) loss. Since the spike camera relies on temporal integration instead of the
+temporal differentiation used by event cameras, our proposed TfS loss maintains
+manageable training costs. It handles foreground objects and backgrounds
+simultaneously. We also provide a real-world dataset captured with our spike-RGB camera
+system to facilitate future research endeavors. We conduct extensive experiments using
+synthetic and real-world datasets to demonstrate that our design can enhance novel view
+synthesis across NeRF and 3DGS. The code and dataset will be made available for public
+access.
+
+
+
+
+
+ + ☆ Convolution-based Probability Gradient Loss for Semantic Segmentation + + +
+ In this paper, we introduce a novel Convolution-based Probability Gradient +(CPG) loss for semantic segmentation. It employs convolution kernels similar to +the Sobel operator, capable of computing the gradient of pixel intensity in an +image. This enables the computation of gradients for both ground-truth and +predicted category-wise probabilities. It enhances network performance by +maximizing the similarity between these two probability gradients. Moreover, to +specifically enhance accuracy near the object's boundary, we extract the object +boundary based on the ground-truth probability gradient and exclusively apply +the CPG loss to pixels belonging to boundaries. CPG loss proves to be highly +convenient and effective. It establishes pixel relationships through +convolution, calculating errors from a distinct dimension compared to +pixel-wise loss functions such as cross-entropy loss. We conduct qualitative +and quantitative analyses to evaluate the impact of the CPG loss on three +well-established networks (DeepLabv3-Resnet50, HRNetV2-OCR, and +LRASPP_MobileNet_V3_Large) across three standard segmentation datasets +(Cityscapes, COCO-Stuff, ADE20K). Our extensive experimental results +consistently and significantly demonstrate that the CPG loss enhances the mean +Intersection over Union. + +
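A compact sketch of a probability-gradient loss in this spirit: depthwise Sobel convolutions over the ground-truth and predicted class-probability maps, followed by an L1 penalty on their difference. The kernel choice and the boundary-masking step are simplified relative to the paper.

import torch
import torch.nn.functional as F

def probability_gradient(prob: torch.Tensor) -> torch.Tensor:
    """prob: (B, C, H, W) per-class probabilities. Returns stacked x/y gradients."""
    sobel_x = torch.tensor([[-1., 0., 1.], [-2., 0., 2.], [-1., 0., 1.]])
    sobel_y = sobel_x.t()
    kernels = torch.stack([sobel_x, sobel_y]).unsqueeze(1)          # (2, 1, 3, 3)
    c = prob.shape[1]
    kernels = kernels.repeat(c, 1, 1, 1).to(prob)                   # depthwise over the C classes
    return F.conv2d(prob, kernels, padding=1, groups=c)             # (B, 2C, H, W)

def cpg_style_loss(pred_logits: torch.Tensor, target_prob: torch.Tensor) -> torch.Tensor:
    pred_prob = pred_logits.softmax(dim=1)
    return F.l1_loss(probability_gradient(pred_prob), probability_gradient(target_prob))

labels = torch.randint(0, 19, (2, 64, 64))
target = F.one_hot(labels, 19).permute(0, 3, 1, 2).float()
loss = cpg_style_loss(torch.randn(2, 19, 64, 64, requires_grad=True), target)
loss.backward()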
+
+ comment: 12 pages, 7 figures +
+
+
+
+
+ + ☆ Scaling Multi-Camera 3D Object Detection through Weak-to-Strong + Eliciting + + +
+ The emergence of Multi-Camera 3D Object Detection (MC3D-Det), facilitated by +bird's-eye view (BEV) representation, signifies a notable progression in 3D +object detection. Scaling MC3D-Det training effectively accommodates varied +camera parameters and urban landscapes, paving the way for the MC3D-Det +foundation model. However, the multi-view fusion stage of the MC3D-Det method +relies on the ill-posed monocular perception during training rather than +surround refinement ability, leading to what we term "surround refinement +degradation". To this end, our study presents a weak-to-strong eliciting +framework aimed at enhancing surround refinement while maintaining robust +monocular perception. Specifically, our framework employs weakly tuned experts +trained on distinct subsets, and each is inherently biased toward specific +camera configurations and scenarios. These biased experts can learn the +perception of monocular degeneration, which can help the multi-view fusion +stage to enhance surround refinement abilities. Moreover, a composite +distillation strategy is proposed to integrate the universal knowledge of 2D +foundation models and task-specific information. Finally, for MC3D-Det joint +training, the elaborate dataset merge strategy is designed to solve the problem +of inconsistent camera numbers and camera parameters. We set up a multiple +dataset joint training benchmark for MC3D-Det and adequately evaluated existing +methods. Further, we demonstrate the proposed framework brings a generalized +and significant boost over multiple baselines. Our code is at +\url{https://github.com/EnVision-Research/Scale-BEV}. + +
+
+
+
+
+ + ☆ Binomial Self-compensation for Motion Error in Dynamic 3D Scanning + + +
+ Phase shifting profilometry (PSP) is favored in high-precision 3D scanning +due to its high accuracy, robustness, and pixel-wise property. However, a +fundamental assumption of PSP that the object should remain static is violated +in dynamic measurement, making PSP susceptible to object moving, resulting in +ripple-like errors in the point clouds. We propose a pixel-wise and frame-wise +loopable binomial self-compensation (BSC) algorithm to effectively and flexibly +eliminate motion error in the four-step PSP. Our mathematical model +demonstrates that by summing successive motion-affected phase frames weighted +by binomial coefficients, motion error exponentially diminishes as the binomial +order increases, accomplishing automatic error compensation through the +motion-affected phase sequence, without the assistance of any intermediate +variable. Extensive experiments show that our BSC outperforms the existing +methods in reducing motion error, while achieving a depth map frame rate equal +to the camera's acquisition rate (90 fps), enabling high-accuracy 3D +reconstruction with a quasi-single-shot frame rate. + +
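The weighting at the heart of the compensation can be written down directly; the snippet below only illustrates a binomial-coefficient-weighted combination of successive phase maps (with an assumed normalization), not the authors' full four-step PSP pipeline or their error analysis.

import numpy as np
from math import comb

def binomial_compensation(phase_frames: np.ndarray) -> np.ndarray:
    """phase_frames: (K+1, H, W) successive motion-affected phase maps.
    Returns the order-K binomially weighted combination, in which the
    motion-error terms are progressively attenuated."""
    k = phase_frames.shape[0] - 1
    weights = np.array([comb(k, i) for i in range(k + 1)], dtype=np.float64)
    weights /= weights.sum()                      # assumed normalization to preserve the phase scale
    return np.tensordot(weights, phase_frames, axes=1)

frames = np.random.rand(4, 480, 640)              # e.g. binomial order 3
compensated = binomial_compensation(frames)
print(compensated.shape)                          # (480, 640)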
+
+
+
+
+ + ☆ Perception-Oriented Video Frame Interpolation via Asymmetric Blending CVPR 2024 + + +
+ Previous methods for Video Frame Interpolation (VFI) have encountered +challenges, notably the manifestation of blur and ghosting effects. These +issues can be traced back to two pivotal factors: unavoidable motion errors and +misalignment in supervision. In practice, motion estimates often prove to be +error-prone, resulting in misaligned features. Furthermore, the reconstruction +loss tends to bring blurry results, particularly in misaligned regions. To +mitigate these challenges, we propose a new paradigm called PerVFI +(Perception-oriented Video Frame Interpolation). Our approach incorporates an +Asymmetric Synergistic Blending module (ASB) that utilizes features from both +sides to synergistically blend intermediate features. One reference frame +emphasizes primary content, while the other contributes complementary +information. To impose a stringent constraint on the blending process, we +introduce a self-learned sparse quasi-binary mask which effectively mitigates +ghosting and blur artifacts in the output. Additionally, we employ a +normalizing flow-based generator and utilize the negative log-likelihood loss +to learn the conditional distribution of the output, which further facilitates +the generation of clear and fine details. Experimental results validate the +superiority of PerVFI, demonstrating significant improvements in perceptual +quality compared to existing methods. Codes are available at +\url{https://github.com/mulns/PerVFI} + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Unsupervised Visible-Infrared ReID via Pseudo-label Correction and + Modality-level Alignment + + +
+ Unsupervised visible-infrared person re-identification (UVI-ReID) has +recently gained great attention due to its potential for enhancing human +detection in diverse environments without labeling. Previous methods utilize +intra-modality clustering and cross-modality feature matching to achieve +UVI-ReID. However, there exist two challenges: 1) noisy pseudo labels might be +generated in the clustering process, and 2) the cross-modality feature +alignment via matching the marginal distribution of visible and infrared +modalities may misalign the different identities from two modalities. In this +paper, we first conduct a theoretical analysis where an interpretable +generalization upper bound is introduced. Based on the analysis, we then +propose a novel unsupervised cross-modality person re-identification framework +(PRAISE). Specifically, to address the first challenge, we propose a +pseudo-label correction strategy that utilizes a Beta Mixture Model to predict +the probability of mis-clustering based on the network's memory effect and rectifies +the correspondence by adding a perceptual term to contrastive learning. Next, +we introduce a modality-level alignment strategy that generates paired +visible-infrared latent features and reduces the modality gap by aligning the +labeling function of visible and infrared features to learn identity +discriminative and modality-invariant features. Experimental results on two +benchmark datasets demonstrate that our method achieves state-of-the-art +performance compared to existing unsupervised visible-infrared ReID methods. + +
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ☆ SafeGen: Mitigating Unsafe Content Generation in Text-to-Image Models + + +
+ Text-to-image (T2I) models, such as Stable Diffusion, have exhibited +remarkable performance in generating high-quality images from text descriptions +in recent years. However, text-to-image models may be tricked into generating +not-safe-for-work (NSFW) content, particularly in sexual scenarios. Existing +countermeasures mostly focus on filtering inappropriate inputs and outputs, or +suppressing improper text embeddings, which can block explicit NSFW-related +content (e.g., naked or sexy) but may still be vulnerable to adversarial +prompt inputs that appear innocent but are ill-intended. In this paper, we +present SafeGen, a framework to mitigate unsafe content generation by +text-to-image models in a text-agnostic manner. The key idea is to eliminate +unsafe visual representations from the model regardless of the text input. In +this way, the text-to-image model is resistant to adversarial prompts since +unsafe visual representations are obstructed from within. Extensive experiments +conducted on four datasets demonstrate SafeGen's effectiveness in mitigating +unsafe content generation while preserving the high fidelity of benign images. +SafeGen outperforms eight state-of-the-art baseline methods and achieves 99.1% +sexual content removal performance. Furthermore, our constructed benchmark of +adversarial prompts provides a basis for future development and evaluation of +anti-NSFW-generation methods. + +
+
+
+
+
+ + ☆ Deep Generative Data Assimilation in Multimodal Setting CVPR2024 + + +
+ Robust integration of physical knowledge and data is key to improving +computational simulations, such as Earth system models. Data assimilation is +crucial for achieving this goal because it provides a systematic framework to +calibrate model outputs with observations, which can include remote sensing +imagery and ground station measurements, with uncertainty quantification. +Conventional methods, including Kalman filters and variational approaches, +inherently rely on simplifying linear and Gaussian assumptions, and can be +computationally expensive. Nevertheless, with the rapid adoption of data-driven +methods in many areas of computational sciences, we see the potential of +emulating traditional data assimilation with deep learning, especially +generative models. In particular, the diffusion-based probabilistic framework +has large overlaps with data assimilation principles: both allow for +conditional generation of samples with a Bayesian inverse framework. These +models have shown remarkable success in text-conditioned image generation or +image-controlled video synthesis. Likewise, one can frame data assimilation as +observation-conditioned state calibration. In this work, we propose SLAMS: +Score-based Latent Assimilation in Multimodal Setting. Specifically, we +assimilate in-situ weather station data and ex-situ satellite imagery to +calibrate the vertical temperature profiles, globally. Through extensive +ablation, we demonstrate that SLAMS is robust even in low-resolution, noisy, +and sparse data settings. To our knowledge, our work is the first to apply a deep +generative framework to multimodal data assimilation using real-world +datasets; an important step for building robust computational simulators, +including the next-generation Earth system models. Our code is available at: +https://github.com/yongquan-qu/SLAMS + +
+
+ comment: Accepted to CVPR2024 EarthVision +
+
+
+
+
+ + ☆ Multi-modal Document Presentation Attack Detection With Forensics Trace + Disentanglement + + +
+ Document Presentation Attack Detection (DPAD) is an important measure in +protecting the authenticity of a document image. However, recent DPAD methods +demand additional resources, such as manual effort in collecting additional +data or knowing the parameters of acquisition devices. This work proposes a +DPAD method based on multi-modal disentangled traces (MMDT) without the above +drawbacks. We first disentangle the recaptured traces by a self-supervised +disentanglement and synthesis network to enhance the generalization capacity in +document images with different contents and layouts. Then, unlike the existing +DPAD approaches that rely only on data in the RGB domain, we propose to +explicitly employ the disentangled recaptured traces as new modalities in the +transformer backbone through adaptive multi-modal adapters to fuse RGB/trace +features efficiently. Visualization of the disentangled traces confirms the +effectiveness of the proposed method in different document contents. Extensive +experiments on three benchmark datasets demonstrate the superiority of our MMDT +method on representing forensic traces of recapturing distortion. + +
+
+ comment: Accepted to ICME 2024 +
+
+
+
+
+ + ☆ Efficient Denoising using Score Embedding in Score-based Diffusion + Models + + +
+ It is well known that training a denoising score-based diffusion model +requires tens of thousands of epochs and a substantial amount of image data. In this paper, we propose to increase the efficiency in +training score-based diffusion models. Our method allows us to decrease the +number of epochs needed to train the diffusion model. We accomplish this by +solving the log-density Fokker-Planck (FP) Equation numerically to compute the +score \textit{before} training. The pre-computed score is embedded into the +image to encourage faster training under the sliced Wasserstein distance. +Consequently, it also allows us to decrease the number of images we need to +train the neural network to learn an accurate score. We demonstrate through our +numerical experiments the improved performance of our proposed method compared +to standard score-based diffusion models. Our proposed method achieves quality +similar to that of the standard method while training meaningfully faster. + +
+
+
+
+
+ + ♻ ☆ Disentangled Explanations of Neural Network Predictions by Finding + Relevant Subspaces + + +
+ Explainable AI aims to overcome the black-box nature of complex ML models +like neural networks by generating explanations for their predictions. +Explanations often take the form of a heatmap identifying input features (e.g. +pixels) that are relevant to the model's decision. These explanations, however, +entangle the potentially multiple factors that enter into the overall complex +decision strategy. We propose to disentangle explanations by extracting, at some +intermediate layer of a neural network, subspaces that capture the multiple and +distinct activation patterns (e.g. visual concepts) that are relevant to the +prediction. To automatically extract these subspaces, we propose two new +analyses, extending principles found in PCA or ICA to explanations. These novel +analyses, which we call principal relevant component analysis (PRCA) and +disentangled relevant subspace analysis (DRSA), maximize relevance instead of +e.g. variance or kurtosis. This allows for a much stronger focus of the +analysis on what the ML model actually uses for predicting, ignoring +activations or concepts to which the model is invariant. Our approach is +general enough to work alongside common attribution techniques such as Shapley +Value, Integrated Gradients, or LRP. Our proposed methods are shown to be +practically useful and compare favorably to the state of the art as +demonstrated on benchmarks and three use cases. + +
+
+ comment: 17 pages + supplement +
+
+
+
+
+ + ♻ ☆ Deep Learning for Inertial Sensor Alignment + + +
+ Accurate alignment of a fixed mobile device equipped with inertial sensors +inside a moving vehicle is important for navigation, activity recognition, and +other applications. Accurate estimation of the device mounting angle is +required to rotate the inertial measurement from the sensor frame to the moving +platform frame to standardize measurements and improve the performance of the +target task. In this work, a data-driven approach using deep neural networks +(DNNs) is proposed to learn the yaw mounting angle of a smartphone equipped +with an inertial measurement unit (IMU) and strapped to a car. The proposed +model uses only the accelerometer and gyroscope readings from an IMU as input +and, in contrast to existing solutions, does not require global position inputs +from global navigation satellite systems (GNSS). To train the model in a +supervised manner, IMU data is collected for training and validation with the +sensor mounted at a known yaw mounting angle, and a range of ground truth +labels is generated by applying a random rotation in a bounded range to the +measurements. The trained model is tested on data with real rotations showing +similar performance as with synthetic rotations. The trained model is deployed +on an Android device and evaluated in real-time to test the accuracy of the +estimated yaw mounting angle. The model is shown to find the mounting angle at +an accuracy of 8 degrees within 5 seconds, and 4 degrees within 27 seconds. An +experiment is conducted to compare the proposed model with an existing +off-the-shelf solution. + +
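The label-generation step described above, applying a random, bounded yaw rotation to IMU readings and using the rotation angle as ground truth, can be sketched as follows; the axis convention and the 45-degree bound are assumptions made for illustration.

```python
# Hedged sketch: creating supervised training pairs for yaw-mounting-angle
# estimation by rotating IMU readings about the vertical axis.
import numpy as np

def make_training_sample(acc, gyro, max_yaw_deg=45.0, rng=None):
    """acc, gyro: arrays (T, 3) from a sensor mounted at a known yaw angle."""
    rng = rng or np.random.default_rng()
    yaw = rng.uniform(-max_yaw_deg, max_yaw_deg)             # ground-truth label (degrees)
    c, s = np.cos(np.radians(yaw)), np.sin(np.radians(yaw))
    R = np.array([[c, -s, 0.0],
                  [s,  c, 0.0],
                  [0.0, 0.0, 1.0]])                          # rotation about the vertical axis
    return acc @ R.T, gyro @ R.T, yaw

# Usage with dummy data: 200 accelerometer / gyroscope samples.
acc = np.random.randn(200, 3)
gyro = np.random.randn(200, 3)
acc_rot, gyro_rot, label = make_training_sample(acc, gyro)
print(acc_rot.shape, gyro_rot.shape, round(label, 2))
```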
+
+ comment: 9 Pages, Preprint. Accepted IEEE +
+
+
+
+
+ + ♻ ☆ GLiDR: Topologically Regularized Graph Generative Network for Sparse + LiDAR Point Clouds CVPR + + +
+ Sparse LiDAR point clouds cause severe loss of detail of static structures +and reduce the density of static points available for navigation. Reduced +density can be detrimental to navigation under several scenarios. We observe +that despite high sparsity, in most cases, the global topology of LiDAR +outlining the static structures can be inferred. We utilize this property to +obtain a backbone skeleton of a LiDAR scan in the form of a single connected +component that is a proxy to its global topology. We utilize the backbone to +augment new points along static structures to overcome sparsity. Newly +introduced points could correspond to existing static structures or to static +points that were earlier obstructed by dynamic objects. To the best of our +knowledge, we are the first to use such a strategy for sparse LiDAR point +clouds. Existing solutions close to our approach fail to identify and preserve +the global static LiDAR topology and generate sub-optimal points. We propose +GLiDR, a Graph Generative network that is topologically regularized using +0-dimensional Persistent Homology ($\mathcal{PH}$) constraints. This enables +GLiDR to introduce newer static points along a topologically consistent global +static LiDAR backbone. GLiDR generates precise static points using $32\times$ +sparser dynamic scans and performs better than the baselines across three +datasets. GLiDR generates a valuable byproduct - an accurate binary +segmentation mask of static and dynamic objects that are helpful for navigation +planning and safety in constrained environments. The newly introduced static +points allow GLiDR to outperform LiDAR-based navigation using SLAM in several +settings. Source code is available at +$\texttt{https://github.com/GLiDR-CVPR2024/GLiDR}$. + +
+
+ comment: IEEE / CVF Computer Vision and Pattern Recognition Conference (CVPR) +
+
+
+
+
+ + ♻ ☆ CLOVA: A Closed-Loop Visual Assistant with Tool Usage and Update CVPR 2024 + + +
+ Utilizing large language models (LLMs) to compose off-the-shelf visual tools +represents a promising avenue of research for developing robust visual +assistants capable of addressing diverse visual tasks. However, these methods +often overlook the potential for continual learning, typically by freezing the +utilized tools, thus limiting their adaptation to environments requiring new +knowledge. To tackle this challenge, we propose CLOVA, a Closed-Loop Visual +Assistant, which operates within a framework encompassing inference, +reflection, and learning phases. During the inference phase, LLMs generate +programs and execute corresponding tools to complete assigned tasks. In the +reflection phase, a multimodal global-local reflection scheme analyzes human +feedback to determine which tools require updating. Lastly, the learning phase +employs three flexible approaches to automatically gather training data and +introduces a novel prompt tuning scheme to update the tools, allowing CLOVA to +efficiently acquire new knowledge. Experimental findings demonstrate that CLOVA +surpasses existing tool-usage methods by 5% in visual question answering and +multiple-image reasoning, by 10% in knowledge tagging, and by 20% in image +editing. These results underscore the significance of the continual learning +capability in general visual assistants. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Bias-Reduced Neural Networks for Parameter Estimation in Quantitative + MRI + + +
+ Purpose: To develop neural network (NN)-based quantitative MRI parameter +estimators with minimal bias and a variance close to the Cram\'er-Rao bound. + Theory and Methods: We generalize the mean squared error loss to control the +bias and variance of the NN's estimates, which involves averaging over multiple +noise realizations of the same measurements during training. Bias and variance +properties of the resulting NNs are studied for two neuroimaging applications. + Results: In simulations, the proposed strategy reduces the estimates' bias +throughout parameter space and achieves a variance close to the Cram\'er-Rao +bound. In vivo, we observe good concordance between parameter maps estimated +with the proposed NNs and traditional estimators, such as non-linear +least-squares fitting, while state-of-the-art NNs show larger deviations. + Conclusion: The proposed NNs have greatly reduced bias compared to those +trained using the mean squared error and offer significantly improved +computational efficiency over traditional estimators with comparable or better +accuracy. + +
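A hedged sketch of the general idea, a loss that separately penalizes estimator bias and variance by averaging network outputs over several noise realizations of the same measurement, is given below. The Gaussian noise model and the specific weighting are illustrative assumptions, not the paper's exact generalized loss.

```python
# Hedged sketch: bias/variance-controlled training loss over repeated noise
# realizations of the same underlying measurement.
import torch

def bias_variance_loss(net, clean_signal, true_params, noise_std=0.05,
                       n_realizations=8, variance_weight=1.0):
    """clean_signal: (B, D) noiseless measurements; true_params: (B, P);
    net maps (N, D) -> (N, P)."""
    noisy = clean_signal.unsqueeze(0) + noise_std * torch.randn(
        n_realizations, *clean_signal.shape, device=clean_signal.device)
    est = net(noisy.reshape(-1, clean_signal.shape[-1]))          # (R*B, P)
    est = est.reshape(n_realizations, clean_signal.shape[0], -1)
    mean_est = est.mean(dim=0)                                    # estimator mean over noise
    bias_term = ((mean_est - true_params) ** 2).mean()            # squared bias
    var_term = est.var(dim=0, unbiased=False).mean()              # estimator variance
    return bias_term + variance_weight * var_term
```

Setting the variance weight trades off between a minimum-variance and a minimum-bias estimator, which is the knob the abstract alludes to.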
+
+
+
+
+ + ♻ ☆ MaskClustering: View Consensus based Mask Graph Clustering for + Open-Vocabulary 3D Instance Segmentation + + +
+ Open-vocabulary 3D instance segmentation is cutting-edge for its ability to +segment 3D instances without predefined categories. However, progress in 3D +lags behind its 2D counterpart due to limited annotated 3D data. To address +this, recent works first generate 2D open-vocabulary masks through 2D models +and then merge them into 3D instances based on metrics calculated between two +neighboring frames. In contrast to these local metrics, we propose a novel +metric, view consensus rate, to enhance the utilization of multi-view +observations. The key insight is that two 2D masks should be deemed part of the +same 3D instance if a significant number of other 2D masks from different views +contain both these two masks. Using this metric as edge weight, we construct a +global mask graph where each mask is a node. Through iterative clustering of +masks showing high view consensus, we generate a series of clusters, each +representing a distinct 3D instance. Notably, our model is training-free. +Through extensive experiments on publicly available datasets, including +ScanNet++, ScanNet200 and MatterPort3D, we demonstrate that our method achieves +state-of-the-art performance in open-vocabulary 3D instance segmentation. Our +project page is at https://pku-epic.github.io/MaskClustering. + +
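A toy version of the view-consensus idea is sketched below, with masks represented as sets of 3D point ids. The containment threshold and the normalization over "relevant" masks are assumptions; the paper defines its own view consensus rate.

```python
# Hedged sketch: a toy "view consensus rate" between two candidate masks.
def contains(observer, mask, thresh=0.8):
    """observer 'contains' mask if it covers most of the mask's points."""
    return len(observer & mask) / max(len(mask), 1) >= thresh

def view_consensus_rate(mask_i, mask_j, other_view_masks):
    relevant = [m for m in other_view_masks if contains(m, mask_i) or contains(m, mask_j)]
    if not relevant:
        return 0.0
    both = sum(contains(m, mask_i) and contains(m, mask_j) for m in relevant)
    return both / len(relevant)   # used as an edge weight in the global mask graph

# Usage: three masks from other views observing two candidate masks.
a, b = {1, 2, 3, 4}, {3, 4, 5, 6}
others = [{1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6, 7}, {9, 10}]
print(view_consensus_rate(a, b, others))  # 1.0 for these toy sets
```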
+
+
+
+
+ + ♻ ☆ Visual Concept Connectome (VCC): Open World Concept Discovery and their + Interlayer Connections in Deep Models CVPR 2024 + + +
+ Understanding what deep network models capture in their learned +representations is a fundamental challenge in computer vision. We present a new +methodology for understanding such vision models, the Visual Concept Connectome +(VCC), which discovers human-interpretable concepts and their interlayer +connections in a fully unsupervised manner. Our approach simultaneously reveals +fine-grained concepts at a layer, connection weightings across all layers, and +is amenable to global analysis of network structure (e.g., branching pattern +of hierarchical concept assemblies). Previous work yielded ways to extract +interpretable concepts from single layers and examine their impact on +classification, but did not afford multilayer concept analysis across an entire +network architecture. Quantitative and qualitative empirical results show the +effectiveness of VCCs in the domain of image classification. Also, we leverage +VCCs for the application of failure mode debugging to reveal where mistakes +arise in deep networks. + +
+
+ comment: CVPR 2024 (Highlight) +
+
+
+
+
+ + ♻ ☆ Understanding Video Transformers via Universal Concept Discovery CVPR 2024 + + +
+ This paper studies the problem of concept-based interpretability of +transformer representations for videos. Concretely, we seek to explain the +decision-making process of video transformers based on high-level, +spatiotemporal concepts that are automatically discovered. Prior research on +concept-based interpretability has concentrated solely on image-level tasks. +Comparatively, video models deal with the added temporal dimension, increasing +complexity and posing challenges in identifying dynamic concepts over time. In +this work, we systematically address these challenges by introducing the first +Video Transformer Concept Discovery (VTCD) algorithm. To this end, we propose +an efficient approach for the unsupervised identification of units of video +transformer representations (concepts) and for ranking their importance to the +output of a model. The resulting concepts are highly interpretable, revealing +spatio-temporal reasoning mechanisms and object-centric representations in +unstructured video models. Performing this analysis jointly over a diverse set +of supervised and self-supervised representations, we discover that some of +these mechanisms are universal in video transformers. Finally, we show that VTCD +can be used for fine-grained action recognition and video object segmentation. + +
+
+ comment: CVPR 2024 (Highlight) +
+
+
+
+
+ + ♻ ☆ Location-guided Head Pose Estimation for Fisheye Image + + +
+ A camera with a fisheye or ultra-wide lens covers a wide field of view that +cannot be modeled by the perspective projection. Serious fisheye lens +distortion in the peripheral region of the image leads to degraded performance +of the existing head pose estimation models trained on undistorted images. This +paper presents a new approach for head pose estimation that uses the knowledge +of head location in the image to reduce the negative effect of fisheye +distortion. We develop an end-to-end convolutional neural network to estimate +the head pose with the multi-task learning of head pose and head location. Our +proposed network estimates the head pose directly from the fisheye image +without the operation of rectification or calibration. We also created a +fisheye-distorted version of three popular head pose estimation datasets, +BIWI, 300W-LP, and AFLW2000, for our experiments. Experimental results show that +our network remarkably improves the accuracy of head pose estimation compared +with other state-of-the-art one-stage and two-stage methods. + +
+
+ comment: Revised Introduction and Related Work; Submitted to IEEE Transactions + on Cognitive and Developmental Systems for review +
+
+
+
+
+ + ♻ ☆ VMamba: Visual State Space Model + + +
+ Convolutional Neural Networks (CNNs) and Vision Transformers (ViTs) have long +been the predominant backbone networks for visual representation learning. +While ViTs have recently gained prominence over CNNs due to their superior +fitting capabilities, their scalability is largely constrained by the quadratic +complexity of attention computation. Inspired by the capability of Mamba in +efficiently modeling long sequences, we propose VMamba, a generic vision +backbone model aiming to reduce the computational complexity to linear while +retaining ViTs' advantageous features. To enhance VMamba's adaptability in +processing vision data, we introduce the Cross-Scan Module (CSM) to enable 1D +selective scanning in 2D image space with global receptive fields. +Additionally, we make further improvements in implementation details and +architectural designs to enhance VMamba's performance and boost its inference +speed. Extensive experimental results demonstrate VMamba's promising +performance across various visual perception tasks, highlighting its pronounced +advantages in input scaling efficiency compared to existing benchmark models. +Source code is available at https://github.com/MzeroMiko/VMamba. + +
+
+ comment: 21 pages, 12 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Data-Efficient Multimodal Fusion on a Single GPU CVPR 2024 + + +
+ The goal of multimodal alignment is to learn a single latent space that is +shared between multimodal inputs. The most powerful models in this space have +been trained using massive datasets of paired inputs and large-scale +computational resources, making them prohibitively expensive to train in many +practical scenarios. We surmise that existing unimodal encoders pre-trained on +large amounts of unimodal data should provide an effective bootstrap to create +multimodal models from unimodal ones at much lower costs. We therefore propose +FuseMix, a multimodal augmentation scheme that operates on the latent spaces of +arbitrary pre-trained unimodal encoders. Using FuseMix for multimodal +alignment, we achieve competitive performance -- and in certain cases +outperform state-of-the-art methods -- in both image-text and audio-text +retrieval, with orders of magnitude less compute and data: for example, we +outperform CLIP on the Flickr30K text-to-image retrieval task with $\sim \! +600\times$ fewer GPU days and $\sim \! 80\times$ fewer image-text pairs. +Additionally, we show how our method can be applied to convert pre-trained +text-to-image generative models into audio-to-image ones. Code is available at: +https://github.com/layer6ai-labs/fusemix. + +
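The latent-space augmentation can be pictured as a mixup applied with a shared coefficient to paired embeddings from frozen unimodal encoders, followed by a standard contrastive alignment loss. The sketch below is illustrative; the Beta parameters and the InfoNCE formulation are assumptions rather than the exact FuseMix recipe.

```python
# Hedged sketch: mixup on pre-computed unimodal embeddings with a shared
# coefficient, followed by a CLIP-style contrastive loss.
import torch
import torch.nn.functional as F

def fusemix_batch(img_emb, txt_emb, alpha=1.0):
    """img_emb, txt_emb: (B, D) latent features from frozen unimodal encoders."""
    lam = torch.distributions.Beta(alpha, alpha).sample().item()
    perm = torch.randperm(img_emb.size(0))
    mixed_img = lam * img_emb + (1 - lam) * img_emb[perm]   # same permutation and
    mixed_txt = lam * txt_emb + (1 - lam) * txt_emb[perm]   # same lambda on both modalities
    return mixed_img, mixed_txt

def clip_style_loss(img_proj, txt_proj, temperature=0.07):
    img_proj = F.normalize(img_proj, dim=-1)
    txt_proj = F.normalize(txt_proj, dim=-1)
    logits = img_proj @ txt_proj.t() / temperature
    labels = torch.arange(img_proj.size(0), device=img_proj.device)
    return 0.5 * (F.cross_entropy(logits, labels) + F.cross_entropy(logits.t(), labels))
```

Because the encoders stay frozen and only small projection heads are trained on these mixed embeddings, the compute footprint stays tiny, which is the point the abstract emphasizes.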
+
+ comment: CVPR 2024 (Highlight) +
+
+
+
+
+ + ♻ ☆ Building-road Collaborative Extraction from Remotely Sensed Images via + Cross-Interaction + + +
+ Buildings are the basic carrier of social production and human life; roads +are the links that interconnect social networks. Building and road information +has important application value in the frontier fields of regional coordinated +development, disaster prevention, auto-driving, etc. Mapping buildings and +roads from very high-resolution (VHR) remote sensing images has become a hot +research topic. However, the existing methods often ignore the strong spatial +correlation between roads and buildings and extract them in isolation. To fully +utilize the complementary advantages between buildings and roads, we propose a +building-road collaborative extraction method based on multi-task and +cross-scale feature interaction to improve the accuracy of both tasks in a +complementary way. A multi-task interaction module is proposed to exchange +information across tasks and preserve the unique information of each task, +which tackles the seesaw phenomenon in multitask learning. By considering the +variation in appearance and structure between buildings and roads, a +cross-scale interaction module is designed to automatically learn the optimal +receptive field for different tasks. Compared with many existing methods that +train each task individually, the proposed collaborative extraction method can +utilize the complementary advantages between buildings and roads by the +proposed inter-task and inter-scale feature interactions, and automatically +select the optimal receptive field for different tasks. Experiments on a wide +range of urban and rural scenarios show that the proposed algorithm can achieve +building-road extraction with outstanding performance and efficiency. + +
+
+ comment: IEEE Transactions on Geoscience and Remote Sensing +
+
+
+
+
+ + ♻ ☆ BOTH2Hands: Inferring 3D Hands from Both Text Prompts and Body Dynamics CVPR 2024 + + +
+ The recently emerging text-to-motion advances have spurred numerous attempts +at convenient and interactive human motion generation. Yet, existing methods +are largely limited to generating body motions only without considering the +rich two-hand motions, let alone handling various conditions like body dynamics +or texts. To break the data bottleneck, we propose BOTH57M, a novel multi-modal +dataset for two-hand motion generation. Our dataset includes accurate motion +tracking for the human body and hands and provides pair-wise finger-level hand +annotations and body descriptions. We further provide a strong baseline method, +BOTH2Hands, for the novel task: generating vivid two-hand motions from both +implicit body dynamics and explicit text prompts. We first warm up two parallel +body-to-hand and text-to-hand diffusion models and then utilize a +cross-attention transformer for motion blending. Extensive experiments and +cross-validations demonstrate the effectiveness of our approach and dataset for +generating convincing two-hand motions from the hybrid body-and-textual +conditions. Our dataset and code will be disseminated to the community for +future research. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ WebXR, A-Frame and Networked-Aframe as a Basis for an Open Metaverse: A + Conceptual Architecture + + +
+ This work proposes a WebXR-based cross-platform conceptual architecture, +leveraging the A-Frame and Networked-Aframe frameworks, in order to facilitate +the development of an open, accessible, and interoperable metaverse. By +introducing the concept of spatial web app, this research contributes to the +discourse on the metaverse, offering an architecture that democratizes access +to virtual environments and extended reality through the web, and aligns with +Tim Berners-Lee's original vision of the World Wide Web as an open platform in +the digital realm. + +
+
+ comment: minor fixes (typos, URLs etc.) +
+
+
+
+
+ + ♻ ☆ Implicit Neural Representation for MRI Parallel Imaging Reconstruction + + +
+ Magnetic resonance imaging (MRI) usually faces lengthy acquisition times, +prompting the exploration of strategies such as parallel imaging (PI) to +alleviate this problem by periodically skipping specific K-space lines and +subsequently reconstructing high-quality images from the undersampled K-space. +Implicit neural representation (INR) has recently emerged as a promising deep +learning technique, characterizing objects as continuous functions of spatial +coordinates typically parameterized by a multilayer perceptron (MLP). In this +study, we propose a novel MRI PI reconstruction method that uses INR. Our +approach represents reconstructed fully-sampled images as functions of voxel +coordinates and prior feature vectors from undersampled images, addressing the +generalization challenges of INR. Specifically, we introduce a scale-embedded +encoder to generate scale-independent, voxel-specific features from MR images +across various undersampling scales. These features are then concatenated with +coordinate vectors to reconstruct fully-sampled MR images, facilitating +multiple-scale reconstructions. To evaluate our method's performance, we +conducted experiments using publicly available MRI datasets, comparing it with +alternative reconstruction techniques. Our quantitative assessment demonstrates +the superiority of our proposed method. + +
+
+
+
+
+ + ♻ ☆ Expediting Building Footprint Extraction from High-resolution Remote + Sensing Images via progressive lenient supervision + + +
+ The efficacy of building footprint segmentation from remotely sensed images +has been hindered by limited model transfer effectiveness. Many existing building +segmentation methods were developed upon the encoder-decoder architecture of +U-Net, in which the encoder is finetuned from newly developed backbone +networks that are pre-trained on ImageNet. However, the heavy computational +burden of the existing decoder designs hampers the successful transfer of these +modern encoder networks to remote sensing tasks. Even the widely-adopted deep +supervision strategy fails to mitigate these challenges due to its invalid loss +in hybrid regions where foreground and background pixels are intermixed. In +this paper, we conduct a comprehensive evaluation of existing decoder network +designs for building footprint segmentation and propose an efficient framework +denoted as BFSeg to enhance learning efficiency and effectiveness. +Specifically, a densely-connected coarse-to-fine feature fusion decoder network +that facilitates easy and fast feature fusion across scales is proposed. +Moreover, considering the invalidity of hybrid regions in the down-sampled +ground truth during the deep supervision process, we present a lenient deep +supervision and distillation strategy that enables the network to learn proper +knowledge from deep supervision. Building upon these advancements, we have +developed a new family of building segmentation networks, which consistently +surpass prior works with outstanding performance and efficiency across a wide +range of newly developed encoder networks. + +
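One plausible way to realize the "lenient" deep supervision described above is to ignore hybrid windows of the down-sampled ground truth, i.e. pooling windows that mix foreground and background. The sketch below is an assumption-laden illustration of that idea, not the BFSeg implementation.

```python
# Hedged sketch: building a down-sampled deep-supervision target that excludes
# hybrid windows (mixed foreground/background) from the loss.
import torch
import torch.nn.functional as F

def lenient_downsampled_target(gt, factor, ignore_index=255):
    """gt: (B, H, W) binary building mask; returns (B, H/f, W/f) targets."""
    frac = F.avg_pool2d(gt.float().unsqueeze(1), kernel_size=factor)  # foreground fraction per window
    target = (frac > 0.5).long().squeeze(1)
    hybrid = ((frac > 0.0) & (frac < 1.0)).squeeze(1)                 # windows mixing both classes
    target[hybrid] = ignore_index                                     # exclude them from the loss
    return target

# Usage: auxiliary deep-supervision loss at 1/4 resolution.
gt = torch.zeros(2, 64, 64, dtype=torch.long)
gt[:, :30, :] = 1                                                     # toy half-foreground mask
aux_logits = torch.randn(2, 2, 16, 16)
loss = F.cross_entropy(aux_logits, lenient_downsampled_target(gt, 4), ignore_index=255)
```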
+
+
+
+
+ + ♻ ☆ Two-Phase Multi-Dose-Level PET Image Reconstruction with Dose Level + Awareness + + +
+ To obtain high-quality positron emission tomography (PET) images while minimizing +radiation exposure, a range of methods have been designed to reconstruct +standard-dose PET (SPET) from corresponding low-dose PET (LPET) images. +However, most current methods merely learn the mapping between +single-dose-level LPET and SPET images, but omit the dose disparity of LPET +images in clinical scenarios. In this paper, to reconstruct high-quality SPET +images from multi-dose-level LPET images, we design a novel two-phase +multi-dose-level PET reconstruction algorithm with dose level awareness, +containing a pre-training phase and a SPET prediction phase. Specifically, the +pre-training phase is devised to explore both fine-grained discriminative +features and effective semantic representation. The SPET prediction phase +adopts a coarse prediction network that utilizes the pre-learned dose-level prior to +generate a preliminary result, and a refinement network to precisely preserve the +details. Experiments on the MICCAI 2022 Ultra-low Dose PET Imaging Challenge +Dataset have demonstrated the superiority of our method. + +
+
+ comment: Accepted by ISBI2024 +
+
+
+
+
+ + ♻ ☆ Little Strokes Fell Great Oaks: Boosting the Hierarchical Features for + Multi-exposure Image Fusion + + +
+ In recent years, deep learning networks have made remarkable strides in the +domain of multi-exposure image fusion. Nonetheless, prevailing approaches often +involve directly feeding over-exposed and under-exposed images into the +network, which leads to the under-utilization of inherent information present +in the source images. Additionally, unsupervised techniques predominantly +employ rudimentary weighted summation for color channel processing, culminating +in an overall desaturated final image tone. To partially mitigate these issues, +this study proposes a gamma correction module specifically designed to fully +leverage latent information embedded within source images. Furthermore, a +modified transformer block, embracing self-attention mechanisms, is +introduced to optimize the fusion process. Ultimately, a novel color +enhancement algorithm is presented to augment image saturation while preserving +intricate details. The source code is available at +https://github.com/ZhiyingDu/BHFMEF. + +
+
+
+
+
+ + ♻ ☆ DREAM: Visual Decoding from Reversing Human Visual System + + +
+ In this work we present DREAM, an fMRI-to-image method for reconstructing +viewed images from brain activities, grounded on fundamental knowledge of the +human visual system. We craft reverse pathways that emulate the hierarchical +and parallel nature of how humans perceive the visual world. These tailored +pathways are specialized to decipher semantics, color, and depth cues from fMRI +data, mirroring the forward pathways from visual stimuli to fMRI recordings. To +do so, two components mimic the inverse processes within the human visual +system: the Reverse Visual Association Cortex (R-VAC) which reverses pathways +of this brain region, extracting semantics from fMRI data; the Reverse Parallel +PKM (R-PKM) component simultaneously predicting color and depth from fMRI +signals. The experiments indicate that our method outperforms the current +state-of-the-art models in terms of the consistency of appearance, structure, +and semantics. Code will be made publicly available to facilitate further +research in this field. + +
+
+ comment: Project Page: https://weihaox.github.io/DREAM +
+
+
+
+
+ + ♻ ☆ Pre-trained Model Guided Fine-Tuning for Zero-Shot Adversarial + Robustness CVPR 2024 + + +
+ Large-scale pre-trained vision-language models like CLIP have demonstrated +impressive performance across various tasks, and exhibit remarkable zero-shot +generalization capability, while they are also vulnerable to imperceptible +adversarial examples. Existing works typically employ adversarial training +(fine-tuning) as a defense method against adversarial examples. However, direct +application to the CLIP model may result in overfitting, compromising the +model's capacity for generalization. In this paper, we propose Pre-trained +Model Guided Adversarial Fine-Tuning (PMG-AFT) method, which leverages +supervision from the original pre-trained model by carefully designing an +auxiliary branch, to enhance the model's zero-shot adversarial robustness. +Specifically, PMG-AFT minimizes the distance between the features of +adversarial examples in the target model and those in the pre-trained model, +aiming to preserve the generalization features already captured by the +pre-trained model. Extensive Experiments on 15 zero-shot datasets demonstrate +that PMG-AFT significantly outperforms the state-of-the-art method, improving +the top-1 robust accuracy by an average of 4.99%. Furthermore, our approach +consistently improves clean accuracy by an average of 8.72%. Our code is +available at +https://github.com/serendipity1122/Pre-trained-Model-Guided-Fine-Tuning-for-Zero-Shot-Adversarial-Robustness. + +
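The overall shape of such an objective, adversarial cross-entropy plus a term that keeps the fine-tuned features close to those of a frozen copy of the pre-trained encoder, can be sketched as below. The L2 feature distance and the weighting are assumptions; PMG-AFT's exact auxiliary branch differs in detail.

```python
# Hedged sketch: adversarial fine-tuning regularized toward a frozen
# pre-trained encoder's features, in the spirit of the abstract.
import torch
import torch.nn.functional as F

def pmg_aft_style_loss(finetuned_enc, frozen_enc, classifier_head,
                       x_adv, labels, lam=1.0):
    feat = finetuned_enc(x_adv)                     # features of adversarial images
    with torch.no_grad():
        feat_pre = frozen_enc(x_adv)                # frozen pre-trained reference features
    ce = F.cross_entropy(classifier_head(feat), labels)
    guidance = F.mse_loss(feat, feat_pre)           # stay close to pre-trained features
    return ce + lam * guidance
```

The guidance term is what discourages overfitting to the adversarial training distribution and preserves the zero-shot generalization captured by the original model.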
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ DG-TTA: Out-of-domain medical image segmentation through Domain + Generalization and Test-Time Adaptation + + +
+ Applying pre-trained medical segmentation models on out-of-domain images +often yields predictions of insufficient quality. Several strategies have been +proposed to maintain model performance, such as finetuning or unsupervised- and +source-free domain adaptation. These strategies set restrictive requirements +for data availability. In this study, we propose to combine domain +generalization and test-time adaptation to create a highly effective approach +for reusing pre-trained models in unseen target domains. Domain-generalized +pre-training on source data is used to obtain the best initial performance in +the target domain. We introduce the MIND descriptor previously used in image +registration tasks as a further technique to achieve generalization and present +superior performance for small-scale datasets compared to existing approaches. +At test-time, high-quality segmentation for every single unseen scan is ensured +by optimizing the model weights for consistency given different image +augmentations. That way, our method enables separate use of source and target +data and thus removes current data availability barriers. Moreover, the +presented method is highly modular as it does not require specific model +architectures or prior knowledge of involved domains and labels. We demonstrate +this by integrating it into the nnUNet, which is currently the most popular and +accurate framework for medical image segmentation. We employ multiple datasets +covering abdominal, cardiac, and lumbar spine scans and compose several +out-of-domain scenarios in this study. We demonstrate that our method, combined +with pre-trained whole-body CT models, can effectively segment MR images with +high accuracy in all of the aforementioned scenarios. Open-source code can be +found here: https://github.com/multimodallearning/DG-TTA + +
+
+
+
+
+ + ♻ ☆ ExpPoint-MAE: Better interpretability and performance for + self-supervised point cloud transformers + + +
+ In this paper we delve into the properties of transformers, attained through +self-supervision, in the point cloud domain. Specifically, we evaluate the +effectiveness of Masked Autoencoding as a pretraining scheme, and explore +Momentum Contrast as an alternative. In our study we investigate the impact of +data quantity on the learned features, and uncover similarities in the +transformer's behavior across domains. Through comprehensive visualizations, we +observe that the transformer learns to attend to semantically meaningful +regions, indicating that pretraining leads to a better understanding of the +underlying geometry. Moreover, we examine the finetuning process and its effect +on the learned representations. Based on that, we devise an unfreezing strategy +which consistently outperforms our baseline without introducing any other +modifications to the model or the training pipeline, achieving +state-of-the-art results in the classification task among transformer models. + +
+
+
+
+
+ + ♻ ☆ AGILE3D: Attention Guided Interactive Multi-object 3D Segmentation ICLR 2024 + + +
+ During interactive segmentation, a model and a user work together to +delineate objects of interest in a 3D point cloud. In an iterative process, the +model assigns each data point to an object (or the background), while the user +corrects errors in the resulting segmentation and feeds them back into the +model. The current best practice formulates the problem as binary +classification and segments objects one at a time. The model expects the user +to provide positive clicks to indicate regions wrongly assigned to the +background and negative clicks on regions wrongly assigned to the object. +Sequentially visiting objects is wasteful since it disregards synergies between +objects: a positive click for a given object can, by definition, serve as a +negative click for nearby objects. Moreover, a direct competition between +adjacent objects can speed up the identification of their common boundary. We +introduce AGILE3D, an efficient, attention-based model that (1) supports +simultaneous segmentation of multiple 3D objects, (2) yields more accurate +segmentation masks with fewer user clicks, and (3) offers faster inference. Our +core idea is to encode user clicks as spatial-temporal queries and enable +explicit interactions between click queries as well as between them and the 3D +scene through a click attention module. Every time new clicks are added, we +only need to run a lightweight decoder that produces updated segmentation +masks. In experiments with four different 3D point cloud datasets, AGILE3D sets +a new state-of-the-art. Moreover, we also verify its practicality in real-world +setups with real user studies. + +
+
+ comment: ICLR 2024 camera-ready. Project page: https://ywyue.github.io/AGILE3D +
+
+
+
+
+ + ♻ ☆ Physics-guided Shape-from-Template: Monocular Video Perception through + Neural Surrogate Models + + +
+ 3D reconstruction of dynamic scenes is a long-standing problem in computer +graphics and becomes increasingly difficult as less information is available. +Shape-from-Template (SfT) methods aim to reconstruct a template-based geometry +from RGB images or video sequences, often leveraging just a single monocular +camera without depth information, such as regular smartphone recordings. +Unfortunately, existing reconstruction methods are either unphysical and noisy +or slow in optimization. To solve this problem, we propose a novel SfT +reconstruction algorithm for cloth using a pre-trained neural surrogate model +that is fast to evaluate, stable, and produces smooth reconstructions due to a +regularizing physics simulation. Differentiable rendering of the simulated mesh +enables pixel-wise comparisons between the reconstruction and a target video +sequence that can be used for a gradient-based optimization procedure to +extract not only shape information but also physical parameters such as +stretching, shearing, or bending stiffness of the cloth. This allows us to retain +a precise, stable, and smooth reconstructed geometry while reducing the runtime +by a factor of 400-500 compared to $\phi$-SfT, a state-of-the-art physics-based +SfT approach. + +
+
+
+
+
+ + ♻ ☆ Unsupervised Denoising for Signal-Dependent and Row-Correlated Imaging + Noise + + +
+ Accurate analysis of microscopy images is hindered by the presence of noise. +This noise is usually signal-dependent and often additionally correlated along +rows or columns of pixels. Current self- and unsupervised denoisers can address +signal-dependent noise, but none can reliably remove noise that is also row- or +column-correlated. Here, we present the first fully unsupervised deep +learning-based denoiser capable of handling imaging noise that is +row-correlated as well as signal-dependent. Our approach uses a Variational +Autoencoder (VAE) with a specially designed autoregressive decoder. This +decoder is capable of modeling row-correlated and signal-dependent noise but is +incapable of independently modeling underlying clean signal. The VAE therefore +produces latent variables containing only clean signal information, and these +are mapped back into image space using a proposed second decoder network. Our +method does not require a pre-trained noise model and can be trained from +scratch using unpaired noisy data. We show that our approach achieves +competitive results when applied to a range of different sensor types and +imaging modalities. + +
+
+
+
+
+ + ♻ ☆ Triple-CFN: Restructuring Conceptual Spaces for Enhancing Abstract + Reasoning process + + +
+ Abstract reasoning problems pose significant challenges to artificial +intelligence algorithms, demanding cognitive capabilities beyond those required +for perception tasks. This study introduces the Triple-CFN approach to tackle +the Bongard-Logo problem, achieving notable reasoning accuracy by implicitly +reorganizing the concept space of conflicting instances. Additionally, the +Triple-CFN paradigm proves effective for the RPM problem with necessary +modifications, yielding competitive results. To further enhance performance on +the RPM issue, we develop the Meta Triple-CFN network, which explicitly +structures the problem space while maintaining interpretability on progressive +patterns. The success of Meta Triple-CFN is attributed to its paradigm of +modeling the conceptual space, equivalent to normalizing reasoning information. +Based on this ideology, we introduce the Re-space layer, enhancing the +performance of both Meta Triple-CFN and Triple-CFN. This paper aims to +contribute to advancements in machine intelligence by exploring innovative +network designs for addressing abstract reasoning problems, paving the way for +further breakthroughs in this domain. + +
+
+ comment: 14 pages, 14 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Large-scale Multi-Modal Pre-trained Models: A Comprehensive Survey + + +
+ With the urgent demand for generalized deep models, many pre-trained big +models have been proposed, such as BERT, ViT, GPT, etc. Inspired by the success of +these models in single domains (like computer vision and natural language +processing), the multi-modal pre-trained big models have also drawn more and +more attention in recent years. In this work, we give a comprehensive survey of +these models and hope this paper can provide new insights and help new +researchers track the most cutting-edge work. Specifically, we first +introduce the background of multi-modal pre-training by reviewing conventional +deep learning and pre-training work in natural language processing, +computer vision, and speech. Then, we introduce the task definition, key +challenges, and advantages of multi-modal pre-training models (MM-PTMs), and +discuss the MM-PTMs with a focus on data, objectives, network architectures, +and knowledge-enhanced pre-training. After that, we introduce the downstream +tasks used for the validation of large-scale MM-PTMs, including generative, +classification, and regression tasks. We also give visualization and analysis +of the model parameters and results on representative downstream tasks. +Finally, we point out possible research directions for this topic that may +benefit future work. In addition, we maintain a continuously updated paper +list for large-scale pre-trained multi-modal big models: +https://github.com/wangxiao5791509/MultiModal_BigModels_Survey. This paper has +been published by the journal Machine Intelligence Research (MIR), +https://link.springer.com/article/10.1007/s11633-022-1410-8, DOI: +10.1007/s11633-022-1410-8, vol. 20, no. 4, pp. 447-482, 2023. + +
+
+ comment: Accepted by Machine Intelligence Research (MIR) +
+
+
+
+
+ + ♻ ☆ MixedNUTS: Training-Free Accuracy-Robustness Balance via Nonlinearly + Mixed Classifiers + + +
+ Adversarial robustness often comes at the cost of degraded accuracy, impeding +the real-life application of robust classification models. Training-based +solutions for better trade-offs are limited by incompatibilities with +already-trained high-performance large models, necessitating the exploration of +training-free ensemble approaches. Observing that robust models are more +confident in correct predictions than in incorrect ones on clean and +adversarial data alike, we speculate amplifying this "benign confidence +property" can reconcile accuracy and robustness in an ensemble setting. To +achieve so, we propose "MixedNUTS", a training-free method where the output +logits of a robust classifier and a standard non-robust classifier are +processed by nonlinear transformations with only three parameters, which are +optimized through an efficient algorithm. MixedNUTS then converts the +transformed logits into probabilities and mixes them as the overall output. On +CIFAR-10, CIFAR-100, and ImageNet datasets, experimental results with custom +strong adaptive attacks demonstrate MixedNUTS's vastly improved accuracy and +near-SOTA robustness -- it boosts CIFAR-100 clean accuracy by 7.86 points, +sacrificing merely 0.87 points in robust accuracy. + +
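As a loose illustration of probability-level mixing with a small parametric nonlinearity on the robust logits, consider the sketch below. The concrete three-parameter transform (scale, exponent, offset) and the fixed mixing weight are placeholders chosen for illustration and are not MixedNUTS's actual parametrization or optimization procedure.

```python
# Hedged sketch: mixing a standard and a robust classifier at the probability
# level after a small parametric nonlinearity on the robust logits.
import torch

def mixed_output(std_logits, rob_logits, s=1.0, p=1.0, c=0.0, alpha=0.5):
    """std_logits, rob_logits: (B, num_classes)."""
    transformed = s * torch.sign(rob_logits) * rob_logits.abs() ** p + c  # illustrative 3-parameter transform
    prob_std = std_logits.softmax(dim=-1)
    prob_rob = transformed.softmax(dim=-1)
    return alpha * prob_std + (1 - alpha) * prob_rob    # mixed class probabilities

# Usage
std_logits, rob_logits = torch.randn(4, 10), torch.randn(4, 10)
print(mixed_output(std_logits, rob_logits).sum(dim=-1))  # each row sums to 1
```

The training-free nature of the approach comes from the fact that only the few scalar parameters of the transform and mixing are tuned, while both base classifiers stay fixed.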
+
+
+
+
+ + ♻ ☆ RS-Mamba for Large Remote Sensing Image Dense Prediction + + +
+ Context modeling is critical for remote sensing image dense prediction tasks. +Nowadays, the growing size of very-high-resolution (VHR) remote sensing images +poses challenges in effectively modeling context. While transformer-based +models possess global modeling capabilities, they encounter computational +challenges when applied to large VHR images due to their quadratic complexity. +The conventional practice of cropping large images into smaller patches results +in a notable loss of contextual information. To address these issues, we +propose the Remote Sensing Mamba (RSM) for dense prediction tasks in large VHR +remote sensing images. RSM is specifically designed to capture the global +context of remote sensing images with linear complexity, facilitating the +effective processing of large VHR images. Considering that the land covers in +remote sensing images are distributed in arbitrary spatial directions due to +characteristics of remote sensing over-head imaging, the RSM incorporates an +omnidirectional selective scan module to globally model the context of images +in multiple directions, capturing large spatial features from various +directions. Extensive experiments on semantic segmentation and change detection +tasks across various land covers demonstrate the effectiveness of the proposed +RSM. We designed simple yet effective models based on RSM, achieving +state-of-the-art performance on dense prediction tasks in VHR remote sensing +images without fancy training strategies. Leveraging the linear complexity and +global modeling capabilities, RSM achieves better efficiency and accuracy than +transformer-based models on large remote sensing images. Interestingly, we also +demonstrated that our model generally performs better with a larger image size +on dense prediction tasks. Our code is available at +https://github.com/walking-shadow/Official_Remote_Sensing_Mamba. + +
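The omnidirectional-scan idea, flattening a 2D feature map into 1D sequences along several directions before running a selective-scan (state-space) layer on each, can be sketched as follows; the choice of four scan orders is an assumption, and the actual module may use more directions.

```python
# Hedged sketch: flattening a feature map into token sequences along multiple
# scan directions, the generic idea behind omnidirectional selective scanning.
import torch

def multi_direction_scans(x):
    """x: (B, C, H, W) -> list of (B, C, H*W) sequences in different scan orders."""
    row_major = x.flatten(2)                          # left-to-right, top-to-bottom
    col_major = x.transpose(2, 3).flatten(2)          # top-to-bottom, left-to-right
    return [row_major, row_major.flip(-1), col_major, col_major.flip(-1)]

# After a 1D selective-scan (SSM) over each sequence, the outputs would be
# flipped/reshaped back to (B, C, H, W) and merged, e.g. by summation.
x = torch.randn(2, 8, 4, 4)
print([s.shape for s in multi_direction_scans(x)])    # four (2, 8, 16) sequences
```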
+
+ comment: 15 pages,8 figures +
+
+
+
+
+ + ♻ ☆ Improving the Generalization of Segmentation Foundation Model under + Distribution Shift via Weakly Supervised Adaptation + + +
+ The success of large language models has inspired the computer vision +community to explore image segmentation foundation model that is able to +zero/few-shot generalize through prompt engineering. Segment-Anything(SAM), +among others, is the state-of-the-art image segmentation foundation model +demonstrating strong zero/few-shot generalization. Despite the success, recent +studies reveal the weakness of SAM under strong distribution shift. In +particular, SAM performs awkwardly on corrupted natural images, camouflaged +images, medical images, etc. Motivated by the observations, we aim to develop a +self-training based strategy to adapt SAM to target distribution. Given the +unique challenges of large source dataset, high computation cost and incorrect +pseudo label, we propose a weakly supervised self-training architecture with +anchor regularization and low-rank finetuning to improve the robustness and +computation efficiency of adaptation. We validate the effectiveness on 5 types +of downstream segmentation tasks including natural clean/corrupted images, +medical images, camouflaged images and robotic images. Our proposed method is +task-agnostic in nature and outperforms pre-trained SAM and state-of-the-art +domain adaptation methods on almost all downstream tasks with the same testing +prompt inputs. + +
+
+ comment: 20 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ Ear-Keeper: Real-time Diagnosis of Ear Lesions Utilizing + Ultralight-Ultrafast ConvNet and Large-scale Ear Endoscopic Dataset + + +
+ Deep learning-based ear disease diagnosis technology has proven effective and +affordable. However, due to the lack of diverse ear endoscope datasets, +the practical potential of the deep learning model has not been thoroughly +studied. Moreover, existing research failed to achieve a good trade-off between +model inference speed and parameter size, rendering models inapplicable in +real-world settings. To address these challenges, we constructed the first +large-scale ear endoscopic dataset comprising eight types of ear diseases and +disease-free samples from two institutions. Inspired by ShuffleNetV2, we +propose Best-EarNet, an ultrafast and ultralight network enabling real-time +ear disease diagnosis. Best-EarNet incorporates a novel Local-Global Spatial +Feature Fusion Module and a multi-scale supervision strategy, which help +the model focus on global-local information within feature maps at various +levels. Utilizing transfer learning, Best-EarNet, with only +0.77M parameters, achieves accuracies of 95.23% (internal, 22,581 images) and 92.14% (external, +1,652 images). In particular, it achieves an average of 80 frames per +second on a CPU. From the perspective of model practicality, the +proposed Best-EarNet is superior to state-of-the-art backbone models in ear +lesion detection tasks. Most importantly, Ear-Keeper, an intelligent diagnosis +system based on Best-EarNet, was successfully developed and deployed on common +electronic devices (smartphones, tablet computers, and personal computers). In the +future, Ear-Keeper has the potential to assist the public and healthcare +providers in performing comprehensive scanning and diagnosis of the ear canal +in real-time video, thereby promptly detecting ear lesions. + +
+
+ comment: 18 pages,8 figures +
+
+
+
+
+ + ♻ ☆ GPT as Psychologist? Preliminary Evaluations for GPT-4V on Visual + Affective Computing + + +
+ Multimodal large language models (MLLMs) are designed to process and +integrate information from multiple sources, such as text, speech, images, and +videos. Despite their success in language understanding, it is critical to +evaluate their performance on downstream tasks for better human-centric +applications. This paper assesses the application of MLLMs with 5 crucial +abilities for affective computing, spanning visual affective tasks and +reasoning tasks. The results show that GPT-4V has high accuracy in facial action +unit recognition and micro-expression detection while its general facial +expression recognition performance is not accurate. We also highlight the +challenges of achieving fine-grained micro-expression recognition and the +potential for further study, and demonstrate the versatility and potential of +GPT-4V for handling advanced tasks in emotion recognition and related fields by +integrating it with task-related agents for more complex tasks, such as heart rate +estimation through signal processing. In conclusion, this paper provides +valuable insights into the potential applications and challenges of MLLMs in +human-centric computing. Our interesting examples are at +https://github.com/EnVision-Research/GPT4Affectivity. + +
+
+
+
+
+ + ♻ ☆ GaussianImage: 1000 FPS Image Representation and Compression by 2D + Gaussian Splatting + + +
+ Implicit neural representations (INRs) recently achieved great success in +image representation and compression, offering high visual quality and fast +rendering speeds with 10-1000 FPS, assuming sufficient GPU resources are +available. However, this requirement often hinders their use on low-end devices +with limited memory. In response, we propose a groundbreaking paradigm of image +representation and compression by 2D Gaussian Splatting, named GaussianImage. +We first introduce 2D Gaussian to represent the image, where each Gaussian has +8 parameters including position, covariance and color. Subsequently, we unveil +a novel rendering algorithm based on accumulated summation. Remarkably, our +method with a minimum of 3$\times$ lower GPU memory usage and 5$\times$ faster +fitting time not only rivals INRs (e.g., WIRE, I-NGP) in representation +performance, but also delivers a faster rendering speed of 1500-2000 FPS +regardless of parameter size. Furthermore, we integrate existing vector +quantization technique to build an image codec. Experimental results +demonstrate that our codec attains rate-distortion performance comparable to +compression-based INRs such as COIN and COIN++, while facilitating decoding +speeds of approximately 1000 FPS. Additionally, preliminary proof of concept +shows that our codec surpasses COIN and COIN++ in performance when using +partial bits-back coding. Code will be available at +https://github.com/Xinjie-Q/GaussianImage. + +
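A toy renderer in the spirit of the abstract, each 2D Gaussian carrying 8 parameters (2 for position, 3 for covariance, 3 for color) and the image formed by accumulated summation, is sketched below. The Cholesky covariance parametrization and the absence of any opacity or normalization tricks are assumptions.

```python
# Hedged sketch: rendering an image as an accumulated sum of 2D Gaussians.
import torch

def render(means, chol, colors, H, W):
    """means: (N, 2) in pixel coords; chol: (N, 3) = (l11, l21, l22); colors: (N, 3)."""
    ys, xs = torch.meshgrid(torch.arange(H).float(), torch.arange(W).float(), indexing="ij")
    pix = torch.stack([xs, ys], dim=-1).reshape(-1, 2)                  # (H*W, 2)
    l11, l21, l22 = chol[:, 0].abs() + 1e-4, chol[:, 1], chol[:, 2].abs() + 1e-4
    # Covariance from its Cholesky factor L = [[l11, 0], [l21, l22]].
    cov = torch.stack([torch.stack([l11 ** 2, l11 * l21], -1),
                       torch.stack([l11 * l21, l21 ** 2 + l22 ** 2], -1)], -2)   # (N, 2, 2)
    inv = torch.linalg.inv(cov)
    d = pix.unsqueeze(1) - means.unsqueeze(0)                           # (H*W, N, 2)
    mahal = torch.einsum("pni,nij,pnj->pn", d, inv, d)                  # squared Mahalanobis distance
    weights = torch.exp(-0.5 * mahal)                                   # (H*W, N)
    image = weights @ colors                                            # accumulated summation
    return image.reshape(H, W, 3).clamp(0, 1)

# Usage: three random Gaussians on a 32x32 canvas.
N, H, W = 3, 32, 32
img = render(torch.rand(N, 2) * 32, torch.rand(N, 3) * 5, torch.rand(N, 3), H, W)
print(img.shape)  # torch.Size([32, 32, 3])
```

Because every operation above is differentiable, the per-Gaussian parameters could be fitted to a target image by gradient descent, which is what makes the representation attractive for compression.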
+
+
+
+
+ + ♻ ☆ Re-DiffiNet: Modeling discrepancies in tumor segmentation using + diffusion models + + +
+ Identification of tumor margins is essential for surgical decision-making for
+glioblastoma patients and provides reliable assistance for neurosurgeons.
+Despite improvements in deep learning architectures for tumor segmentation over
+the years, creating a fully autonomous system suitable for clinical floors
+remains a formidable challenge because the model predictions have not yet
+reached the desired level of accuracy and generalizability for clinical
+applications. Generative modeling techniques have seen significant improvements
+in recent times. Specifically, Generative Adversarial Networks (GANs) and
+denoising diffusion probabilistic models (DDPMs) have been used to generate
+higher-quality images with fewer artifacts and finer attributes. In this work,
+we introduce a framework called Re-DiffiNet for modeling the discrepancy
+between the outputs of a segmentation model like U-Net and the ground truth,
+using DDPMs. By explicitly modeling the discrepancy, the results show an
+average improvement of 0.55% in the Dice score and 16.28% in HD95 from 5-fold
+cross-validation, compared to the state-of-the-art U-Net segmentation model.
+
+
+
+
+
+ + ♻ ☆ AUEditNet: Dual-Branch Facial Action Unit Intensity Manipulation with + Implicit Disentanglement + + +
+ Facial action unit (AU) intensity plays a pivotal role in quantifying
+fine-grained expression behaviors, which is an effective condition for facial
+expression manipulation. However, publicly available datasets containing
+intensity annotations for multiple AUs remain severely limited, often featuring
+a restricted number of subjects. This limitation poses challenges for AU
+intensity manipulation in images due to disentanglement issues, leading
+researchers to resort to other large datasets with pretrained AU intensity
+estimators for pseudo labels. To address this constraint and fully leverage
+manual annotations of AU intensities for precise manipulation, we introduce
+AUEditNet. Our proposed model achieves impressive intensity manipulation across
+12 AUs, trained effectively with only 18 subjects. Utilizing a dual-branch
+architecture, our approach achieves comprehensive disentanglement of facial
+attributes and identity without necessitating additional loss functions or
+large batch sizes. This approach offers a potential solution for achieving the
+desired facial attribute editing despite the dataset's limited subject count.
+Our experiments demonstrate AUEditNet's superior accuracy in editing AU
+intensities, affirming its capability in disentangling facial attributes and
+identity within a limited subject pool. AUEditNet allows conditioning by either
+intensity values or target images, eliminating the need for constructing AU
+combinations for specific facial expression synthesis. Moreover, AU intensity
+estimation, as a downstream task, validates the consistency between real and
+edited images, confirming the effectiveness of our proposed AU intensity
+manipulation method.
+
+
+
+
+
+ + ♻ ☆ Ultra-Range Gesture Recognition using a Web-Camera in Human-Robot + Interaction + + +
+ Hand gestures play a significant role in human interactions where non-verbal
+intentions, thoughts and commands are conveyed. In Human-Robot Interaction
+(HRI), hand gestures offer a similar and efficient medium for conveying clear
+and rapid directives to a robotic agent. However, state-of-the-art vision-based
+methods for gesture recognition have been shown to be effective only up to a
+user-camera distance of seven meters. Such a short distance range limits
+practical HRI with, for example, service robots, search and rescue robots and
+drones. In this work, we address the Ultra-Range Gesture Recognition (URGR)
+problem by aiming for a recognition distance of up to 25 meters in the context
+of HRI. We propose URGR, a novel deep-learning framework that relies solely on
+a simple RGB camera. Gesture inference is based on a single image. First, a
+novel super-resolution model termed High-Quality Network (HQ-Net) uses a set of
+self-attention and convolutional layers to enhance the low-resolution image of
+the user. Then, we propose a novel URGR classifier termed Graph Vision
+Transformer (GViT) which takes the enhanced image as input. GViT combines the
+benefits of a Graph Convolutional Network (GCN) and a modified Vision
+Transformer (ViT). Evaluation of the proposed framework over diverse test data
+yields a high recognition rate of 98.1%. The framework has also exhibited
+superior performance compared to human recognition at ultra-range distances.
+Using the framework, we analyze and demonstrate the performance of an
+autonomous quadruped robot directed by human gestures in complex ultra-range
+indoor and outdoor environments, achieving a 96% recognition rate on average.
+
+
+ comment: Engineering Applications of Artificial Intelligence, In press +
+
+
+
+
+ + ♻ ☆ Discovering Closed-Loop Failures of Vision-Based Controllers via + Reachability Analysis + + +
+ Machine learning driven image-based controllers allow robotic systems to take +intelligent actions based on the visual feedback from their environment. +Understanding when these controllers might lead to system safety violations is +important for their integration in safety-critical applications and engineering +corrective safety measures for the system. Existing methods leverage +simulation-based testing (or falsification) to find the failures of +vision-based controllers, i.e., the visual inputs that lead to closed-loop +safety violations. However, these techniques do not scale well to the scenarios +involving high-dimensional and complex visual inputs, such as RGB images. In +this work, we cast the problem of finding closed-loop vision failures as a +Hamilton-Jacobi (HJ) reachability problem. Our approach blends simulation-based +analysis with HJ reachability methods to compute an approximation of the +backward reachable tube (BRT) of the system, i.e., the set of unsafe states for +the system under vision-based controllers. Utilizing the BRT, we can tractably +and systematically find the system states and corresponding visual inputs that +lead to closed-loop failures. These visual inputs can be subsequently analyzed +to find the input characteristics that might have caused the failure. Besides +its scalability to high-dimensional visual inputs, an explicit computation of +BRT allows the proposed approach to capture non-trivial system failures that +are difficult to expose via random simulations. We demonstrate our framework on +two case studies involving an RGB image-based neural network controller for (a) +autonomous indoor navigation, and (b) autonomous aircraft taxiing. + +
+
+
+
+
+ + ♻ ☆ nnMobileNe: Rethinking CNN for Retinopathy Research CVPR + + +
+ Over the past few decades, convolutional neural networks (CNNs) have been at +the forefront of the detection and tracking of various retinal diseases (RD). +Despite their success, the emergence of vision transformers (ViT) in the 2020s +has shifted the trajectory of RD model development. The leading-edge +performance of ViT-based models in RD can be largely credited to their +scalability-their ability to improve as more parameters are added. As a result, +ViT-based models tend to outshine traditional CNNs in RD applications, albeit +at the cost of increased data and computational demands. ViTs also differ from +CNNs in their approach to processing images, working with patches rather than +local regions, which can complicate the precise localization of small, variably +presented lesions in RD. In our study, we revisited and updated the +architecture of a CNN model, specifically MobileNet, to enhance its utility in +RD diagnostics. We found that an optimized MobileNet, through selective +modifications, can surpass ViT-based models in various RD benchmarks, including +diabetic retinopathy grading, detection of multiple fundus diseases, and +classification of diabetic macular edema. The code is available at +https://github.com/Retinal-Research/NN-MOBILENET + +
+
+ comment: Accepted as a conference paper to 2024 CVPRW +
+
+
+
+
+ + ♻ ☆ LongVLM: Efficient Long Video Understanding via Large Language Models + + +
+ Empowered by Large Language Models (LLMs), recent advancements in VideoLLMs
+have driven progress in various video understanding tasks. These models encode
+video representations through pooling or query aggregation over a vast number
+of visual tokens, making computational and memory costs affordable. Despite
+successfully providing an overall comprehension of video content, existing
+VideoLLMs still face challenges in achieving detailed understanding in videos
+due to overlooking local information in long-term videos. To tackle this
+challenge, we introduce LongVLM, a straightforward yet powerful VideoLLM for
+long video understanding, building upon the observation that long videos often
+consist of sequential key events, complex actions, and camera movements. Our
+approach proposes to decompose long videos into multiple short-term segments
+and encode local features for each segment via a hierarchical token merging
+module. These features are concatenated in temporal order to maintain the
+storyline across sequential short-term segments. Additionally, we propose to
+integrate global semantics into each local feature to enhance context
+understanding. In this way, we encode video representations that incorporate
+both local and global information, enabling the LLM to generate comprehensive
+responses for long-term videos. Experimental results on the VideoChatGPT
+benchmark and zero-shot video question-answering datasets demonstrate the
+superior capabilities of our model over the previous state-of-the-art methods.
+Qualitative examples demonstrate that our model produces more precise responses
+for long video understanding. Code will be available at
+https://github.com/ziplab/LongVLM.
+
+
+
+
+
+ + ♻ ☆ GraphBEV: Towards Robust BEV Feature Alignment for Multi-Modal 3D Object + Detection + + +
+ Integrating LiDAR and camera information into Bird's-Eye-View (BEV)
+representation has emerged as a crucial aspect of 3D object detection in
+autonomous driving. However, existing methods are susceptible to inaccurate
+calibration between LiDAR and the camera sensor. Such inaccuracies result in
+errors in depth estimation for the camera branch, ultimately causing
+misalignment between LiDAR and camera BEV features. In this work, we propose a
+robust fusion framework called GraphBEV. To address errors caused by inaccurate
+point cloud projection, we introduce a Local Align module that employs
+neighbor-aware depth features via graph matching. Additionally, we propose a
+Global Align module to rectify the misalignment between LiDAR and camera BEV
+features. Our GraphBEV framework achieves state-of-the-art performance, with an
+mAP of 70.1%, surpassing BEVFusion by 1.6% on the nuScenes validation set.
+Importantly, our GraphBEV outperforms BEVFusion by 8.3% under conditions with
+misalignment noise.
+
+
+
+
+
+ + ♻ ☆ Exploring the Potential of Large Foundation Models for Open-Vocabulary + HOI Detection + + +
+ Open-vocabulary human-object interaction (HOI) detection, which is concerned +with the problem of detecting novel HOIs guided by natural language, is crucial +for understanding human-centric scenes. However, prior zero-shot HOI detectors +often employ the same levels of feature maps to model HOIs with varying +distances, leading to suboptimal performance in scenes containing human-object +pairs with a wide range of distances. In addition, these detectors primarily +rely on category names and overlook the rich contextual information that +language can provide, which is essential for capturing open vocabulary concepts +that are typically rare and not well-represented by category names alone. In +this paper, we introduce a novel end-to-end open vocabulary HOI detection +framework with conditional multi-level decoding and fine-grained semantic +enhancement (CMD-SE), harnessing the potential of Visual-Language Models +(VLMs). Specifically, we propose to model human-object pairs with different +distances with different levels of feature maps by incorporating a soft +constraint during the bipartite matching process. Furthermore, by leveraging +large language models (LLMs) such as GPT models, we exploit their extensive +world knowledge to generate descriptions of human body part states for various +interactions. Then we integrate the generalizable and fine-grained semantics of +human body parts to improve interaction recognition. Experimental results on +two datasets, SWIG-HOI and HICO-DET, demonstrate that our proposed method +achieves state-of-the-art results in open vocabulary HOI detection. The code +and models are available at https://github.com/ltttpku/CMD-SE-release. + +
+
+
+
+
+ + ♻ ☆ Towards Enhanced Analysis of Lung Cancer Lesions in EBUS-TBNA -- A + Semi-Supervised Video Object Detection Method + + +
+ This study aims to establish a computer-aided diagnostic system for lung
+lesions using bronchoscope endobronchial ultrasound (EBUS) to assist physicians
+in identifying lesion areas. During EBUS-transbronchial needle aspiration
+(EBUS-TBNA) procedures, physicians rely on grayscale ultrasound images to
+determine the location of lesions. However, these images often contain
+significant noise and can be influenced by surrounding tissues or blood
+vessels, making interpretation challenging. Previous research has lacked the
+application of object detection models to EBUS-TBNA, and there has been no
+well-defined solution for annotating the EBUS-TBNA dataset. In related studies
+on ultrasound images, although models have been successful in capturing target
+regions for their respective tasks, their training and predictions have been
+based on two-dimensional images, limiting their ability to leverage temporal
+features for improved predictions. This study introduces a three-dimensional
+image-based object detection model. It utilizes an attention mechanism to
+capture temporal correlations and implements a filtering mechanism to select
+relevant information from previous frames. Subsequently, a teacher-student
+model training approach is employed to optimize the model further, leveraging
+unlabeled data. To mitigate the impact of poor-quality pseudo-labels on the
+student model, we add a Gaussian Mixture Model (GMM) to ensure the quality of
+pseudo-labels.
+
+
+
+
+
+ + ♻ ☆ Using Few-Shot Learning to Classify Primary Lung Cancer and Other + Malignancy with Lung Metastasis in Cytological Imaging via Endobronchial + Ultrasound Procedures + + +
+ This study aims to establish a computer-aided diagnosis system for
+endobronchial ultrasound (EBUS) surgery to assist physicians in the preliminary
+diagnosis of metastatic cancer. This involves arranging immediate examinations
+for other sites of metastatic cancer after EBUS surgery, eliminating the need
+to wait for reports. Doing so shortens the waiting time by more than half and
+enables patients to detect other cancers earlier, allowing for early planning
+and implementation of treatment plans. Unlike previous studies on cell image
+classification, which have abundant datasets for training, this study must make
+effective classifications despite the limited amount of case data for lung
+metastatic cancer. Among small-dataset classification methods, few-shot
+learning (FSL) has become mainstream in recent years. Through its ability to
+train on small datasets and its strong generalization capabilities, FSL shows
+potential for this task of lung metastatic cell image classification. This
+study adopts a few-shot learning approach, referencing existing models, and
+designs a model architecture for classifying lung metastasis cell images. Batch
+Spectral Regularization (BSR) is incorporated as a loss update parameter, and
+the fine-tuning method of PMF is modified. In terms of test results, adding BSR
+and the modified fine-tuning method further increases the accuracy by 8.89% to
+65.60%, outperforming other FSL methods. This study confirms that FSL is
+superior to supervised and transfer learning in classifying metastatic cancer
+and demonstrates that using BSR as a loss function and modifying fine-tuning
+can enhance the model's capabilities.
+
+
+
+
+
+ + ♻ ☆ Pyramid Deep Fusion Network for Two-Hand Reconstruction from RGB-D + Images + + +
+ Accurately recovering the dense 3D mesh of both hands from monocular images
+poses considerable challenges due to occlusions and projection ambiguity. Most
+of the existing methods extract features from color images to estimate the
+root-aligned hand meshes, which neglect the crucial depth and scale information
+in the real world. Given the noisy sensor measurements with limited resolution,
+depth-based methods predict 3D keypoints rather than a dense mesh. These
+limitations motivate us to take advantage of these two complementary inputs to
+acquire dense hand meshes on a real-world scale. In this work, we propose an
+end-to-end framework for recovering dense meshes for both hands, which employs
+single-view RGB-D image pairs as input. The primary challenge lies in
+effectively utilizing two different input modalities to mitigate the blurring
+effects in RGB images and the noise in depth images. Instead of directly
+treating depth maps as additional channels for RGB images, we encode the depth
+information into the unordered point cloud to preserve more geometric details.
+Specifically, our framework employs ResNet50 and PointNet++ to derive features
+from RGB and the point cloud, respectively. Additionally, we introduce a novel
+pyramid deep fusion network (PDFNet) to aggregate features at different scales,
+which demonstrates superior efficacy compared to previous fusion strategies.
+Furthermore, we employ a GCN-based decoder to process the fused features and
+recover the corresponding 3D pose and dense mesh. Through comprehensive
+ablation experiments, we have not only demonstrated the effectiveness of our
+proposed fusion algorithm but also outperformed the state-of-the-art approaches
+on publicly available datasets. To reproduce the results, we will make our
+source code and models publicly available at
+https://github.com/zijinxuxu/PDFNet.
+
+
+ comment: Accepted by TCSVT +
+
+
+
+
+ + ♻ ☆ CitDet: A Benchmark Dataset for Citrus Fruit Detection RA-L + + +
+ In this letter, we present a new dataset to advance the state of the art in
+detecting citrus fruit and accurately estimating yield on trees affected by the
+Huanglongbing (HLB) disease in orchard environments via imaging. Despite the
+fact that significant progress has been made in solving the fruit detection
+problem, the lack of publicly available datasets has complicated direct
+comparison of results. For instance, citrus detection has long been of interest
+to the agricultural research community, yet there is an absence of work,
+particularly involving public datasets of citrus affected by HLB. To address
+this issue, we enhance state-of-the-art object detection methods for use in
+typical orchard settings. Concretely, we provide high-resolution images of
+citrus trees located in an area known to be highly affected by HLB, along with
+high-quality bounding box annotations of citrus fruit. Fruit on both the trees
+and the ground are labeled to allow for identification of fruit location, which
+contributes to advancements in yield estimation and a potential measure of HLB
+impact via fruit drop. The dataset consists of over 32,000 bounding box
+annotations for fruit instances contained in 579 high-resolution images. In
+summary, our contributions are the following: (i) we introduce a novel dataset
+along with baseline performance benchmarks on multiple contemporary object
+detection algorithms, (ii) we show the ability to accurately capture fruit
+location on tree or on ground, and finally (iii) we present a correlation of
+our results with yield estimations.
+
+
+ comment: Submitted to IEEE Robotics and Automation Letters (RA-L) +
+
+
+
+
+ + ♻ ☆ A Generic Shared Attention Mechanism for Various Backbone Neural + Networks + + +
+ The self-attention mechanism has emerged as a critical component for
+improving the performance of various backbone neural networks. However, current
+mainstream approaches individually incorporate newly designed self-attention
+modules (SAMs) into each layer of the network as a matter of course, without
+fully exploiting their parameters' potential. This leads to suboptimal
+performance and increased parameter consumption as the network depth increases.
+To improve this paradigm, in this paper, we first present a counterintuitive
+but inherent phenomenon: SAMs tend to produce strongly correlated attention
+maps across different layers, with an average Pearson correlation coefficient
+of up to 0.85. Inspired by this observation, we propose Dense-and-Implicit
+Attention (DIA), which directly shares SAMs across layers and employs a long
+short-term memory module to calibrate and bridge the highly correlated
+attention maps of different layers, thus improving the parameter utilization
+efficiency of SAMs. This design of DIA is also consistent with the neural
+network's dynamical-system perspective. Through extensive experiments, we
+demonstrate that our simple yet effective DIA can consistently enhance various
+network backbones, including ResNet, Transformer, and UNet, across tasks such
+as image classification, object detection, and image generation using diffusion
+models.
+
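+ To picture the layer-sharing idea, here is a minimal PyTorch-style sketch
+(an assumption-laden illustration, not the released DIA code): one
+squeeze-and-excitation-style attention module is reused by every layer, and an
+LSTM cell carries hidden state across layers to calibrate the correlated
+attention maps. Channel sizes and the SE structure are illustrative choices.
+
+```python
+import torch
+import torch.nn as nn
+
+class SharedDIAAttention(nn.Module):
+    """One SE-style attention module shared by all layers; an LSTM cell
+    carries state across layers to calibrate their correlated attention."""
+
+    def __init__(self, channels: int, reduction: int = 4):
+        super().__init__()
+        self.squeeze = nn.AdaptiveAvgPool2d(1)
+        self.excite = nn.Sequential(
+            nn.Linear(channels, channels // reduction), nn.ReLU(),
+            nn.Linear(channels // reduction, channels),
+        )
+        self.lstm = nn.LSTMCell(channels, channels)   # shared across layers
+
+    def forward(self, x, state=None):
+        b, c, _, _ = x.shape
+        s = self.squeeze(x).view(b, c)                # pooled channel descriptor
+        a = self.excite(s)                            # raw attention logits
+        h, cell = self.lstm(a, state)                 # calibrate with layer memory
+        gate = torch.sigmoid(h).view(b, c, 1, 1)
+        return x * gate, (h, cell)
+
+# Usage: the SAME module instance is applied after several conv blocks.
+att = SharedDIAAttention(channels=16)
+x, state = torch.randn(2, 16, 8, 8), None
+for _ in range(3):                                    # three "layers"
+    x, state = att(x, state)
+print(x.shape)  # torch.Size([2, 16, 8, 8])
+```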
+
+ comment: Work in progress. arXiv admin note: text overlap with + arXiv:1905.10671 +
+
+
+
+
+ + ♻ ☆ Flying with Photons: Rendering Novel Views of Propagating Light + + +
+ We present an imaging and neural rendering technique that seeks to synthesize +videos of light propagating through a scene from novel, moving camera +viewpoints. Our approach relies on a new ultrafast imaging setup to capture a +first-of-its kind, multi-viewpoint video dataset with picosecond-level temporal +resolution. Combined with this dataset, we introduce an efficient neural volume +rendering framework based on the transient field. This field is defined as a +mapping from a 3D point and 2D direction to a high-dimensional, discrete-time +signal that represents time-varying radiance at ultrafast timescales. Rendering +with transient fields naturally accounts for effects due to the finite speed of +light, including viewpoint-dependent appearance changes caused by light +propagation delays to the camera. We render a range of complex effects, +including scattering, specular reflection, refraction, and diffraction. +Additionally, we demonstrate removing viewpoint-dependent propagation delays +using a time warping procedure, rendering of relativistic effects, and video +synthesis of direct and global components of light transport. + +
+
+ comment: Project page: https://anaghmalik.com/FlyingWithPhotons/ +
+
+
+
+
+ + ♻ ☆ Reconstructing Hand-Held Objects in 3D + + +
+ Objects manipulated by the hand (i.e., manipulanda) are particularly +challenging to reconstruct from in-the-wild RGB images or videos. Not only does +the hand occlude much of the object, but also the object is often only visible +in a small number of image pixels. At the same time, two strong anchors emerge +in this setting: (1) estimated 3D hands help disambiguate the location and +scale of the object, and (2) the set of manipulanda is small relative to all +possible objects. With these insights in mind, we present a scalable paradigm +for handheld object reconstruction that builds on recent breakthroughs in large +language/vision models and 3D object datasets. Our model, MCC-Hand-Object +(MCC-HO), jointly reconstructs hand and object geometry given a single RGB +image and inferred 3D hand as inputs. Subsequently, we use GPT-4(V) to retrieve +a 3D object model that matches the object in the image and rigidly align the +model to the network-inferred geometry; we call this alignment +Retrieval-Augmented Reconstruction (RAR). Experiments demonstrate that MCC-HO +achieves state-of-the-art performance on lab and Internet datasets, and we show +how RAR can be used to automatically obtain 3D labels for in-the-wild images of +hand-object interactions. + +
+
+ comment: Project page: https://janehwu.github.io/mcc-ho +
+
+
+
+
+ + ♻ ☆ Phase Guided Light Field for Spatial-Depth High Resolution 3D Imaging + + +
+ In 3D imaging, light field cameras are typically single-shot; however, they
+suffer heavily from low spatial resolution and depth accuracy. In this paper,
+by employing an optical projector to project a group of single high-frequency
+phase-shifted sinusoid patterns, we propose a phase guided light field
+algorithm to significantly improve both the spatial and depth resolutions of
+off-the-shelf light field cameras. First, to correct the axial aberrations
+caused by the main lens of our light field camera, we propose a deformed cone
+model to calibrate our structured light field system. Second, over the wrapped
+phases computed from the patterned images, we propose a stereo matching
+algorithm, i.e. phase guided sum of absolute difference, to robustly obtain the
+correspondence for each pair of neighboring lenslets. Finally, by introducing a
+virtual camera according to the basic geometrical optics of light field
+imaging, we propose a reorganization strategy to reconstruct 3D point clouds
+with high spatial and depth resolution. Experimental results show that,
+compared with the state-of-the-art active light field methods, the proposed
+method reconstructs 3D point clouds with a spatial resolution of
+1280$\times$720, increased by a factor of 10$\times$, while maintaining the
+same high depth resolution and needing merely a single group of high-frequency
+patterns.
+
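+ For reference, the wrapped phase that phase-shifted sinusoid projection yields
+can be computed with the standard N-step phase-shifting relation sketched below
+(a generic textbook formula, not the paper's full pipeline; array shapes are
+illustrative).
+
+```python
+import numpy as np
+
+def wrapped_phase(images):
+    """Wrapped phase from N phase-shifted sinusoid images.
+
+    images: N frames, frame k captured with shift 2*pi*k/N,
+            i.e. I_k = A + B*cos(phi + 2*pi*k/N).
+    Returns the per-pixel phase in (-pi, pi].
+    """
+    images = np.asarray(images, dtype=np.float64)
+    n = images.shape[0]
+    deltas = 2.0 * np.pi * np.arange(n) / n
+    num = -np.tensordot(np.sin(deltas), images, axes=1)  # -sum_k I_k sin(d_k)
+    den = np.tensordot(np.cos(deltas), images, axes=1)   #  sum_k I_k cos(d_k)
+    return np.arctan2(num, den)
+
+# Synthetic check: 4-step phase shifting recovers the ground-truth phase.
+phi_true = np.linspace(-3.0, 3.0, 256).reshape(16, 16)
+frames = [0.5 + 0.4 * np.cos(phi_true + 2 * np.pi * k / 4) for k in range(4)]
+print(np.allclose(wrapped_phase(frames), phi_true))  # True
+```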
+
+
+
+
+ + ♻ ☆ Text-Based Reasoning About Vector Graphics + + +
+ While large multimodal models excel in broad vision-language benchmarks, they +often struggle with tasks requiring precise perception of low-level visual +details, such as comparing line lengths or solving simple mazes. In particular, +this failure mode persists in question-answering tasks about vector graphics -- +images composed purely of 2D objects and shapes. To address this challenge, we +propose the Visually Descriptive Language Model (VDLM), which performs +text-based reasoning about vector graphics. VDLM leverages Scalable Vector +Graphics (SVG) for a more precise visual description and first uses an +off-the-shelf raster-to-SVG algorithm for encoding. Since existing language +models cannot understand raw SVGs in a zero-shot setting, VDLM then bridges SVG +with pretrained language models through a newly introduced intermediate +symbolic representation, Primal Visual Description (PVD), comprising primitive +attributes (e.g., shape, position, measurement) with their corresponding +predicted values. PVD is task-agnostic and represents visual primitives that +are universal across all vector graphics. It can be learned with procedurally +generated (SVG, PVD) pairs and also enables the direct use of LLMs for +generalization to complex reasoning tasks. By casting an image to a text-based +representation, we can leverage the power of language models to learn alignment +from SVG to visual primitives and generalize to unseen question-answering +tasks. Empirical results show that VDLM achieves stronger zero-shot performance +compared to state-of-the-art LMMs, such as GPT-4V, in various low-level +multimodal perception and reasoning tasks on vector graphics. We additionally +present extensive analyses on VDLM's performance, demonstrating that our +framework offers better interpretability due to its disentangled perception and +reasoning processes. Project page: https://mikewangwzhl.github.io/VDLM/ + +
+
+ comment: Project page: https://mikewangwzhl.github.io/VDLM/ +
+
+
+
+
+ + ♻ ☆ Multi-Level Label Correction by Distilling Proximate Patterns for + Semi-supervised Semantic Segmentation + + +
+ Semi-supervised semantic segmentation relieves the reliance on large-scale
+labeled data by leveraging unlabeled data. Recent semi-supervised semantic
+segmentation approaches mainly resort to pseudo-labeling methods to exploit
+unlabeled data. However, unreliable pseudo-labeling can undermine the
+semi-supervision process. In this paper, we propose an algorithm called
+Multi-Level Label Correction (MLLC), which aims to use graph neural networks to
+capture structural relationships in Semantic-Level Graphs (SLGs) and
+Class-Level Graphs (CLGs) to rectify erroneous pseudo-labels. Specifically,
+SLGs represent semantic affinities between pairs of pixel features, and CLGs
+describe classification consistencies between pairs of pixel labels. With the
+support of proximate pattern information from graphs, MLLC can rectify
+incorrectly predicted pseudo-labels and can facilitate discriminative feature
+representations. We design an end-to-end network to train and perform this
+effective label correction mechanism. Experiments demonstrate that MLLC
+significantly improves supervised baselines and outperforms state-of-the-art
+approaches in different scenarios on the Cityscapes and PASCAL VOC 2012
+datasets. Specifically, MLLC improves the supervised baseline by at least 5%
+and 2% with DeepLabV2 and DeepLabV3+ respectively under different partition
+protocols.
+
+
+ comment: 12 pages, 8 figures. IEEE Transactions on Multimedia, 2024 +
+
+
+
+
+ + ♻ ☆ Leveraging Diffusion For Strong and High Quality Face Morphing Attacks + + +
+ Face morphing attacks seek to deceive a Face Recognition (FR) system by +presenting a morphed image consisting of the biometric qualities from two +different identities with the aim of triggering a false acceptance with one of +the two identities, thereby presenting a significant threat to biometric +systems. The success of a morphing attack is dependent on the ability of the +morphed image to represent the biometric characteristics of both identities +that were used to create the image. We present a novel morphing attack that +uses a Diffusion-based architecture to improve the visual fidelity of the image +and the ability of the morphing attack to represent characteristics from both +identities. We demonstrate the effectiveness of the proposed attack by +evaluating its visual fidelity via the Frechet Inception Distance (FID). Also, +extensive experiments are conducted to measure the vulnerability of FR systems +to the proposed attack. The ability of a morphing attack detector to detect the +proposed attack is measured and compared against two state-of-the-art GAN-based +morphing attacks along with two Landmark-based attacks. Additionally, a novel +metric to measure the relative strength between different morphing attacks is +introduced and evaluated. + +
+
+ comment: Diffusion Morphs (DiM) paper. Accepted in IEEE TBIOM +
+
+
+
+
+ + ♻ ☆ Spatio-Temporal Attention and Gaussian Processes for Personalized Video + Gaze Estimation CVPR 2024 + + +
+ Gaze is an essential prompt for analyzing human behavior and attention.
+Recently, there has been an increasing interest in determining gaze direction
+from facial videos. However, video gaze estimation faces significant
+challenges, such as understanding the dynamic evolution of gaze in video
+sequences, dealing with static backgrounds, and adapting to variations in
+illumination. To address these challenges, we propose a simple and novel deep
+learning model designed to estimate gaze from videos, incorporating a
+specialized attention module. Our method employs a spatial attention mechanism
+that tracks spatial dynamics within videos. This technique enables accurate
+gaze direction prediction through a temporal sequence model, adeptly
+transforming spatial observations into temporal insights, thereby significantly
+improving gaze estimation accuracy. Additionally, our approach integrates
+Gaussian processes to include individual-specific traits, facilitating the
+personalization of our model with just a few labeled samples. Experimental
+results confirm the efficacy of the proposed approach, demonstrating its
+success in both within-dataset and cross-dataset settings. Specifically, our
+proposed approach achieves state-of-the-art performance on the Gaze360 dataset,
+improving by $2.5^\circ$ without personalization. Further, by personalizing the
+model with just three samples, we achieved an additional improvement of
+$0.8^\circ$. The code and pre-trained models are available at
+https://github.com/jswati31/stage.
+
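+ A hedged sketch of the few-sample personalization idea mentioned above, using
+scikit-learn's Gaussian process regressor to fit a per-user residual on three
+calibration samples and add it to a base model's predictions. The feature
+dimension, the random stand-in base predictions, and the constant offset are
+invented for illustration; this is not the authors' model.
+
+```python
+import numpy as np
+from sklearn.gaussian_process import GaussianProcessRegressor
+from sklearn.gaussian_process.kernels import RBF, WhiteKernel
+
+rng = np.random.default_rng(0)
+features = rng.normal(size=(3, 8))             # embeddings of 3 calibration frames
+base_pred = rng.uniform(-30, 30, size=(3, 2))  # base model yaw/pitch (degrees)
+labels = base_pred + np.array([2.0, -1.0])     # user-specific systematic offset
+
+# Fit a GP on the residual between the base prediction and the ground truth.
+gp = GaussianProcessRegressor(kernel=RBF(length_scale=1.0) + WhiteKernel(1e-3))
+gp.fit(features, labels - base_pred)
+
+# Correct new predictions for the same user with the GP posterior mean.
+new_feat = rng.normal(size=(5, 8))
+new_base = rng.uniform(-30, 30, size=(5, 2))
+personalized = new_base + gp.predict(new_feat)
+print(personalized.shape)  # (5, 2)
+```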
+
+ comment: Accepted at CVPR 2024 Gaze workshop +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Robotics 45 + +
+
+
+ + ☆ PAAM: A Framework for Coordinated and Priority-Driven Accelerator + Management in ROS 2 + + +
+ This paper proposes a Priority-driven Accelerator Access Management (PAAM)
+framework for multi-process robotic applications built on top of the Robot
+Operating System (ROS) 2 middleware platform. The framework addresses the issue
+of predictable execution of time- and safety-critical callback chains that
+require hardware accelerators such as GPUs and TPUs. PAAM provides a standalone
+ROS executor that acts as an accelerator resource server, arbitrating
+accelerator access requests from all other callbacks at the application layer.
+This approach enables coordinated and priority-driven accelerator access
+management in multi-process robotic systems. The framework design is directly
+applicable to all types of accelerators and enables granular control over how
+specific chains access accelerators, making it possible to achieve predictable
+real-time support for accelerators used by safety-critical callback chains
+without making changes to underlying accelerator device drivers. The paper also
+provides a theoretical analysis that upper-bounds the worst-case response time
+of safety-critical callback chains that require accelerator access. It further
+demonstrates that complex robotic systems with extensive accelerator usage,
+when integrated with PAAM, can achieve up to a 91% reduction in the end-to-end
+response time of their critical callback chains.
+
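+ The arbitration idea can be pictured with the toy priority-queue sketch
+below; it only mimics the concept of a standalone resource server granting a
+single accelerator to the highest-priority pending callback, and it is not
+PAAM's actual ROS 2 executor code (callback names and priorities are invented).
+
+```python
+import heapq
+from dataclasses import dataclass, field
+
+@dataclass(order=True)
+class Request:
+    priority: int                       # lower value = more critical chain
+    callback: str = field(compare=False)
+
+class AcceleratorArbiter:
+    """Toy priority-driven accelerator access server (illustrative only)."""
+
+    def __init__(self):
+        self._queue = []
+
+    def request(self, callback: str, priority: int):
+        heapq.heappush(self._queue, Request(priority, callback))
+
+    def grant_next(self):
+        # Grant the accelerator to the highest-priority pending request.
+        return heapq.heappop(self._queue).callback if self._queue else None
+
+arbiter = AcceleratorArbiter()
+arbiter.request("camera_preprocess", priority=3)
+arbiter.request("emergency_brake_inference", priority=0)
+arbiter.request("map_update", priority=5)
+print(arbiter.grant_next())  # emergency_brake_inference
+```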
+
+ comment: 14 Pages, 14 Figures +
+
+
+
+
+ + ☆ QueSTMaps: Queryable Semantic Topological Maps for 3D Scene + Understanding + + +
+ Understanding the structural organisation of 3D indoor scenes in terms of +rooms is often accomplished via floorplan extraction. Robotic tasks such as +planning and navigation require a semantic understanding of the scene as well. +This is typically achieved via object-level semantic segmentation. However, +such methods struggle to segment out topological regions like "kitchen" in the +scene. In this work, we introduce a two-step pipeline. First, we extract a +topological map, i.e., floorplan of the indoor scene using a novel +multi-channel occupancy representation. Then, we generate CLIP-aligned features +and semantic labels for every room instance based on the objects it contains +using a self-attention transformer. Our language-topology alignment supports +natural language querying, e.g., a "place to cook" locates the "kitchen". We +outperform the current state-of-the-art on room segmentation by ~20% and room +classification by ~12%. Our detailed qualitative analysis and ablation studies +provide insights into the problem of joint structural and semantic 3D scene +understanding. + +
+
+
+
+
+ + ☆ Deep Reinforcement Learning-Based Approach for a Single Vehicle + Persistent Surveillance Problem with Fuel Constraints + + +
+ This article presents a deep reinforcement learning-based approach to tackle
+a persistent surveillance mission in which a single unmanned aerial vehicle,
+initially stationed at a depot and subject to fuel or time-of-flight
+constraints, must repeatedly visit a set of targets with equal priority. Owing
+to these constraints, the vehicle must be regularly refueled, or its battery
+must be recharged, at the depot. The objective of the problem is to determine
+an optimal sequence of visits to the targets that minimizes the maximum time
+elapsed between successive visits to any target while ensuring that the vehicle
+never runs out of fuel or charge. We present a deep reinforcement learning
+algorithm to solve this problem and report the results of numerical experiments
+that corroborate the effectiveness of this approach in comparison with
+common-sense greedy heuristics.
+
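+ To illustrate the kind of common-sense greedy heuristic used as a comparison
+point (my own toy baseline, not the paper's DRL policy or its exact heuristic),
+the sketch below always flies to the most overdue target and returns to the
+depot whenever the remaining fuel could not cover the next leg plus the trip
+home. Distances, fuel capacity and target positions are invented.
+
+```python
+import math
+
+def greedy_patrol(targets, depot=(0.0, 0.0), fuel_capacity=30.0, steps=20):
+    """Visit the target that has waited longest; refuel at the depot when the
+    next leg plus the return trip would exceed the remaining fuel."""
+    dist = lambda a, b: math.hypot(a[0] - b[0], a[1] - b[1])
+    pos, fuel, t = depot, fuel_capacity, 0.0
+    last_visit = {i: 0.0 for i in range(len(targets))}
+    plan = []
+    for _ in range(steps):
+        nxt = max(last_visit, key=lambda i: t - last_visit[i])  # most overdue
+        leg = dist(pos, targets[nxt])
+        if fuel < leg + dist(targets[nxt], depot):              # must refuel first
+            t += dist(pos, depot)
+            pos, fuel = depot, fuel_capacity
+            plan.append("depot")
+            continue
+        t += leg
+        fuel -= leg
+        pos = targets[nxt]
+        last_visit[nxt] = t
+        plan.append(nxt)
+    return plan
+
+print(greedy_patrol([(5.0, 0.0), (0.0, 7.0), (-6.0, -2.0)]))
+```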
+
+ comment: 6 pages +
+
+
+
+
+ + ☆ Large Language Models to the Rescue: Deadlock Resolution in Multi-Robot + Systems + + +
+ Multi-agent robotic systems are prone to deadlocks in an obstacle environment
+where the system can get stuck away from its desired location under a smooth
+low-level control policy. Without external intervention, often in the form of a
+high-level command, it is not possible to guarantee that a low-level control
+policy alone can resolve such deadlocks. Utilizing the generalizability and low
+data requirements of large language models (LLMs), this paper explores the
+possibility of using LLMs for deadlock resolution. We propose a hierarchical
+control framework where an LLM resolves deadlocks by assigning a leader and a
+direction for the leader to move along. A graph neural network (GNN) based
+low-level distributed control policy executes the assigned plan. We
+systematically study various prompting techniques to improve the LLM's
+performance in resolving deadlocks. In particular, as part of prompt
+engineering, we provide in-context examples for the LLMs. We conducted
+extensive experiments on various multi-robot environments with up to 15 agents
+and 40 obstacles. Our results demonstrate that LLM-based high-level planners
+are effective in resolving deadlocks in multi-robot systems.
+
+
+
+
+
+ + ☆ Policy-Guided Diffusion NeurIPS 2023 + + +
+ In many real-world settings, agents must learn from an offline dataset +gathered by some prior behavior policy. Such a setting naturally leads to +distribution shift between the behavior policy and the target policy being +trained - requiring policy conservatism to avoid instability and overestimation +bias. Autoregressive world models offer a different solution to this by +generating synthetic, on-policy experience. However, in practice, model +rollouts must be severely truncated to avoid compounding error. As an +alternative, we propose policy-guided diffusion. Our method uses diffusion +models to generate entire trajectories under the behavior distribution, +applying guidance from the target policy to move synthetic experience further +on-policy. We show that policy-guided diffusion models a regularized form of +the target distribution that balances action likelihood under both the target +and behavior policies, leading to plausible trajectories with high target +policy probability, while retaining a lower dynamics error than an offline +world model baseline. Using synthetic experience from policy-guided diffusion +as a drop-in substitute for real data, we demonstrate significant improvements +in performance across a range of standard offline reinforcement learning +algorithms and environments. Our approach provides an effective alternative to +autoregressive offline world models, opening the door to the controllable +generation of synthetic training data. + +
+
+ comment: Previously at the NeurIPS 2023 Workshop on Robot Learning +
+
+
+
+
+ + ☆ DaF-BEVSeg: Distortion-aware Fisheye Camera based Bird's Eye View + Segmentation with Occlusion Reasoning + + +
+ Semantic segmentation is an effective way to perform scene understanding.
+Recently, segmentation in 3D Bird's Eye View (BEV) space has become popular, as
+it is directly used by the driving policy. However, there is limited work on
+BEV segmentation for surround-view fisheye cameras, commonly used in commercial
+vehicles. As this task has no real-world public dataset and existing synthetic
+datasets do not handle amodal regions due to occlusion, we create a synthetic
+dataset using the Cognata simulator comprising diverse road types, weather, and
+lighting conditions. We generalize BEV segmentation to work with any camera
+model; this is useful for mixing diverse cameras. We implement a baseline by
+applying cylindrical rectification to the fisheye images and using a standard
+LSS-based BEV segmentation model. We demonstrate that we can achieve better
+performance without undistortion, which has the adverse effects of increased
+runtime due to pre-processing, reduced field-of-view, and resampling artifacts.
+Further, we introduce a distortion-aware learnable BEV pooling strategy that is
+more effective for fisheye cameras. We extend the model with an occlusion
+reasoning module, which is critical for estimation in BEV space. Qualitative
+performance of DaF-BEVSeg is showcased in the video at
+https://streamable.com/ge4v51.
+
+
+
+
+
+ + ☆ AgentsCoDriver: Large Language Model Empowered Collaborative Driving + with Lifelong Learning + + +
+ Connected and autonomous driving has been developing rapidly in recent years.
+However, current autonomous driving systems, which are primarily based on
+data-driven approaches, exhibit deficiencies in interpretability,
+generalization, and continual learning capabilities. In addition,
+single-vehicle autonomous driving systems lack the ability to collaborate and
+negotiate with other vehicles, which is crucial for the safety and efficiency
+of autonomous driving systems. In order to address these issues, we leverage
+large language models (LLMs) to develop a novel framework, AgentsCoDriver, that
+enables multiple vehicles to conduct collaborative driving. AgentsCoDriver
+consists of five modules: an observation module, a reasoning engine, a
+cognitive memory module, a reinforcement reflection module, and a communication
+module. It can accumulate knowledge, lessons, and experiences over time by
+continuously interacting with the environment, thereby making itself capable of
+lifelong learning. In addition, by leveraging the communication module,
+different agents can exchange information and realize negotiation and
+collaboration in complex traffic environments. Extensive experiments are
+conducted and show the superiority of AgentsCoDriver.
+
+
+
+
+
+ + ☆ Experimental System Design of an Active Fault-Tolerant Quadrotor + + +
+ Quadrotors have gained popularity over the last decade, aiding humans in
+complex tasks such as search and rescue, mapping and exploration. Despite their
+mechanical simplicity and versatility compared to other types of aerial
+vehicles, they remain vulnerable to rotor failures. In this paper, we propose
+an algorithmic and mechanical approach to addressing the quadrotor
+fault-tolerant problem in case of rotor failures. First, we present a
+fault-tolerant detection and control scheme that includes various attitude
+error metrics. The scheme transitions to a fault-tolerant control mode by
+surrendering the yaw control. Subsequently, to ensure compatibility with
+platform sensing constraints, we investigate how variations in the robot's
+rotational drag, achieved through a modular mechanical design appendage, keep
+the resulting yaw rates within sensor limits. This analysis offers a
+platform-agnostic framework for designing more reliable and robust quadrotors
+in the event of rotor failures. Extensive experimental results validate the
+proposed approach, providing insights into successfully designing a
+cost-effective quadrotor capable of fault-tolerant control. The overall design
+enhances safety in scenarios of faulty rotors, without the need for additional
+sensors or computational resources.
+
+
+ comment: Accepted to ICUAS 2024 +
+
+
+
+
+ + ☆ Statistical Modelling of Driving Scenarios in Road Traffic using Fleet + Data of Production Vehicles + + +
+ Ensuring the safety of road vehicles at an acceptable level requires the
+absence of any unreasonable risk arising from all potential hazards linked to
+the intended automated driving function and its implementation. The assurance
+that there are no unreasonable risks stemming from hazardous behaviours
+associated with functional insufficiencies is denoted as safety of the intended
+functionality (SOTIF), a concept outlined in the ISO 21448 standard. In this
+context, the acquisition of real driving data is considered essential for
+verification and validation. For this purpose, we are currently developing a
+method with which data collected representatively from production vehicles can
+be modelled into a knowledge-based system in the future: a system that
+represents the probabilities of occurrence of concrete driving scenarios over
+the statistical population of road traffic and makes them usable. The method
+includes the qualitative and quantitative abstraction of the drives recorded by
+the sensors in the vehicles, the possibility of subsequent wireless
+transmission of the abstracted data from the vehicles, and the derivation of
+the distributions and correlations of scenario parameters. This paper provides
+a summary of the research project and outlines its central idea. To this end,
+among other things, the needs for statistical information and data from road
+traffic are derived from ISO 21448, the current state of research is addressed,
+and methodical aspects are discussed.
+
+
+ comment: 12 pages, 4 figures, the article has been accepted for publication + and presentation during the 9th International ATZ Conference on Automated + Driving 2024 +
+
+
+
+
+ + ☆ Playing to Vision Foundation Model's Strengths in Stereo Matching + + +
+ Stereo matching has become a key technique for 3D environment perception in +intelligent vehicles. For a considerable time, convolutional neural networks +(CNNs) have remained the mainstream choice for feature extraction in this +domain. Nonetheless, there is a growing consensus that the existing paradigm +should evolve towards vision foundation models (VFM), particularly those +developed based on vision Transformers (ViTs) and pre-trained through +self-supervision on extensive, unlabeled datasets. While VFMs are adept at +extracting informative, general-purpose visual features, specifically for dense +prediction tasks, their performance often lacks in geometric vision tasks. This +study serves as the first exploration of a viable approach for adapting VFMs to +stereo matching. Our ViT adapter, referred to as ViTAS, is constructed upon +three types of modules: spatial differentiation, patch attention fusion, and +cross-attention. The first module initializes feature pyramids, while the +latter two aggregate stereo and multi-scale contextual information into +fine-grained features, respectively. ViTAStereo, which combines ViTAS with cost +volume-based stereo matching back-end processes, achieves the top rank on the +KITTI Stereo 2012 dataset and outperforms the second-best network StereoBase by +approximately 7.9% in terms of the percentage of error pixels, with a tolerance +of 3 pixels. Additional experiments across diverse scenarios further +demonstrate its superior generalizability compared to all other +state-of-the-art approaches. We believe this new paradigm will pave the way for +the next generation of stereo matching networks. + +
+
+
+
+
+ + ☆ Label-Efficient 3D Object Detection For Road-Side Units + + +
+ Occlusion presents a significant challenge for safety-critical applications +such as autonomous driving. Collaborative perception has recently attracted a +large research interest thanks to the ability to enhance the perception of +autonomous vehicles via deep information fusion with intelligent roadside units +(RSU), thus minimizing the impact of occlusion. While significant advancement +has been made, the data-hungry nature of these methods creates a major hurdle +for their real-world deployment, particularly due to the need for annotated RSU +data. Manually annotating the vast amount of RSU data required for training is +prohibitively expensive, given the sheer number of intersections and the effort +involved in annotating point clouds. We address this challenge by devising a +label-efficient object detection method for RSU based on unsupervised object +discovery. Our paper introduces two new modules: one for object discovery based +on a spatial-temporal aggregation of point clouds, and another for refinement. +Furthermore, we demonstrate that fine-tuning on a small portion of annotated +data allows our object discovery models to narrow the performance gap with, or +even surpass, fully supervised models. Extensive experiments are carried out in +simulated and real-world datasets to evaluate our method. + +
+
+ comment: IV 2024 +
+
+
+
+
+ + ☆ Towards Autonomous Driving with Small-Scale Cars: A Survey of Recent + Development + + +
+ While engaging with the unfolding revolution in autonomous driving, a
+challenge presents itself: how can we effectively raise awareness within
+society about this transformative trend? While full-scale autonomous driving
+vehicles often come with a hefty price tag, the emergence of small-scale car
+platforms offers a compelling alternative. These platforms not only serve as
+valuable educational tools for the broader public and young generations but
+also function as robust research platforms, contributing significantly to the
+ongoing advancements in autonomous driving technology. This survey outlines
+various small-scale car platforms, categorizing them and detailing the research
+advancements accomplished through their usage. The conclusion provides
+proposals for promising future directions in the field.
+
+
+
+
+
+ + ☆ AI-MOLE: Autonomous Iterative Motion Learning for Unknown Nonlinear + Dynamics with Extensive Experimental Validation + + +
+ This work proposes Autonomous Iterative Motion Learning (AI-MOLE), a method +that enables systems with unknown, nonlinear dynamics to autonomously learn to +solve reference tracking tasks. The method iteratively applies an input +trajectory to the unknown dynamics, trains a Gaussian process model based on +the experimental data, and utilizes the model to update the input trajectory +until desired tracking performance is achieved. Unlike existing approaches, the +proposed method determines necessary parameters automatically, i.e., AI-MOLE +works plug-and-play and without manual parameter tuning. Furthermore, AI-MOLE +only requires input/output information, but can also exploit available state +information to accelerate learning. + While other approaches are typically only validated in simulation or on a +single real-world testbed using manually tuned parameters, we present the +unprecedented result of validating the proposed method on three different +real-world robots and a total of nine different reference tracking tasks +without requiring any a priori model information or manual parameter tuning. +Over all systems and tasks, AI-MOLE rapidly learns to track the references +without requiring any manual parameter tuning at all, even if only input/output +information is available. + +
+
+ comment: 9 pages, 6 figures, journal article +
+
+
+
+
+ + ☆ Resilient Movement Planning for Continuum Robots + + +
+ The paper presents an experimental study of resilient path planning for
+continuum robots taking into account the multi-objective optimisation problem.
+To do this, we used two well-known algorithms, namely the Genetic algorithm and
+the A* algorithm, for path planning and the Analytical Hierarchy Process
+algorithm for path evaluation. In our experiment the Analytical Hierarchy
+Process algorithm considers four different criteria, i.e. distance, motor
+damage, mechanical damage and accuracy, each considered to contribute to the
+resilience of a continuum robot. The use of different criteria is necessary to
+increase the time between maintenance operations of the robot. The experiment
+shows that both algorithms can be used in combination with the Analytical
+Hierarchy Process algorithm for multi-criteria path planning, while the Genetic
+algorithm shows superior performance in the comparison of the two algorithms.
+
+
+
+
+
+ + ☆ Intelligence and Motion Models of Continuum Robots: an Overview + + +
+ Many technical solutions are bio-inspired. Octopus-inspired robotic arms
+belong to continuum robots, which are used in minimally invasive surgery or for
+technical system restoration in difficult-to-access areas. Continuum robot
+missions are bound to their motions, whereby the motion of the robots is
+controlled by humans via wireless communication. In case of a lost connection,
+robot autonomy is required. Distributed control and distributed decision-making
+mechanisms based on artificial intelligence approaches can be a promising
+solution to achieve autonomy of technical systems and to increase their
+resilience. However, these methods have not been well investigated yet.
+Octopuses are the living example of natural distributed intelligence, but their
+learning and decision-making mechanisms have also not been fully investigated
+and understood yet. Our major interest is investigating mechanisms of
+Distributed Artificial Intelligence as a basis for improving the resilience of
+complex systems. We decided to use a physical continuum robot prototype that is
+able to perform some basic movements for our research. The idea is to research
+how a technical system can be empowered to combine movements into sequences of
+motions by itself. For the experimental investigations, a suitable physical
+prototype has to be selected, and its motion control has to be implemented and
+automated. In this paper, we give an overview combining different fields of
+research, such as Distributed Artificial Intelligence and continuum robots,
+based on 98 publications. We provide a detailed description of the basic motion
+control models of continuum robots based on the literature reviewed, discuss
+different aspects of autonomy and give an overview of physical prototypes of
+continuum robots.
+
+
+
+
+
+ + ☆ Distributed Artificial Intelligence as a Means to Achieve + Self-X-Functions for Increasing Resilience: the First Steps + + +
+ Using sensors as a means to achieve self-awareness and artificial +intelligence for decision-making, may be a way to make complex systems +self-adaptive, autonomous and resilient. Investigating the combination of +distributed artificial intelligence methods and bio-inspired robotics can +provide results that will be helpful for implementing autonomy of such robots +and other complex systems. In this paper, we describe Distributed Artificial +Intelligence application area, the most common examples of continuum robots and +provide a description of our first steps towards implementing distributed +control. + +
+
+
+
+
+ + ☆ Efficient and Robust Point Cloud Registration via Heuristics-guided + Parameter Search + + +
+ Estimating the rigid transformation with 6 degrees of freedom based on a
+putative 3D correspondence set is a crucial procedure in point cloud
+registration. Existing correspondence identification methods usually lead to
+large outlier ratios (> 95% is common), underscoring the significance of robust
+registration methods. Many researchers turn to parameter search-based
+strategies (e.g., Branch-and-Bound) for robust registration. Although related
+methods show high robustness, their efficiency is limited by the
+high-dimensional search space. This paper proposes a heuristics-guided
+parameter search strategy to accelerate the search while maintaining high
+robustness. We first sample some correspondences (i.e., heuristics) and then
+only need to sequentially search the feasible regions that make each sample an
+inlier. Our strategy largely reduces the search space and can guarantee
+accuracy with only a few inlier samples, therefore enjoying an excellent
+trade-off between efficiency and robustness. Since directly parameterizing the
+6-dimensional nonlinear feasible region for efficient search is intractable, we
+construct a three-stage decomposition pipeline to reparameterize the feasible
+region, resulting in three lower-dimensional sub-problems that are easily
+solvable via our strategy. Besides reducing the search dimension, our
+decomposition enables the leverage of 1-dimensional interval stabbing at all
+three stages for search acceleration. Moreover, we propose a valid sampling
+strategy to guarantee our sampling effectiveness, and a compatibility
+verification setup to further accelerate our search. Extensive experiments on
+both simulated and real-world datasets demonstrate that our approach exhibits
+comparable robustness with state-of-the-art methods while achieving a
+significant efficiency boost.
+
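+ The 1-dimensional interval stabbing primitive mentioned above can be written
+as a short endpoint sweep; the version below is a generic textbook
+implementation used only to illustrate the idea, not the paper's code.
+
+```python
+def interval_stabbing(intervals):
+    """Return (best_value, count): a value lying inside the maximum number of
+    closed intervals [lo, hi], found with a sorted endpoint sweep."""
+    events = []
+    for lo, hi in intervals:
+        events.append((lo, 0))     # interval opens (opens sort before closes)
+        events.append((hi, 1))     # interval closes
+    events.sort()
+    best_val, best_cnt, cnt = None, 0, 0
+    for value, kind in events:
+        if kind == 0:
+            cnt += 1
+            if cnt > best_cnt:
+                best_cnt, best_val = cnt, value
+        else:
+            cnt -= 1
+    return best_val, best_cnt
+
+print(interval_stabbing([(0, 2), (1, 4), (1.5, 3), (5, 6)]))  # (1.5, 3)
+```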
+
+ comment: 21 pages, 16 figures. Accepted to IEEE Transactions on Pattern + Analysis and Machine Intelligence, 2024 +
+
+
+
+
+ + ☆ Adaptable Recovery Behaviors in Robotics: A Behavior Trees and Motion + Generators(BTMG) Approach for Failure Management + + +
+ In dynamic operational environments, particularly in collaborative robotics, +the inevitability of failures necessitates robust and adaptable recovery +strategies. Traditional automated recovery strategies, while effective for +predefined scenarios, often lack the flexibility required for on-the-fly task +management and adaptation to expected failures. Addressing this gap, we propose +a novel approach that models recovery behaviors as adaptable robotic skills, +leveraging the Behavior Trees and Motion Generators~(BTMG) framework for policy +representation. This approach distinguishes itself by employing reinforcement +learning~(RL) to dynamically refine recovery behavior parameters, enabling a +tailored response to a wide array of failure scenarios with minimal human +intervention. We assess our methodology through a series of progressively +challenging scenarios within a peg-in-a-hole task, demonstrating the approach's +effectiveness in enhancing operational efficiency and task success rates in +collaborative robotics settings. We validate our approach using a dual-arm KUKA +robot. + +
+
+
+
+
+ + ☆ Hierarchical Insights: Exploiting Structural Similarities for Reliable + 3D Semantic Segmentation IROS 2024 + + +
+ Safety-critical applications like autonomous driving call for robust 3D environment perception algorithms that can withstand highly diverse and ambiguous surroundings. The predictive performance of any classification model strongly depends on the underlying dataset and the prior knowledge conveyed by the annotated labels. While the labels provide a basis for the learning process, they usually fail to represent inherent relations between the classes; such relational representations are a natural element of the human perception system. We propose a training strategy which enables a 3D LiDAR semantic segmentation model to learn structural relationships between the different classes through abstraction. We achieve this by implicitly modeling those relationships through a learning rule for hierarchical multi-label classification (HMC). With a detailed analysis, we show how this training strategy not only improves the model's confidence calibration, but also preserves additional information for downstream tasks like fusion, prediction, and planning. + +
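For intuition, a generic hierarchical multi-label classification objective can be sketched as follows: each point is supervised on its leaf class and on all ancestor classes with a binary cross-entropy over a multi-hot target. The toy hierarchy and class names below are assumptions for illustration, not the taxonomy or learning rule used in the paper.

# Hierarchical multi-label classification sketch (PyTorch).
import torch
import torch.nn.functional as F

HIERARCHY = {      # leaf class index -> list of ancestor class indices (toy example)
    0: [4],        # car    -> vehicle
    1: [4],        # truck  -> vehicle
    2: [5],        # person -> living
    3: [5],        # animal -> living
}
NUM_NODES = 6      # 4 leaves + 2 parent nodes

def multi_hot(leaf_labels):
    """Expand leaf labels (N,) into multi-hot targets (N, NUM_NODES)."""
    target = torch.zeros(leaf_labels.shape[0], NUM_NODES)
    for i, leaf in enumerate(leaf_labels.tolist()):
        target[i, leaf] = 1.0
        for anc in HIERARCHY[leaf]:
            target[i, anc] = 1.0
    return target

def hmc_loss(logits, leaf_labels):
    """logits: (N, NUM_NODES) raw scores, one per node in the hierarchy."""
    return F.binary_cross_entropy_with_logits(logits, multi_hot(leaf_labels))

logits = torch.randn(8, NUM_NODES)
labels = torch.randint(0, 4, (8,))
print(hmc_loss(logits, labels))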
+
+ comment: submitted to IROS 2024 +
+
+
+
+
+ + ☆ EVE: Enabling Anyone to Train Robot using Augmented Reality + + +
+ The increasing affordability of robot hardware is accelerating the integration of robots into everyday activities. However, training a robot to automate a task typically requires physical robots and expensive demonstration data from trained human annotators. Consequently, only those with access to physical robots produce demonstrations to train robots. To mitigate this issue, we introduce EVE, an iOS app that enables everyday users to train robots using intuitive augmented reality visualizations without needing a physical robot. With EVE, users can collect demonstrations by specifying waypoints with their hands, visually inspecting the environment for obstacles, modifying existing waypoints, and verifying collected trajectories. In a user study ($N=14$, $D=30$) consisting of three common tabletop tasks, EVE outperformed three state-of-the-art interfaces in success rate and was comparable to kinesthetic teaching (physically moving a real robot) in completion time, usability, motion intent communication, enjoyment, and preference ($mean_{p}=0.30$). We conclude by enumerating limitations and design considerations for future AR-based demonstration collection systems for robotics. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ Incremental Joint Learning of Depth, Pose and Implicit Scene + Representation on Monocular Camera in Large-scale Scenes + + +
+ Dense scene reconstruction for photo-realistic view synthesis has various applications, such as VR/AR and autonomous vehicles. However, most existing methods have difficulties in large-scale scenes due to three core challenges: (a) inaccurate depth input, since accurate depth is impossible to obtain in real-world large-scale scenes; (b) inaccurate pose estimation, as most existing approaches rely on accurate pre-estimated camera poses; and (c) insufficient scene representation capability, because a single global radiance field lacks the capacity to effectively scale to large-scale scenes. To this end, we propose an incremental joint learning framework, which can achieve accurate depth, pose estimation, and large-scale scene reconstruction. A vision transformer-based network is adopted as the backbone to enhance performance in scale information estimation. For pose estimation, a feature-metric bundle adjustment (FBA) method is designed for accurate and robust camera tracking in large-scale scenes. In terms of implicit scene representation, we propose an incremental scene representation method to construct the entire large-scale scene as multiple local radiance fields to enhance the scalability of 3D scene representation. Extensive experiments have been conducted to demonstrate the effectiveness and accuracy of our method in depth estimation, pose estimation, and large-scale scene reconstruction. + +
+
+
+
+
+ + ☆ Diffusion-Based Point Cloud Super-Resolution for mmWave Radar Data + + +
+ The millimeter-wave radar sensor maintains stable performance under adverse environmental conditions, making it a promising solution for all-weather perception tasks, such as outdoor mobile robotics. However, the radar point clouds are relatively sparse and contain a massive number of ghost points, which greatly limits the development of mmWave radar technology. In this paper, we propose a novel point cloud super-resolution approach for 3D mmWave radar data, named Radar-diffusion. Our approach employs the diffusion model defined by mean-reverting stochastic differential equations (SDE). Using our proposed new objective function with supervision from corresponding LiDAR point clouds, our approach efficiently handles radar ghost points and enhances the sparse mmWave radar point clouds to dense LiDAR-like point clouds. We evaluate our approach on two different datasets, and the experimental results show that our method outperforms the state-of-the-art baseline methods in 3D radar super-resolution tasks. Furthermore, we demonstrate that our enhanced radar point cloud is capable of supporting downstream radar point-based registration tasks. + +
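As a rough illustration of the mean-reverting SDE idea mentioned above, the sketch below simulates an Ornstein-Uhlenbeck-style forward (noising) process with Euler-Maruyama steps, which drifts a point set toward a mean state while injecting noise. The parameter values, shapes, and the choice of mean state are assumptions for illustration, not the paper's formulation.

# Euler-Maruyama simulation of a mean-reverting (Ornstein-Uhlenbeck) SDE:
#   dx = theta * (mu - x) dt + sigma dW
import numpy as np

def ou_forward(x0, mu, theta=2.0, sigma=0.5, steps=100, dt=0.01, seed=0):
    rng = np.random.default_rng(seed)
    x = x0.copy()
    for _ in range(steps):
        drift = theta * (mu - x) * dt
        noise = sigma * np.sqrt(dt) * rng.standard_normal(x.shape)
        x = x + drift + noise
    return x

sparse_points = np.random.rand(64, 3)        # stand-in for a radar point cloud
mean_state = np.zeros_like(sparse_points)    # state the SDE reverts toward
print(ou_forward(sparse_points, mean_state).std())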
+
+
+
+
+ + ☆ 3D Branch Point Cloud Completion for Robotic Pruning in Apple Orchards IROS2024 + + +
+ Robotic branch pruning is a rapidly growing research area aimed at coping with the labor shortage in agriculture. One fundamental requirement in robotic pruning is the perception of the detailed geometry and topology of branches. However, the point clouds obtained in agricultural settings often exhibit incompleteness due to several constraints, thereby restricting the accuracy of downstream robotic pruning. In this work, we addressed the issue of point cloud quality through a simulation-based deep neural network, leveraging a Real-to-Simulation (Real2Sim) data generation pipeline that not only eliminates the need for manual parameterization but also guarantees the realism of simulated data. The simulation-based neural network was applied to jointly perform point cloud completion and skeletonization on real-world partial branches, without additional real-world training. The Sim2Real qualitative completion and skeletonization results showed the model's remarkable capability for geometry reconstruction and topology prediction. Additionally, we quantitatively evaluated the Sim2Real performance by comparing branch-level trait characterization errors using raw incomplete data and complete data. The Mean Absolute Error (MAE) was reduced by 75% and 8% for branch diameter and branch angle estimation, respectively, using the best complete data, which indicates the effectiveness of the Real2Sim data in a zero-shot generalization setting. The characterization improvements contributed to the precision and efficacy of robotic branch pruning. + +
+
+ comment: Submitted to IROS2024 +
+
+
+
+
+ + ☆ Robot Safe Planning In Dynamic Environments Based On Model Predictive + Control Using Control Barrier Function + + +
+ Implementing obstacle avoidance in dynamic environments is a challenging problem for robots. Model predictive control (MPC) is a popular strategy for dealing with this type of problem, and recent work mainly uses control barrier functions (CBFs) as hard constraints to ensure that the system state remains in the safe set. However, in crowded scenarios, effective solutions may not be obtained due to infeasibility problems, resulting in degraded controller performance. We propose a new MPC framework that integrates CBFs to tackle the issue of obstacle avoidance in dynamic environments, in which the infeasibility problem induced by hard constraints operating over the whole prediction horizon is solved by softening the constraints and introducing an exact penalty, prompting the robot to actively seek out new paths. At the same time, a generalized CBF is extended as a single-step safety constraint of the controller to enhance the safety of the robot during navigation. The efficacy of the proposed method is first shown through simulation experiments, in which a double-integrator system and a unicycle system are employed, and the proposed method outperforms other controllers in terms of safety, feasibility, and navigation efficiency. Furthermore, a real-world experiment on an MR1000 robot is conducted to demonstrate the effectiveness of the proposed method. + +
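To make the "soften the CBF constraint with an exact penalty" idea concrete, here is a toy sketch: a 1-D double integrator with a linear barrier h(x) = p_max - p, where the discrete CBF condition over the horizon is relaxed with nonnegative slacks that enter the cost through an L1 penalty. This is an illustrative toy in cvxpy under assumed dynamics and parameters, not the controller proposed in the paper.

# Minimal MPC sketch with a softened discrete CBF constraint and exact penalty.
import cvxpy as cp
import numpy as np

dt, N = 0.1, 20
A = np.array([[1.0, dt], [0.0, 1.0]])
B = np.array([[0.0], [dt]])
p_max, gamma, rho = 1.0, 0.3, 1e3          # barrier bound, CBF rate, penalty weight
x0 = np.array([0.0, 2.0])                  # start at p = 0, moving toward the bound

x = cp.Variable((2, N + 1))
u = cp.Variable((1, N))
s = cp.Variable(N, nonneg=True)            # slacks that soften the CBF constraint

def h(state):                              # barrier: positive inside the safe set
    return p_max - state[0]

cost = cp.sum_squares(u) + rho * cp.sum(s)
constraints = [x[:, 0] == x0]
for k in range(N):
    constraints += [x[:, k + 1] == A @ x[:, k] + B @ u[:, k],
                    cp.abs(u[:, k]) <= 5.0,
                    # softened discrete CBF: h(x_{k+1}) >= (1 - gamma) h(x_k) - s_k
                    h(x[:, k + 1]) >= (1 - gamma) * h(x[:, k]) - s[k]]

cp.Problem(cp.Minimize(cost), constraints).solve()
print("max position:", x.value[0].max(), "max slack:", s.value.max())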
+
+
+
+
+ + ☆ Efficient Multi-Task Reinforcement Learning via Task-Specific Action + Correction + + +
+ Multi-task reinforcement learning (MTRL) demonstrates potential for enhancing the generalization of a robot, enabling it to perform multiple tasks concurrently. However, the performance of MTRL may still be susceptible to conflicts between tasks and negative interference. To facilitate efficient MTRL, we propose Task-Specific Action Correction (TSAC), a general and complementary approach designed for the simultaneous learning of multiple tasks. TSAC decomposes policy learning into two separate policies: a shared policy (SP) and an action correction policy (ACP). To alleviate conflicts resulting from excessive focus on specific tasks' details in SP, ACP incorporates goal-oriented sparse rewards, enabling an agent to adopt a long-term perspective and achieve generalization across tasks. These additional rewards transform the original problem into a multi-objective MTRL problem. Furthermore, to convert the multi-objective MTRL into a single-objective formulation, TSAC assigns a virtual expected budget to the sparse rewards and employs the Lagrangian method to transform a constrained single-objective optimization into an unconstrained one. Experimental evaluations conducted on Meta-World's MT10 and MT50 benchmarks demonstrate that TSAC outperforms existing state-of-the-art methods, achieving significant improvements in both sample efficiency and effective action execution. + +
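The Lagrangian step described above can be sketched generically: a constraint of the form E[R_sparse] >= budget is folded into the objective with a multiplier that is adapted by dual updates. The returns, budget value, and learning rate below are stand-ins for illustration, not the TSAC implementation.

# Generic Lagrangian-relaxation sketch:
#   max E[R_dense]  s.t.  E[R_sparse] >= budget
# becomes L = E[R_dense] + lam * (E[R_sparse] - budget), with a dual update on lam.
import numpy as np

lam, lr_lambda, budget = 1.0, 0.05, 0.6

def lagrangian_objective(dense_returns, sparse_returns, lam, budget):
    return np.mean(dense_returns) + lam * (np.mean(sparse_returns) - budget)

for step in range(200):
    # Stand-ins for batch returns produced by the current policies.
    dense_returns = np.random.normal(1.0, 0.1, size=32)
    sparse_returns = np.random.binomial(1, 0.5, size=32).astype(float)
    # (A policy update would maximize lagrangian_objective here.)
    # Dual update: grow lam while the sparse-reward budget is violated.
    lam = max(0.0, lam - lr_lambda * (np.mean(sparse_returns) - budget))

print("final multiplier:", lam)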
+
+
+
+
+ + ☆ Body Design and Gait Generation of Chair-Type Asymmetrical Tripedal + Low-rigidity Robot + + +
+ In this study, a chair-type asymmetric tripedal low-rigidity robot was designed based on the three-legged chair character in the movie "Suzume", and its gait was generated. Its body structure consists of three legs that are asymmetric to the body, so it cannot be easily balanced. In addition, the actuator is a servo motor that can only receive feed-forward rotational angle commands, and the sensor can only sense the robot's posture quaternion. With such an asymmetric and imperfect body structure, we analyzed how gait is generated for walking and stand-up motions by generating gaits with two different methods: a method using linear interpolation to connect the postures necessary for the gait, discovered through trial and error on the actual robot, and a method using a gait generated by reinforcement learning in the simulator and transferring it to the actual robot. Both methods were able to generate gaits that realized walking and stand-up motions. Interesting gait patterns, which differed depending on the method, were observed and confirmed on the actual robot. Our code and demonstration videos are available here: https://github.com/shin0805/Chair-TypeAsymmetricalTripedalRobot.git + +
+
+ comment: Accepted at RoboSoft2024, website - + https://shin0805.github.io/chair-type-tripedal-robot/ , YouTube - + https://youtu.be/-f8LDlhmdBg +
+
+
+
+
+ + ☆ GenCHiP: Generating Robot Policy Code for High-Precision and + Contact-Rich Manipulation Tasks + + +
+ Large Language Models (LLMs) have been successful at generating robot policy +code, but so far these results have been limited to high-level tasks that do +not require precise movement. It is an open question how well such approaches +work for tasks that require reasoning over contact forces and working within +tight success tolerances. We find that, with the right action space, LLMs are +capable of successfully generating policies for a variety of contact-rich and +high-precision manipulation tasks, even under noisy conditions, such as +perceptual errors or grasping inaccuracies. Specifically, we reparameterize the +action space to include compliance with constraints on the interaction forces +and stiffnesses involved in reaching a target pose. We validate this approach +on subtasks derived from the Functional Manipulation Benchmark (FMB) and NIST +Task Board Benchmarks. Exposing this action space alongside methods for +estimating object poses improves policy generation with an LLM by greater than +3x and 4x when compared to non-compliant action spaces + +
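The abstract describes reparameterizing the action space to include compliance. As a rough illustration of what such an action could carry, the sketch below bundles a target pose with stiffness and a force limit, as a Cartesian impedance controller might track; all field names, default values, and the flattening helper are hypothetical and not the interface used in the paper.

# Hypothetical compliance-aware action sketch.
from dataclasses import dataclass, field
from typing import List

@dataclass
class CompliantAction:
    target_position: List[float]            # x, y, z in meters
    target_orientation: List[float]         # quaternion (x, y, z, w)
    stiffness: List[float] = field(default_factory=lambda: [400.0] * 3 + [30.0] * 3)
    max_force: float = 15.0                 # clip contact forces (N)

def to_flat_vector(a: CompliantAction) -> List[float]:
    """Flatten for a generated policy or a low-level controller."""
    return a.target_position + a.target_orientation + a.stiffness + [a.max_force]

insert_peg = CompliantAction(target_position=[0.5, 0.0, 0.12],
                             target_orientation=[0.0, 1.0, 0.0, 0.0],
                             stiffness=[300.0, 300.0, 150.0, 20.0, 20.0, 20.0],
                             max_force=10.0)
print(to_flat_vector(insert_peg))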
+
+ comment: 14 pages, 12 figures +
+
+
+
+
+ + ☆ Counting Objects in a Robotic Hand + + +
+ A robot performing multi-object grasping needs to sense the number of objects +in the hand after grasping. The count plays an important role in determining +the robot's next move and the outcome and efficiency of the whole pick-place +process. This paper presents a data-driven contrastive learning-based counting +classifier with a modified loss function as a simple and effective approach for +object counting despite significant occlusion challenges caused by robotic +fingers and objects. The model was validated against other models with three +different common shapes (spheres, cylinders, and cubes) in simulation and in a +real setup. The proposed contrastive learning-based counting approach achieved +above 96\% accuracy for all three objects in the real setup. + +
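A generic way to combine count classification with a contrastive objective is sketched below: embeddings of grasps with the same object count are pulled together via a supervised contrastive term, alongside a cross-entropy count head. This is a standard formulation given as an assumption for illustration; the paper's modified loss is not specified in the abstract.

# Counting classifier with a supervised-contrastive term (PyTorch sketch).
import torch
import torch.nn.functional as F

def supervised_contrastive(embeddings, counts, temperature=0.1):
    z = F.normalize(embeddings, dim=1)
    sim = z @ z.t() / temperature                       # pairwise similarities
    mask = counts.unsqueeze(0) == counts.unsqueeze(1)   # same-count pairs are positives
    mask.fill_diagonal_(False)
    logits = sim - torch.eye(len(z)) * 1e9              # exclude self-similarity
    log_prob = logits - torch.logsumexp(logits, dim=1, keepdim=True)
    pos_per_row = mask.sum(1).clamp(min=1)
    return -(log_prob * mask).sum(1).div(pos_per_row).mean()

def total_loss(embeddings, count_logits, counts, alpha=0.5):
    return F.cross_entropy(count_logits, counts) + alpha * supervised_contrastive(embeddings, counts)

emb = torch.randn(16, 128)
logits = torch.randn(16, 4)               # predict 0..3 objects in the hand
counts = torch.randint(0, 4, (16,))
print(total_loss(emb, logits, counts))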
+
+
+
+
+ + ☆ GOAT-Bench: A Benchmark for Multi-Modal Lifelong Navigation + + +
+ The Embodied AI community has made significant strides in visual navigation +tasks, exploring targets from 3D coordinates, objects, language descriptions, +and images. However, these navigation models often handle only a single input +modality as the target. With the progress achieved so far, it is time to move +towards universal navigation models capable of handling various goal types, +enabling more effective user interaction with robots. To facilitate this goal, +we propose GOAT-Bench, a benchmark for the universal navigation task referred +to as GO to AnyThing (GOAT). In this task, the agent is directed to navigate to +a sequence of targets specified by the category name, language description, or +image in an open-vocabulary fashion. We benchmark monolithic RL and modular +methods on the GOAT task, analyzing their performance across modalities, the +role of explicit and implicit scene memories, their robustness to noise in goal +specifications, and the impact of memory in lifelong scenarios. + +
+
+
+
+
+ + ☆ MORPHeus: a Multimodal One-armed Robot-assisted Peeling System with + Human Users In-the-loop + + +
+ Meal preparation is an important instrumental activity of daily +living~(IADL). While existing research has explored robotic assistance in meal +preparation tasks such as cutting and cooking, the crucial task of peeling has +received less attention. Robot-assisted peeling, conventionally a bimanual +task, is challenging to deploy in the homes of care recipients using two +wheelchair-mounted robot arms due to ergonomic and transferring challenges. +This paper introduces a robot-assisted peeling system utilizing a single +robotic arm and an assistive cutting board, inspired by the way individuals +with one functional hand prepare meals. Our system incorporates a multimodal +active perception module to determine whether an area on the food is peeled, a +human-in-the-loop long-horizon planner to perform task planning while catering +to a user's preference for peeling coverage, and a compliant controller to peel +the food items. We demonstrate the system on 12 food items representing the +extremes of different shapes, sizes, skin thickness, surface textures, skin vs +flesh colors, and deformability. + +
+
+
+
+
+ + ☆ Learning Strategies For Successful Crowd Navigation + + +
+ Teaching autonomous mobile robots to successfully navigate human crowds is a challenging task. Not only does it require planning, but it also requires adherence to social norms, which may differ from one context to another. Here we focus on crowd navigation, using a neural network to learn specific strategies in-situ with a robot. This allows us to take into account human behavior and reactions toward a real robot, as well as learn strategies that are specific to various scenarios in that context. A CNN takes a top-down image of the scene as input and outputs the next action for the robot to take in terms of speed and angle. Here we present the method and experimental results, and quantitatively evaluate our approach. + +
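A minimal sketch of the described input/output mapping is given below: a small CNN that takes a top-down scene image and outputs a (speed, angle) command. The layer sizes are assumptions for illustration; the paper's exact architecture is not given in the abstract.

# Top-down image -> (speed, angle) policy sketch (PyTorch).
import torch
import torch.nn as nn

class CrowdNavCNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 16, 5, stride=2, padding=2), nn.ReLU(),
            nn.Conv2d(16, 32, 3, stride=2, padding=1), nn.ReLU(),
            nn.Conv2d(32, 64, 3, stride=2, padding=1), nn.ReLU(),
            nn.AdaptiveAvgPool2d(1),
        )
        self.head = nn.Linear(64, 2)   # outputs: [speed, steering angle]

    def forward(self, img):
        return self.head(self.features(img).flatten(1))

policy = CrowdNavCNN()
top_down = torch.randn(1, 3, 128, 128)      # one top-down RGB frame
speed, angle = policy(top_down)[0]
print(float(speed), float(angle))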
+
+ comment: 8 pages +
+
+
+
+
+ + ♻ ☆ Towards Large-Scale Incremental Dense Mapping using Robot-centric + Implicit Neural Representation + + +
+ Large-scale dense mapping is vital in robotics, digital twins, and virtual +reality. Recently, implicit neural mapping has shown remarkable reconstruction +quality. However, incremental large-scale mapping with implicit neural +representations remains problematic due to low efficiency, limited video +memory, and the catastrophic forgetting phenomenon. To counter these +challenges, we introduce the Robot-centric Implicit Mapping (RIM) technique for +large-scale incremental dense mapping. This method employs a hybrid +representation, encoding shapes with implicit features via a multi-resolution +voxel map and decoding signed distance fields through a shallow MLP. We +advocate for a robot-centric local map to boost model training efficiency and +curb the catastrophic forgetting issue. A decoupled scalable global map is +further developed to archive learned features for reuse and maintain constant +video memory consumption. Validation experiments demonstrate our method's +exceptional quality, efficiency, and adaptability across diverse scales and +scenes over advanced dense mapping methods using range sensors. Our system's +code will be accessible at https://github.com/HITSZ-NRSL/RIM.git. + +
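The hybrid representation described above (implicit features in a voxel map, decoded to signed distance by a shallow MLP) can be sketched in a simplified form: a single-resolution feature grid queried by nearest-voxel lookup and a two-layer decoder. Grid size, feature dimension, and the lookup scheme are simplifying assumptions, not the paper's multi-resolution design.

# Voxel feature grid + shallow MLP SDF decoder (PyTorch sketch).
import torch
import torch.nn as nn

class VoxelSDFMap(nn.Module):
    def __init__(self, grid_size=64, feat_dim=8, extent=10.0):
        super().__init__()
        self.extent, self.grid_size = extent, grid_size
        self.features = nn.Parameter(torch.zeros(grid_size, grid_size, grid_size, feat_dim))
        self.decoder = nn.Sequential(nn.Linear(feat_dim, 32), nn.ReLU(), nn.Linear(32, 1))

    def forward(self, pts):                       # pts: (N, 3) in [-extent/2, extent/2]
        idx = ((pts / self.extent + 0.5) * self.grid_size).long()
        idx = idx.clamp(0, self.grid_size - 1)
        feat = self.features[idx[:, 0], idx[:, 1], idx[:, 2]]
        return self.decoder(feat).squeeze(-1)     # signed distance per point

sdf_map = VoxelSDFMap()
points = torch.rand(1024, 3) * 10.0 - 5.0
print(sdf_map(points).shape)                      # torch.Size([1024])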
+
+
+
+
+ + ♻ ☆ Fast and Adaptive Multi-agent Planning under Collaborative Temporal + Logic Tasks via Poset Products + + +
+ Efficient coordination and planning is essential for large-scale multi-agent systems that collaborate in a shared dynamic environment. Heuristic search methods or learning-based approaches often lack guarantees on correctness and performance. Moreover, when the collaborative tasks contain both spatial and temporal requirements, e.g., as Linear Temporal Logic (LTL) formulas, formal methods provide a verifiable framework for task planning. However, since the planning complexity grows exponentially with the number of agents and the length of the task formula, existing studies are mostly limited to small artificial cases. To address this issue, a new planning paradigm is proposed in this work for system-wide temporal task formulas that are released online and continually. It avoids two common bottlenecks in the traditional methods, i.e., (i) the direct translation of the complete task formula to the associated Büchi automaton; and (ii) the synchronized product between the Büchi automaton and the transition models of all agents. Instead, an adaptive planning algorithm is proposed that computes the product of relaxed partially-ordered sets (R-posets) on-the-fly, and assigns these subtasks to the agents subject to the ordering constraints. It is shown that the first valid plan can be derived with polynomial time and memory complexity w.r.t. the system size and the formula length. Our method can handle task formulas with a length of more than 400 and a fleet of more than 400 agents, while most existing methods fail at a formula length of 25 within a reasonable duration. The proposed method is validated on large fleets of service robots in both simulation and hardware experiments. + +
+
+ comment: 16 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Industrial Application of 6D Pose Estimation for Robotic Manipulation in + Automotive Internal Logistics + + +
+ Despite the advances in robotics, a large proportion of the parts-handling tasks in the automotive industry's internal logistics are not automated but are still performed by humans. A key component to competitively automate these processes is a 6D pose estimation that can handle a large number of different parts, is adaptable to new parts with little manual effort, and is sufficiently accurate and robust with respect to industry requirements. In this context, the question arises as to the current status quo with respect to these measures. To address this, we built a representative 6D pose estimation pipeline with state-of-the-art components, from economically scalable real-to-synthetic data generation to pose estimators, and evaluated it on automotive parts with regard to a realistic sequencing process. We found that, using these data generation approaches, the performance of the trained 6D pose estimators is promising but does not meet industry requirements. We reveal that the reason for this is the estimators' inability to provide reliable uncertainties for their poses, rather than their inability to provide sufficiently accurate poses. We further analyzed how RGB- and RGB-D-based approaches compare in this context and show that they are differently vulnerable to the domain gap induced by synthetic data. + +
+
+ comment: Accepted for publication at IEEE International Conference on + Automation Science and Engineering (CASE 2023) +
+
+
+
+
+ + ♻ ☆ Multi-AGV Path Planning Method via Reinforcement Learning and Particle + Filters + + +
+ The Reinforcement Learning (RL) algorithm, renowned for its robust learning capability and search stability, has garnered significant attention and found extensive application in Automated Guided Vehicle (AGV) path planning. However, RL planning algorithms encounter challenges stemming from the substantial variance of neural networks caused by environmental instability and significant fluctuations in system structure. These challenges manifest in slow convergence speed and low learning efficiency. To tackle this issue, this paper presents the Particle Filter-Double Deep Q-Network (PF-DDQN) approach, which incorporates the Particle Filter (PF) into multi-AGV reinforcement learning path planning. The PF-DDQN method leverages the imprecise weight values of the network as state values to formulate the state space equation. Through the iterative fusion process of neural networks and particle filters, the DDQN model is optimized to acquire the optimal true weight values, thus enhancing the algorithm's efficiency. The proposed method's effectiveness and superiority are validated through numerical simulations: the proposed algorithm surpasses the traditional DDQN algorithm in path-planning quality and training time by 92.62% and 76.88%, respectively. By integrating the Particle Filter and optimizing the DDQN model, PF-DDQN thus addresses the challenges encountered by RL planning algorithms in AGV path planning. + +
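For readers unfamiliar with the particle filter half of this combination, here is a generic bootstrap particle filter (predict, weight, resample) tracking a scalar quantity from noisy observations. It illustrates the building block only; the way PF-DDQN fuses particles with network weights is not reproduced here, and all parameter values are assumptions.

# Generic bootstrap particle filter sketch.
import numpy as np

def particle_filter(observations, n_particles=500, process_std=0.1, obs_std=0.5, seed=0):
    rng = np.random.default_rng(seed)
    particles = rng.normal(0.0, 1.0, n_particles)
    estimates = []
    for z in observations:
        # Predict: diffuse particles with process noise.
        particles += rng.normal(0.0, process_std, n_particles)
        # Weight: Gaussian likelihood of the observation under each particle.
        weights = np.exp(-0.5 * ((z - particles) / obs_std) ** 2)
        weights /= weights.sum()
        # Resample: draw particles proportional to their weights.
        particles = rng.choice(particles, size=n_particles, p=weights)
        estimates.append(particles.mean())
    return np.array(estimates)

true_value = 2.0
obs = true_value + np.random.default_rng(1).normal(0, 0.5, 50)
print(particle_filter(obs)[-1])   # estimate converges near 2.0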
+
+
+
+
+ + ♻ ☆ Scalable 3D Registration via Truncated Entry-wise Absolute Residuals CVPR 2024 + + +
+ Given an input set of $3$D point pairs, the goal of outlier-robust $3$D +registration is to compute some rotation and translation that align as many +point pairs as possible. This is an important problem in computer vision, for +which many highly accurate approaches have been recently proposed. Despite +their impressive performance, these approaches lack scalability, often +overflowing the $16$GB of memory of a standard laptop to handle roughly +$30,000$ point pairs. In this paper, we propose a $3$D registration approach +that can process more than ten million ($10^7$) point pairs with over $99\%$ +random outliers. Moreover, our method is efficient, entails low memory costs, +and maintains high accuracy at the same time. We call our method TEAR, as it +involves minimizing an outlier-robust loss that computes Truncated Entry-wise +Absolute Residuals. To minimize this loss, we decompose the original +$6$-dimensional problem into two subproblems of dimensions $3$ and $2$, +respectively, solved in succession to global optimality via a customized +branch-and-bound method. While branch-and-bound is often slow and unscalable, +this does not apply to TEAR as we propose novel bounding functions that are +tight and computationally efficient. Experiments on various datasets are +conducted to validate the scalability and efficiency of our method. + +
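The truncated entry-wise absolute residual objective itself is simple to state; a sketch of evaluating it for a candidate rotation and translation follows. The paper's contribution is the decomposed branch-and-bound search that minimizes this loss globally, which is not reproduced here; the threshold and data are illustrative.

# TEAR-style loss: entry-wise absolute residuals, truncated and summed.
import numpy as np

def tear_loss(src, dst, R, t, threshold=0.1):
    """src, dst: (N, 3) putative correspondences; R: (3, 3); t: (3,)."""
    residuals = np.abs(dst - (src @ R.T + t))          # entry-wise absolute residuals
    return np.minimum(residuals, threshold).sum()      # truncation caps outlier influence

rng = np.random.default_rng(0)
src = rng.random((1000, 3))
R_true, t_true = np.eye(3), np.array([0.2, -0.1, 0.05])
dst = src @ R_true.T + t_true
dst[:950] = rng.random((950, 3)) * 5.0                 # 95% random outliers
print(tear_loss(src, dst, R_true, t_true))             # small inlier residuals + capped outliers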
+
+ comment: 24 pages, 12 figures. Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ GeRM: A Generalist Robotic Model with Mixture-of-experts for Quadruped + Robot + + +
+ Multi-task robot learning holds significant importance in tackling diverse and complex scenarios. However, current approaches are hindered by performance issues and difficulties in collecting training datasets. In this paper, we propose GeRM (Generalist Robotic Model). We utilize offline reinforcement learning to optimize data utilization strategies to learn from both demonstrations and sub-optimal data, thus surpassing the limitations of human demonstrations. Thereafter, we employ a transformer-based VLA network to process multi-modal inputs and output actions. By introducing the Mixture-of-Experts structure, GeRM allows faster inference speed with higher total model capacity, and thus resolves the issue of limited RL parameters, enhancing model performance in multi-task learning while controlling computational costs. Through a series of experiments, we demonstrate that GeRM outperforms other methods across all tasks, while also validating its efficiency in both training and inference processes. Additionally, we uncover its potential to acquire emergent skills. We also contribute the QUARD-Auto dataset, collected automatically to support our training approach and foster advancements in multi-task quadruped robot learning. This work presents a new paradigm for reducing the cost of collecting robot data and driving progress in the multi-task learning community. You can reach our project and video through the link: https://songwxuan.github.io/GeRM/ . + +
+
+
+
+
+ + ♻ ☆ CC-VPSTO: Chance-Constrained Via-Point-based Stochastic Trajectory + Optimisation for Safe and Efficient Online Robot Motion Planning + + +
+ Safety in the face of uncertainty is a key challenge in robotics. We introduce a real-time capable framework to generate safe and task-efficient robot motions for stochastic control problems. We frame this as a chance-constrained optimisation problem, constraining the probability that the controlled system violates a safety constraint to be below a set threshold. To estimate this probability, we propose a Monte Carlo approximation. We suggest several ways to construct the problem given a fixed number of uncertainty samples, such that it is a reliable over-approximation of the original problem, i.e. any solution to the sample-based problem adheres to the original chance-constraint with high confidence. To solve the resulting problem, we integrate it into our motion planner VP-STO and name the enhanced framework Chance-Constrained (CC)-VPSTO. The strengths of our approach lie in i) its generality, without assumptions on the underlying uncertainty distribution, system dynamics, cost function, or the form of inequality constraints; and ii) its applicability to MPC-settings. We demonstrate the validity and efficiency of our approach on both simulation and real-world robot experiments. + +
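The Monte Carlo approximation of a chance constraint can be illustrated with a toy: estimate the probability that a candidate trajectory violates a safety constraint from uncertainty samples, and accept the trajectory only if the estimate is below the risk threshold. The obstacle model, sample count, and threshold below are assumptions for illustration, not the CC-VPSTO construction.

# Monte Carlo chance-constraint check for a candidate trajectory.
import numpy as np

def violation_probability(trajectory, obstacle_samples, safety_radius=0.5):
    """trajectory: (T, 2); obstacle_samples: (M, 2) samples of the obstacle position."""
    dists = np.linalg.norm(trajectory[None, :, :] - obstacle_samples[:, None, :], axis=-1)
    violated = dists.min(axis=1) < safety_radius      # per-sample violation indicator
    return violated.mean()

rng = np.random.default_rng(0)
traj = np.stack([np.linspace(0, 5, 50), np.zeros(50)], axis=1)   # straight-line motion
obstacle_samples = rng.normal(loc=[2.5, 1.0], scale=0.3, size=(1000, 2))
risk = violation_probability(traj, obstacle_samples)
print("estimated violation probability:", risk, "accept:", risk <= 0.05)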
+
+ comment: 17 pages, 11 figures, submitted to IEEE Transactions on Robotics +
+
+
+
+
+ + ♻ ☆ MPC-Inspired Reinforcement Learning for Verifiable Model-Free Control + + +
+ In this paper, we introduce a new class of parameterized controllers, drawing inspiration from Model Predictive Control (MPC). The controller resembles a Quadratic Programming (QP) solver of a linear MPC problem, with the parameters of the controller being trained via Deep Reinforcement Learning (DRL) rather than derived from system models. This approach addresses the limitations of common controllers with Multi-Layer Perceptron (MLP) or other general neural network architectures used in DRL, in terms of verifiability and performance guarantees, and the learned controllers possess verifiable properties like persistent feasibility and asymptotic stability akin to MPC. On the other hand, numerical examples illustrate that the proposed controller empirically matches MPC and MLP controllers in terms of control performance and has superior robustness against modeling uncertainty and noises. Furthermore, the proposed controller is significantly more computationally efficient compared to MPC and requires fewer parameters to learn than MLP controllers. Real-world experiments on a vehicle drift maneuvering task demonstrate the potential of these controllers for robotics and other demanding control tasks. + +
+
+
+
+
+ + ♻ ☆ Deadlock Resolution and Recursive Feasibility in MPC-based Multi-robot + Trajectory Generation + + +
+ Online collision-free trajectory generation within a shared workspace is fundamental for most multi-robot applications. However, many widely-used methods based on model predictive control (MPC) lack theoretical guarantees on the feasibility of the underlying optimization. Furthermore, when applied in a distributed manner without a central coordinator, deadlocks often occur where several robots block each other indefinitely. Although heuristic methods such as introducing random perturbations exist, no rigorous analyses are given to validate these measures. Towards this end, we propose a systematic method called infinite-horizon model predictive control with deadlock resolution. The MPC is formulated as a convex optimization over the proposed modified buffered Voronoi cells with a warning band. Based on this formulation, the condition for deadlocks is formally analyzed and proven to be analogous to a force equilibrium. A detection-resolution scheme is proposed, which can effectively detect deadlocks online before they even happen. Once detected, it utilizes an adaptive resolution scheme to resolve deadlocks, under which, under mild conditions, no stable deadlocks can exist. In addition, the proposed planning algorithm ensures recursive feasibility of the underlying optimization at each time step under both input and model constraints, is concurrent for all robots, and requires only local communication. Comprehensive simulation and experiment studies are conducted over large-scale multi-robot systems. Significant improvements in success rate are reported in comparison with other state-of-the-art methods, especially in crowded and high-speed scenarios. + +
+
+ comment: 16 pages, 15 figures +
+
+
+
+
+ + ♻ ☆ Detecting and Mitigating System-Level Anomalies of Vision-Based + Controllers + + +
+ Autonomous systems, such as self-driving cars and drones, have made +significant strides in recent years by leveraging visual inputs and machine +learning for decision-making and control. Despite their impressive performance, +these vision-based controllers can make erroneous predictions when faced with +novel or out-of-distribution inputs. Such errors can cascade to catastrophic +system failures and compromise system safety. In this work, we introduce a +run-time anomaly monitor to detect and mitigate such closed-loop, system-level +failures. Specifically, we leverage a reachability-based framework to +stress-test the vision-based controller offline and mine its system-level +failures. This data is then used to train a classifier that is leveraged online +to flag inputs that might cause system breakdowns. The anomaly detector +highlights issues that transcend individual modules and pertain to the safety +of the overall system. We also design a fallback controller that robustly +handles these detected anomalies to preserve system safety. We validate the +proposed approach on an autonomous aircraft taxiing system that uses a +vision-based controller for taxiing. Our results show the efficacy of the +proposed approach in identifying and handling system-level anomalies, +outperforming methods such as prediction error-based detection, and ensembling, +thereby enhancing the overall safety and robustness of autonomous systems. + +
+
+
+
+
+ + ♻ ☆ EqDrive: Efficient Equivariant Motion Forecasting with Multi-Modality + for Autonomous Driving + + +
+ Forecasting vehicular motions in autonomous driving requires a deep understanding of agent interactions and the preservation of motion equivariance under Euclidean geometric transformations. Traditional models often lack the sophistication needed to handle the intricate dynamics inherent to autonomous vehicles and the interaction relationships among agents in the scene. As a result, these models have a lower model capacity, which then leads to higher prediction errors and lower training efficiency. In our research, we employ EqMotion, a leading equivariant particle and human prediction model that also accounts for invariant agent interactions, for the task of multi-agent vehicle motion forecasting. In addition, we use a multi-modal prediction mechanism to account for multiple possible future paths in a probabilistic manner. By leveraging EqMotion, our model achieves state-of-the-art (SOTA) performance with fewer parameters (1.2 million) and a significantly reduced training time (less than 2 hours). + +
+
+ comment: 6 pages, 7 figures, Accepted 2024 International Conference on + Robotics and Automation +
+
+
+
+
+ + ♻ ☆ Better Monocular 3D Detectors with LiDAR from the Past ICRA 2024 + + +
+ Accurate 3D object detection is crucial to autonomous driving. Though +LiDAR-based detectors have achieved impressive performance, the high cost of +LiDAR sensors precludes their widespread adoption in affordable vehicles. +Camera-based detectors are cheaper alternatives but often suffer inferior +performance compared to their LiDAR-based counterparts due to inherent depth +ambiguities in images. In this work, we seek to improve monocular 3D detectors +by leveraging unlabeled historical LiDAR data. Specifically, at inference time, +we assume that the camera-based detectors have access to multiple unlabeled +LiDAR scans from past traversals at locations of interest (potentially from +other high-end vehicles equipped with LiDAR sensors). Under this setup, we +proposed a novel, simple, and end-to-end trainable framework, termed +AsyncDepth, to effectively extract relevant features from asynchronous LiDAR +traversals of the same location for monocular 3D detectors. We show consistent +and significant performance gain (up to 9 AP) across multiple state-of-the-art +models and datasets with a negligible additional latency of 9.66 ms and a small +storage cost. + +
+
+ comment: Accepted by ICRA 2024. The code can be found at + https://github.com/YurongYou/AsyncDepth +
+
+
+
+
+ + ♻ ☆ JUICER: Data-Efficient Imitation Learning for Robotic Assembly + + +
+ While learning from demonstrations is powerful for acquiring visuomotor +policies, high-performance imitation without large demonstration datasets +remains challenging for tasks requiring precise, long-horizon manipulation. +This paper proposes a pipeline for improving imitation learning performance +with a small human demonstration budget. We apply our approach to assembly +tasks that require precisely grasping, reorienting, and inserting multiple +parts over long horizons and multiple task phases. Our pipeline combines +expressive policy architectures and various techniques for dataset expansion +and simulation-based data augmentation. These help expand dataset support and +supervise the model with locally corrective actions near bottleneck regions +requiring high precision. We demonstrate our pipeline on four furniture +assembly tasks in simulation, enabling a manipulator to assemble up to five +parts over nearly 2500 time steps directly from RGB images, outperforming +imitation and data augmentation baselines. Project website: +https://imitation-juicer.github.io/. + +
+
+ comment: Project website: https://imitation-juicer.github.io/ +
+
+
+
+
+ + ♻ ☆ Primal-Dual iLQR + + +
+ We introduce a new algorithm for solving unconstrained discrete-time optimal control problems. Our method follows a direct multiple shooting approach, and consists of applying the SQP method together with an $\ell_2$ augmented Lagrangian primal-dual merit function. We use the LQR algorithm to efficiently solve the primal-dual Newton-KKT system. As our algorithm is a specialization of NPSQP, it inherits its generic properties, including global convergence, fast local convergence, and the lack of need for second order corrections or dimension expansions, improving on existing direct multiple shooting approaches such as acados, ALTRO, GNMS, FATROP, and FDDP. As our algorithm avoids sequential rollouts of the nonlinear dynamics, it can be combined with (Särkkä and García-Fernández, 2023) to run in $O(\log(N))$ parallel time per iteration (where $N$ is the number of stages), as well as $O(1)$ parallel time per line search iteration. Therefore, this paper provides a practical, theoretically sound, and highly parallelizable (for example, with a GPU) method for solving nonlinear discrete-time optimal control problems. + +
+
+ comment: 8 pages, 1 figure, 1 table +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 170 + +
+
+
+ + ☆ InternLM-XComposer2-4KHD: A Pioneering Large Vision-Language Model + Handling Resolutions from 336 Pixels to 4K HD + + +
+ The Large Vision-Language Model (LVLM) field has seen significant advancements, yet its progression has been hindered by challenges in comprehending fine-grained visual content due to limited resolution. Recent efforts have aimed to enhance the high-resolution understanding capabilities of LVLMs, yet they remain capped at approximately 1500 x 1500 pixels and constrained to a relatively narrow resolution range. This paper presents InternLM-XComposer2-4KHD, a groundbreaking exploration into elevating LVLM resolution capabilities up to 4K HD (3840 x 1600) and beyond. Concurrently, considering that ultra-high resolution may not be necessary in all scenarios, it supports a wide range of diverse resolutions from 336 pixels to 4K standard, significantly broadening its scope of applicability. Specifically, this research advances the patch division paradigm by introducing a novel extension: dynamic resolution with automatic patch configuration. It maintains the training image aspect ratios while automatically varying patch counts and configuring layouts based on a pre-trained Vision Transformer (ViT) (336 x 336), leading to dynamic training resolution from 336 pixels to 4K standard. Our research demonstrates that scaling training resolution up to 4K HD leads to consistent performance enhancements without hitting the ceiling of potential improvements. InternLM-XComposer2-4KHD shows superb capability that matches or even surpasses GPT-4V and Gemini Pro in 10 of the 16 benchmarks. The InternLM-XComposer2-4KHD model series with 7B parameters are publicly available at https://github.com/InternLM/InternLM-XComposer. + +
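A crude sketch of a dynamic patch-configuration rule in the spirit described above: given an input image, choose how many 336 x 336 patches to lay out in columns and rows so that the aspect ratio is roughly preserved and a patch budget is respected. The budget, rounding, and shrinking heuristic are assumptions for illustration, not the model's actual rule.

# Heuristic dynamic patch layout sketch.
from math import ceil

def patch_layout(width, height, patch=336, max_patches=25):
    cols, rows = max(1, round(width / patch)), max(1, round(height / patch))
    # Scale both dimensions down proportionally if over the patch budget.
    while cols * rows > max_patches:
        scale = (max_patches / (cols * rows)) ** 0.5
        cols, rows = max(1, int(cols * scale)), max(1, int(rows * scale))
    return cols, rows, (cols * patch, rows * patch)   # resize target before patching

for size in [(336, 336), (1920, 1080), (3840, 1600)]:
    print(size, "->", patch_layout(*size))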
+
+ comment: Code and models are publicly available at + https://github.com/InternLM/InternLM-XComposer +
+
+
+
+
+ + ☆ MoReVQA: Exploring Modular Reasoning Models for Video Question Answering CVPR 2024 + + +
+ This paper addresses the task of video question answering (videoQA) via a +decomposed multi-stage, modular reasoning framework. Previous modular methods +have shown promise with a single planning stage ungrounded in visual content. +However, through a simple and effective baseline, we find that such systems can +lead to brittle behavior in practice for challenging videoQA settings. Thus, +unlike traditional single-stage planning methods, we propose a multi-stage +system consisting of an event parser, a grounding stage, and a final reasoning +stage in conjunction with an external memory. All stages are training-free, and +performed using few-shot prompting of large models, creating interpretable +intermediate outputs at each stage. By decomposing the underlying planning and +task complexity, our method, MoReVQA, improves over prior work on standard +videoQA benchmarks (NExT-QA, iVQA, EgoSchema, ActivityNet-QA) with +state-of-the-art results, and extensions to related tasks (grounded videoQA, +paragraph captioning). + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Can Feedback Enhance Semantic Grounding in Large Vision-Language Models? + + +
+ Enhancing semantic grounding abilities in Vision-Language Models (VLMs) often +involves collecting domain-specific training data, refining the network +architectures, or modifying the training recipes. In this work, we venture into +an orthogonal direction and explore whether VLMs can improve their semantic +grounding by "receiving" feedback, without requiring in-domain data, +fine-tuning, or modifications to the network architectures. We systematically +analyze this hypothesis using a feedback mechanism composed of a binary signal. +We find that if prompted appropriately, VLMs can utilize feedback both in a +single step and iteratively, showcasing the potential of feedback as an +alternative technique to improve grounding in internet-scale VLMs. Furthermore, +VLMs, like LLMs, struggle to self-correct errors out-of-the-box. However, we +find that this issue can be mitigated via a binary verification mechanism. +Finally, we explore the potential and limitations of amalgamating these +findings and applying them iteratively to automatically enhance VLMs' grounding +performance, showing grounding accuracy consistently improves using automated +feedback across all models in all settings investigated. Overall, our iterative +framework improves semantic grounding in VLMs by more than 15 accuracy points +under noise-free feedback and up to 5 accuracy points under a simple automated +binary verification mechanism. The project website is hosted at +https://andrewliao11.github.io/vlms_feedback + +
+
+ comment: 31 pages, 15 figures +
+
+
+
+
+ + ☆ Reconstructing Hand-Held Objects in 3D + + +
+ Objects manipulated by the hand (i.e., manipulanda) are particularly +challenging to reconstruct from in-the-wild RGB images or videos. Not only does +the hand occlude much of the object, but also the object is often only visible +in a small number of image pixels. At the same time, two strong anchors emerge +in this setting: (1) estimated 3D hands help disambiguate the location and +scale of the object, and (2) the set of manipulanda is small relative to all +possible objects. With these insights in mind, we present a scalable paradigm +for handheld object reconstruction that builds on recent breakthroughs in large +language/vision models and 3D object datasets. Our model, MCC-Hand-Object +(MCC-HO), jointly reconstructs hand and object geometry given a single RGB +image and inferred 3D hand as inputs. Subsequently, we use GPT-4(V) to retrieve +a 3D object model that matches the object in the image and rigidly align the +model to the network-inferred geometry; we call this alignment +Retrieval-Augmented Reconstruction (RAR). Experiments demonstrate that MCC-HO +achieves state-of-the-art performance on lab and Internet datasets, and we show +how RAR can be used to automatically obtain 3D labels for in-the-wild images of +hand-object interactions. + +
+
+
+
+
+ + ☆ Flying With Photons: Rendering Novel Views of Propagating Light + + +
+ We present an imaging and neural rendering technique that seeks to synthesize +videos of light propagating through a scene from novel, moving camera +viewpoints. Our approach relies on a new ultrafast imaging setup to capture a +first-of-its kind, multi-viewpoint video dataset with picosecond-level temporal +resolution. Combined with this dataset, we introduce an efficient neural volume +rendering framework based on the transient field. This field is defined as a +mapping from a 3D point and 2D direction to a high-dimensional, discrete-time +signal that represents time-varying radiance at ultrafast timescales. Rendering +with transient fields naturally accounts for effects due to the finite speed of +light, including viewpoint-dependent appearance changes caused by light +propagation delays to the camera. We render a range of complex effects, +including scattering, specular reflection, refraction, and diffraction. +Additionally, we demonstrate removing viewpoint-dependent propagation delays +using a time warping procedure, rendering of relativistic effects, and video +synthesis of direct and global components of light transport. + +
+
+ comment: Project page: https://anaghmalik.com/FlyingWithPhotons/ +
+
+
+
+
+ + ☆ RhythmMamba: Fast Remote Physiological Measurement with Arbitrary Length + Videos + + +
+ Remote photoplethysmography (rPPG) is a non-contact method for detecting +physiological signals from facial videos, holding great potential in various +applications such as healthcare, affective computing, and anti-spoofing. +Existing deep learning methods struggle to address two core issues of rPPG +simultaneously: extracting weak rPPG signals from video segments with large +spatiotemporal redundancy and understanding the periodic patterns of rPPG among +long contexts. This represents a trade-off between computational complexity and +the ability to capture long-range dependencies, posing a challenge for rPPG +that is suitable for deployment on mobile devices. Based on the in-depth +exploration of Mamba's comprehension of spatial and temporal information, this +paper introduces RhythmMamba, an end-to-end Mamba-based method that employs +multi-temporal Mamba to constrain both periodic patterns and short-term trends, +coupled with frequency domain feed-forward to enable Mamba to robustly +understand the quasi-periodic patterns of rPPG. Extensive experiments show that +RhythmMamba achieves state-of-the-art performance with reduced parameters and +lower computational complexity. The proposed RhythmMamba can be applied to +video segments of any length without performance degradation. The codes are +available at https://github.com/zizheng-guo/RhythmMamba. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2402.12788 +
+
+
+
+
+ + ☆ Text-Based Reasoning About Vector Graphics + + +
+ While large multimodal models excel in broad vision-language benchmarks, they +often struggle with tasks requiring precise perception of low-level visual +details, such as comparing line lengths or solving simple mazes. In particular, +this failure mode persists in question-answering tasks about vector graphics -- +images composed purely of 2D objects and shapes. To address this challenge, we +propose the Visually Descriptive Language Model (VDLM), which performs +text-based reasoning about vector graphics. VDLM leverages Scalable Vector +Graphics (SVG) for a more precise visual description and first uses an +off-the-shelf raster-to-SVG algorithm for encoding. Since existing language +models cannot understand raw SVGs in a zero-shot setting, VDLM then bridges SVG +with pretrained language models through a newly introduced intermediate +symbolic representation, Primal Visual Description (PVD), comprising primitive +attributes (e.g., shape, position, measurement) with their corresponding +predicted values. PVD is task-agnostic and represents visual primitives that +are universal across all vector graphics. It can be learned with procedurally +generated (SVG, PVD) pairs and also enables the direct use of LLMs for +generalization to complex reasoning tasks. By casting an image to a text-based +representation, we can leverage the power of language models to learn alignment +from SVG to visual primitives and generalize to unseen question-answering +tasks. Empirical results show that VDLM achieves stronger zero-shot performance +compared to state-of-the-art LMMs, such as GPT-4V, in various low-level +multimodal perception and reasoning tasks on vector graphics. We additionally +present extensive analyses on VDLM's performance, demonstrating that our +framework offers better interpretability due to its disentangled perception and +reasoning processes. Project page: https://mikewangwzhl.github.io/VDLM/ + +
+
+ comment: Project page: https://mikewangwzhl.github.io/VDLM/ +
+
+
+
+
+ + ☆ Learning State-Invariant Representations of Objects from Image + Collections with State, Pose, and Viewpoint Changes + + +
+ We add one more invariance - state invariance - to the more commonly used +other invariances for learning object representations for recognition and +retrieval. By state invariance, we mean robust with respect to changes in the +structural form of the object, such as when an umbrella is folded, or when an +item of clothing is tossed on the floor. Since humans generally have no +difficulty in recognizing objects despite such state changes, we are naturally +faced with the question of whether it is possible to devise a neural +architecture with similar abilities. To that end, we present a novel dataset, +ObjectsWithStateChange, that captures state and pose variations in the object +images recorded from arbitrary viewpoints. We believe that this dataset will +facilitate research in fine-grained object recognition and retrieval of objects +that are capable of state changes. The goal of such research would be to train +models capable of generating object embeddings that remain invariant to state +changes while also staying invariant to transformations induced by changes in +viewpoint, pose, illumination, etc. To demonstrate the usefulness of the +ObjectsWithStateChange dataset, we also propose a curriculum learning strategy +that uses the similarity relationships in the learned embedding space after +each epoch to guide the training process. The model learns discriminative +features by comparing visually similar objects within and across different +categories, encouraging it to differentiate between objects that may be +challenging to distinguish due to changes in their state. We believe that this +strategy enhances the model's ability to capture discriminative features for +fine-grained tasks that may involve objects with state changes, leading to +performance improvements on object-level tasks not only on our new dataset, but +also on two other challenging multi-view datasets such as ModelNet40 and +ObjectPI. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ A comparative analysis of deep learning models for lung segmentation on + X-ray images + + +
+ Robust and highly accurate lung segmentation in X-rays is crucial in medical +imaging. This study evaluates deep learning solutions for this task, ranking +existing methods and analyzing their performance under diverse image +modifications. Out of 61 analyzed papers, only nine offered implementation or +pre-trained models, enabling assessment of three prominent methods: Lung VAE, +TransResUNet, and CE-Net. The analysis revealed that CE-Net performs best, +demonstrating the highest values in dice similarity coefficient and +intersection over union metric. + +
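The ranking above is based on the Dice similarity coefficient and the intersection-over-union metric; a minimal sketch of both for binary lung masks follows, assuming NumPy arrays of 0/1 values (the mask shapes below are placeholders, not data from the study).

# Dice coefficient and IoU for binary segmentation masks.
import numpy as np

def dice(pred, target, eps=1e-7):
    inter = np.logical_and(pred, target).sum()
    return (2.0 * inter + eps) / (pred.sum() + target.sum() + eps)

def iou(pred, target, eps=1e-7):
    inter = np.logical_and(pred, target).sum()
    union = np.logical_or(pred, target).sum()
    return (inter + eps) / (union + eps)

pred = np.zeros((256, 256), dtype=np.uint8); pred[50:200, 40:120] = 1
target = np.zeros((256, 256), dtype=np.uint8); target[60:210, 45:125] = 1
print("Dice:", round(float(dice(pred, target)), 3), "IoU:", round(float(iou(pred, target)), 3))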
+
+ comment: published at the Polish Conference on Artificial Intelligence + (PP-RAI), 2024 +
+
+
+
+
+ + ☆ PURE: Turning Polysemantic Neurons Into Pure Features by Identifying + Relevant Circuits + + +
+ The field of mechanistic interpretability aims to study the role of +individual neurons in Deep Neural Networks. Single neurons, however, have the +capability to act polysemantically and encode for multiple (unrelated) +features, which renders their interpretation difficult. We present a method for +disentangling polysemanticity of any Deep Neural Network by decomposing a +polysemantic neuron into multiple monosemantic "virtual" neurons. This is +achieved by identifying the relevant sub-graph ("circuit") for each "pure" +feature. We demonstrate how our approach allows us to find and disentangle +various polysemantic units of ResNet models trained on ImageNet. While +evaluating feature visualizations using CLIP, our method effectively +disentangles representations, improving upon methods based on neuron +activations. Our code is available at https://github.com/maxdreyer/PURE. + +
+
+ comment: 14 pages (4 pages manuscript, 2 pages references, 8 pages appendix) +
+
+
+
+
+ + SmartControl: Enhancing ControlNet for Handling Rough Visual Conditions + + +
+ Human visual imagination usually begins with analogies or rough sketches. For example, given an image of a girl playing guitar in front of a building, one may analogously imagine what it would look like if Iron Man were playing guitar in front of a pyramid in Egypt. Nonetheless, the visual condition may not be precisely aligned with the imagined result indicated by the text prompt, and existing layout-controllable text-to-image (T2I) generation models are prone to producing degraded results with obvious artifacts. To address this issue, we present a novel T2I generation method dubbed SmartControl, which is designed to modify the rough visual conditions for adapting to the text prompt. The key idea of our SmartControl is to relax the visual condition on the areas that conflict with the text prompts. Specifically, a Control Scale Predictor (CSP) is designed to identify the conflict regions and predict the local control scales, while a dataset with text prompts and rough visual conditions is constructed for training CSP. It is worth noting that, even with a limited number (e.g., 1,000~2,000) of training samples, our SmartControl can generalize well to unseen objects. Extensive experiments on four typical visual condition types clearly show the efficacy of our SmartControl against state-of-the-art methods. Source code, pre-trained models, and datasets are available at https://github.com/liuxiaoyu1104/SmartControl. + +
+
+
+
+
+ + ☆ The Central Spanning Tree Problem + + +
+ Spanning trees are an important primitive in many data analysis tasks, when a +data set needs to be summarized in terms of its "skeleton", or when a +tree-shaped graph over all observations is required for downstream processing. +Popular definitions of spanning trees include the minimum spanning tree and the +optimum distance spanning tree, a.k.a. the minimum routing cost tree. When +searching for the shortest spanning tree but admitting additional branching +points, even shorter spanning trees can be realized: Steiner trees. +Unfortunately, both minimum spanning and Steiner trees are not robust with +respect to noise in the observations; that is, small perturbations of the +original data set often lead to drastic changes in the associated spanning +trees. In response, we make two contributions when the data lies in a Euclidean +space: on the theoretical side, we introduce a new optimization problem, the +"(branched) central spanning tree", which subsumes all previously mentioned +definitions as special cases. On the practical side, we show empirically that +the (branched) central spanning tree is more robust to noise in the data, and +as such is better suited to summarize a data set in terms of its skeleton. We +also propose a heuristic to address the NP-hard optimization problem, and +illustrate its use on single cell RNA expression data from biology and 3D point +clouds of plants. + +
+
+
+
+
+ + ☆ Multi-scale Dynamic and Hierarchical Relationship Modeling for Facial + Action Units Recognition CVPR2024 + + +
+ Human facial action units (AUs) are mutually related in a hierarchical manner: +not only are they associated with each other in both the spatial and temporal +domains, but AUs located in the same or nearby facial regions also show stronger +relationships than those in different facial regions. Since no existing approach +thoroughly models such hierarchical inter-dependencies among AUs, this paper +proposes to comprehensively model multi-scale AU-related dynamics and hierarchical +spatio-temporal relationships among AUs for AU occurrence recognition. Specifically, +we first propose a novel multi-scale temporal differencing network with an adaptive +weighting block to explicitly capture facial dynamics across frames at different +spatial scales, which specifically accounts for the heterogeneity in range and +magnitude of different AUs' activation. Then, a two-stage strategy is introduced to +hierarchically model the relationships among AUs based on their spatial distribution +(i.e., local and cross-region AU relationship modelling). Experimental results on +BP4D and DISFA show that our approach sets a new state of the art in AU occurrence +recognition. Our code is publicly available at https://github.com/CVI-SZU/MDHR. + +
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ☆ QueSTMaps: Queryable Semantic Topological Maps for 3D Scene + Understanding + + +
+ Understanding the structural organisation of 3D indoor scenes in terms of +rooms is often accomplished via floorplan extraction. Robotic tasks such as +planning and navigation require a semantic understanding of the scene as well. +This is typically achieved via object-level semantic segmentation. However, +such methods struggle to segment out topological regions like "kitchen" in the +scene. In this work, we introduce a two-step pipeline. First, we extract a +topological map, i.e., floorplan of the indoor scene using a novel +multi-channel occupancy representation. Then, we generate CLIP-aligned features +and semantic labels for every room instance based on the objects it contains +using a self-attention transformer. Our language-topology alignment supports +natural language querying, e.g., a "place to cook" locates the "kitchen". We +outperform the current state-of-the-art on room segmentation by ~20% and room +classification by ~12%. Our detailed qualitative analysis and ablation studies +provide insights into the problem of joint structural and semantic 3D scene +understanding. + +
+
+
+
+
+ + ☆ Seasonal Fire Prediction using Spatio-Temporal Deep Neural Networks + + +
+ With climate change expected to exacerbate fire weather conditions, the +accurate anticipation of wildfires on a global scale becomes increasingly +crucial for disaster mitigation. In this study, we utilize SeasFire, a +comprehensive global wildfire dataset with climate, vegetation, oceanic +indices, and human-related variables, to enable seasonal wildfire forecasting +with machine learning. For the predictive analysis, we train deep learning +models with different architectures that capture the spatio-temporal context +leading to wildfires. Our investigation focuses on assessing the effectiveness +of these models in predicting the presence of burned areas at varying +forecasting time horizons globally, extending up to six months into the future, +and on how different spatial or/and temporal context affects the performance of +the models. Our findings demonstrate the great potential of deep learning +models in seasonal fire forecasting; longer input time-series leads to more +robust predictions across varying forecasting horizons, while integrating +spatial information to capture wildfire spatio-temporal dynamics boosts +performance. Finally, our results hint that in order to enhance performance at +longer forecasting horizons, a larger receptive field spatially needs to be +considered. + +
+
+
+
+
+ + ☆ pfl-research: simulation framework for accelerating research in Private + Federated Learning + + +
+ Federated learning (FL) is an emerging machine learning (ML) training +paradigm where clients own their data and collaborate to train a global model, +without revealing any data to the server and other participants. Researchers +commonly perform experiments in a simulation environment to quickly iterate on +ideas. However, existing open-source tools do not offer the efficiency required +to simulate FL on larger and more realistic FL datasets. We introduce +pfl-research, a fast, modular, and easy-to-use Python framework for simulating +FL. It supports TensorFlow, PyTorch, and non-neural network models, and is +tightly integrated with state-of-the-art privacy algorithms. We study the speed +of open-source FL frameworks and show that pfl-research is 7-72$\times$ faster +than alternative open-source frameworks on common cross-device setups. Such +speedup will significantly boost the productivity of the FL research community +and enable testing hypotheses on realistic FL datasets that were previously too +resource intensive. We release a suite of benchmarks that evaluates an +algorithm's overall performance on a diverse set of realistic scenarios. The +code is available on GitHub at https://github.com/apple/pfl-research. + +
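To illustrate the kind of cross-device loop such simulators accelerate, here is a minimal, framework-agnostic FedAvg simulation in NumPy; it deliberately does not use the pfl-research API, whose interfaces are documented in the linked repository.

```python
# Not the pfl-research API: a minimal FedAvg simulation on a linear-regression
# task, illustrating the round structure that FL simulators speed up.
import numpy as np

rng = np.random.default_rng(0)
true_w = np.array([2.0, -1.0])
clients = [rng.normal(size=(32, 2)) for _ in range(10)]
clients = [(x, x @ true_w + 0.1 * rng.normal(size=32)) for x in clients]

def local_sgd(w, x, y, lr=0.05, steps=5):
    # A few local gradient steps on one client's data.
    for _ in range(steps):
        grad = 2 * x.T @ (x @ w - y) / len(y)
        w = w - lr * grad
    return w

w_global = np.zeros(2)
for rnd in range(20):
    # Each round: a sampled cohort trains locally, the server averages the results.
    cohort = rng.choice(len(clients), size=5, replace=False)
    updates = [local_sgd(w_global.copy(), *clients[i]) for i in cohort]
    w_global = np.mean(updates, axis=0)

print("estimated weights:", w_global)  # should approach [2, -1]
```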
+
+
+
+
+ + ☆ Magic-Boost: Boost 3D Generation with Multi-View Conditioned Diffusion + + +
+ Benefiting from the rapid development of 2D diffusion models, 3D content +creation has made significant progress recently. One promising solution involves +the fine-tuning of pre-trained 2D diffusion models to harness their capacity for +producing multi-view images, which are then lifted into accurate 3D models via +methods like fast-NeRFs or large reconstruction models. However, because multi-view +inconsistency remains and the generated resolution is limited, the results of such +methods still lack intricate textures and complex geometries. To solve this problem, +we propose Magic-Boost, a multi-view conditioned diffusion model that significantly +refines coarse generative results through a brief period of SDS optimization +($\sim15$min). Compared to previous text- or single-image-based diffusion models, +Magic-Boost exhibits a robust capability to generate images with high consistency +from pseudo-synthesized multi-view images. It provides precise SDS guidance that +aligns well with the identity of the input images, enriching the local detail in +both the geometry and texture of the initial generative results. Extensive +experiments show that Magic-Boost greatly enhances the coarse inputs and generates +high-quality 3D assets with rich geometric and textural details. (Project Page: +https://magic-research.github.io/magic-boost/) + +
+
+
+
+
+ + ☆ ZeST: Zero-Shot Material Transfer from a Single Image + + +
+ We propose ZeST, a method for zero-shot material transfer to an object in the +input image given a material exemplar image. ZeST leverages existing diffusion +adapters to extract an implicit material representation from the exemplar image. +This representation is used to transfer the material to the object in the input +image via a pre-trained inpainting diffusion model, using depth estimates as a +geometry cue and grayscale object shading as an illumination cue. The method works +on real images without any training, resulting in a zero-shot approach. Both +qualitative and quantitative results on real and synthetic datasets demonstrate +that ZeST outputs photorealistic images with transferred materials. We also show +the application of ZeST to perform multiple edits and robust material assignment +under different illuminations. Project Page: https://ttchengab.github.io/zest + +
+
+ comment: Project Page: https://ttchengab.github.io/zest +
+
+
+
+
+ + ☆ Emergent Dynamics in Neural Cellular Automata + + +
+ Neural Cellular Automata (NCA) models are trainable variations of traditional +Cellular Automata (CA). Emergent motion in the patterns created by NCA has been +successfully applied to synthesize dynamic textures. However, the conditions +required for an NCA to display dynamic patterns remain unexplored. Here, we +investigate the relationship between the NCA architecture and the emergent +dynamics of the trained models. Specifically, we vary the number of channels in +the cell state and the number of hidden neurons in the MultiLayer Perceptron +(MLP), and draw a relationship between the combination of these two variables +and the motion strength between successive frames. Our analysis reveals that +the disparity and proportionality between these two variables have a strong +correlation with the emergent dynamics in the NCA output. We thus propose a +design principle for creating dynamic NCA. + +
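A minimal NCA update step is sketched below, exposing the two architectural knobs the abstract varies (the number of cell-state channels and the hidden MLP width). The training loop and texture losses are omitted, and the identity/Sobel perception kernels are the common NCA choice rather than anything specific to this paper.

```python
# Minimal Neural Cellular Automata update step; `channels` and `hidden` are the
# two quantities the study sweeps when measuring emergent motion.
import torch
import torch.nn as nn
import torch.nn.functional as F

class NCA(nn.Module):
    def __init__(self, channels=12, hidden=96):
        super().__init__()
        sobel_x = torch.tensor([[-1., 0., 1.], [-2., 0., 2.], [-1., 0., 1.]]) / 8
        ident = torch.zeros(3, 3)
        ident[1, 1] = 1.0
        kernels = torch.stack([ident, sobel_x, sobel_x.t()])               # (3, 3, 3)
        # Depthwise perception: identity + Sobel filters applied per channel.
        self.register_buffer("filters", kernels.repeat(channels, 1, 1).unsqueeze(1))
        self.channels = channels
        self.mlp = nn.Sequential(
            nn.Conv2d(3 * channels, hidden, 1), nn.ReLU(),
            nn.Conv2d(hidden, channels, 1),
        )

    def forward(self, state):
        # state: (B, C, H, W); perceive neighbours, then apply the learned update.
        perception = F.conv2d(state, self.filters, padding=1, groups=self.channels)
        return state + self.mlp(perception)

state = torch.rand(1, 12, 64, 64)
nca = NCA()
for _ in range(32):   # motion between successive states is what the paper measures
    state = nca(state)
```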
+
+ comment: 2 pages +
+
+
+
+
+ + ☆ Raster Forge: Interactive Raster Manipulation Library and GUI for Python + + +
+ Raster Forge is a Python library and graphical user interface for raster data +manipulation and analysis. The tool is focused on remote sensing applications, +particularly in wildfire management. It allows users to import, visualize, and +process raster layers for tasks such as image compositing or topographical +analysis. For wildfire management, it generates fuel maps using predefined +models. Its impact extends from disaster management to hydrological modeling, +agriculture, and environmental monitoring. Raster Forge can be a valuable asset +for geoscientists and researchers who rely on raster data analysis, enhancing +geospatial data processing and visualization across various disciplines. + +
+
+
+
+
+ + ☆ VISION2UI: A Real-World Dataset with Layout for Code Generation from UI + Designs + + +
+ Automatically generating UI code from webpage design visions can +significantly alleviate the burden of developers, enabling beginner developers +or designers to directly generate Web pages from design diagrams. Currently, +prior research has accomplished the objective of generating UI code from +rudimentary design visions or sketches through designing deep neural networks. +Inspired by the groundbreaking advancements achieved by Multimodal Large +Language Models (MLLMs), the automatic generation of UI code from high-fidelity +design images is now emerging as a viable possibility. Nevertheless, our +investigation reveals that existing MLLMs are hampered by the scarcity of +authentic, high-quality, and large-scale datasets, leading to unsatisfactory +performance in automated UI code generation. To mitigate this gap, we present a +novel dataset, termed VISION2UI, extracted from real-world scenarios, augmented +with comprehensive layout information, tailored specifically for finetuning +MLLMs in UI code generation. Specifically, this dataset is derived through a +series of operations, encompassing collecting, cleaning, and filtering of the +open-source Common Crawl dataset. In order to uphold its quality, a neural +scorer trained on labeled samples is utilized to refine the data, retaining +higher-quality instances. Ultimately, this process yields a dataset comprising +2,000 (Much more is coming soon) parallel samples encompassing design visions +and UI code. The dataset is available at +https://huggingface.co/datasets/xcodemind/vision2ui. + +
+
+
+
+
+ + ☆ Dynamic Resolution Guidance for Facial Expression Recognition + + +
+ Facial expression recognition (FER) is vital for human-computer interaction +and emotion analysis, yet recognizing expressions in low-resolution images +remains challenging. This paper introduces a practical method called Dynamic +Resolution Guidance for Facial Expression Recognition (DRGFER) to effectively +recognize facial expressions in images with varying resolutions without +compromising FER model accuracy. Our framework comprises two main components: +the Resolution Recognition Network (RRN) and the Multi-Resolution Adaptation +Facial Expression Recognition Network (MRAFER). The RRN determines image +resolution, outputs a binary vector, and the MRAFER assigns images to suitable +facial expression recognition networks based on resolution. We evaluated DRGFER +on widely-used datasets RAFDB and FERPlus, demonstrating that our method +retains optimal model performance at each resolution and outperforms +alternative resolution approaches. The proposed framework exhibits robustness +against resolution variations and facial expressions, offering a promising +solution for real-world applications. + +
+
+
+
+
+ + ☆ Test-Time Adaptation with SaLIP: A Cascade of SAM and CLIP for Zero shot + Medical Image Segmentation + + +
+ The Segment Anything Model (SAM) and CLIP are remarkable vision foundation +models (VFMs). SAM, a prompt driven segmentation model, excels in segmentation +tasks across diverse domains, while CLIP is renowned for its zero shot +recognition capabilities. However, their unified potential has not yet been +explored in medical image segmentation. To adapt SAM to medical imaging, +existing methods primarily rely on tuning strategies that require extensive +data or prior prompts tailored to the specific task, making it particularly +challenging when only a limited number of data samples are available. This work +presents an in depth exploration of integrating SAM and CLIP into a unified +framework for medical image segmentation. Specifically, we propose a simple +unified framework, SaLIP, for organ segmentation. Initially, SAM is used for +part based segmentation within the image, followed by CLIP to retrieve the mask +corresponding to the region of interest (ROI) from the pool of SAM generated +masks. Finally, SAM is prompted by the retrieved ROI to segment a specific +organ. Thus, SaLIP is training and fine tuning free and does not rely on domain +expertise or labeled data for prompt engineering. Our method shows substantial +enhancements in zero shot segmentation, showcasing notable improvements in DICE +scores across diverse segmentation tasks like brain (63.46%), lung (50.11%), +and fetal head (30.82%), when compared to un prompted SAM. Code and text +prompts will be available online. + +
+
+
+
+
+ + ☆ High Noise Scheduling is a Must + + +
+ Consistency models possess strong capabilities for image generation, reducing +sampling to a single step through their advanced techniques. Recent advancements +take consistency training a step further and eliminate the limitation of +distillation-based training. However, even though the curriculum and noise +scheduling proposed in improved training techniques yield better results than basic +consistency models, they lack a well-balanced noise distribution and consistency +across the curriculum. In this study, we investigate the balance between high and +low noise levels in the noise distribution and propose a polynomial noise +distribution to maintain stability. The proposed polynomial noise distribution is +further supported with predefined Karras noise levels to prevent the unique noise +levels that arise from the Karras noise generation algorithm. Furthermore, +eliminating already-learned noisy steps with a curriculum based on a sinusoidal +function increases the denoising performance of the model. To make a fair comparison +with the latest consistency model training techniques, experiments are conducted +with the same hyper-parameters except for the curriculum and noise distribution. The +models used in the experiments have low depth to demonstrate the robustness of our +proposed technique. The results show that the polynomial noise distribution +outperforms the model trained with a log-normal noise distribution, yielding a 33.54 +FID score after 100,000 training steps with constant discretization steps. +Additionally, the implementation of a sinusoidal-based curriculum enhances denoising +performance, resulting in a FID score of 30.48. + +
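The sketch below illustrates, under stated assumptions, how a polynomial weighting over predefined Karras noise levels and a sinusoidal curriculum over the number of discretization steps could be combined; the exact functional forms are defined in the paper, and the ones here are placeholders.

```python
# Illustrative sketch only: sample training noise indices from an assumed
# polynomial distribution over predefined Karras sigma levels, with an assumed
# sinusoidal curriculum over training steps.
import numpy as np

def karras_sigmas(n, sigma_min=0.002, sigma_max=80.0, rho=7.0):
    # Standard Karras et al. (2022) noise levels.
    ramp = np.linspace(0, 1, n)
    return (sigma_max ** (1 / rho)
            + ramp * (sigma_min ** (1 / rho) - sigma_max ** (1 / rho))) ** rho

def polynomial_index_probs(n, power=2.0):
    # Assumed polynomial weighting over the n discrete levels.
    w = (np.arange(1, n + 1) / n) ** power
    return w / w.sum()

def sinusoidal_num_levels(step, total_steps, n_min=10, n_max=160):
    # Assumed curriculum: the number of active discretization levels follows a sinusoidal ramp.
    frac = 0.5 * (1 - np.cos(np.pi * step / total_steps))
    return int(n_min + frac * (n_max - n_min))

rng = np.random.default_rng(0)
for step in (0, 50_000, 100_000):
    n = sinusoidal_num_levels(step, 100_000)
    sigmas = karras_sigmas(n)
    idx = rng.choice(n, p=polynomial_index_probs(n))
    print(f"step {step}: {n} levels, sampled sigma {sigmas[idx]:.3f}")
```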
+
+
+
+
+ + ☆ DaF-BEVSeg: Distortion-aware Fisheye Camera based Bird's Eye View + Segmentation with Occlusion Reasoning + + +
+ Semantic segmentation is an effective way to perform scene understanding. +Recently, segmentation in 3D Bird's Eye View (BEV) space has become popular, as it +is directly used by the driving policy. However, there is limited work on BEV +segmentation for surround-view fisheye cameras, which are commonly used in +commercial vehicles. As this task has no real-world public dataset and existing +synthetic datasets do not handle amodal regions due to occlusion, we create a +synthetic dataset using the Cognata simulator comprising diverse road types, +weather, and lighting conditions. We generalize the BEV segmentation to work with +any camera model; this is useful for mixing diverse cameras. We implement a baseline +by applying cylindrical rectification on the fisheye images and using a standard +LSS-based BEV segmentation model. We demonstrate that we can achieve better +performance without undistortion, which has the adverse effects of increased runtime +due to pre-processing, reduced field-of-view, and resampling artifacts. Further, we +introduce a distortion-aware learnable BEV pooling strategy that is more effective +for fisheye cameras. We extend the model with an occlusion reasoning module, which +is critical for estimation in BEV space. Qualitative performance of DaF-BEVSeg is +showcased in the video at https://streamable.com/ge4v51. + +
+
+
+
+
+ + ☆ HPNet: Dynamic Trajectory Forecasting with Historical Prediction + Attention CVPR2024 + + +
+ Predicting the trajectories of road agents is essential for autonomous driving +systems. The recent mainstream methods follow a static paradigm, which predicts the +future trajectory by using a fixed duration of historical frames. These methods make +predictions independently even at adjacent time steps, which leads to potential +instability and temporal inconsistency. As successive time steps have largely +overlapping historical frames, their forecasts should have an intrinsic correlation: +overlapping predicted trajectories should be consistent, or be different but share +the same motion goal depending on the road situation. Motivated by this, in this +work, we introduce HPNet, a novel dynamic trajectory forecasting method. Aiming for +stable and accurate trajectory forecasting, our method leverages not only historical +frames including maps and agent states, but also historical predictions. +Specifically, we newly design a Historical Prediction Attention module to +automatically encode the dynamic relationship between successive predictions. +Besides, it also extends the attention range beyond the currently visible window, +benefiting from the use of historical predictions. The proposed Historical +Prediction Attention, together with the Agent Attention and Mode Attention, is +further formulated as the Triple Factorized Attention module, serving as the core +design of HPNet. Experiments on the Argoverse and INTERACTION datasets show that +HPNet achieves state-of-the-art performance and generates accurate and stable future +trajectories. Our code is available at https://github.com/XiaolongTang23/HPNet. + +
+
+ comment: accepted by CVPR2024 +
+
+
+
+
+ + ☆ Rolling Shutter Correction with Intermediate Distortion Flow Estimation CVPR2024 + + +
+ This paper proposes to correct the rolling shutter (RS) distorted images by +estimating the distortion flow from the global shutter (GS) to RS directly. +Existing methods usually perform correction using the undistortion flow from +the RS to GS. They initially predict the flow from consecutive RS frames, +subsequently rescaling it as the displacement fields from the RS frame to the +underlying GS image using time-dependent scaling factors. Following this, +RS-aware forward warping is employed to convert the RS image into its GS +counterpart. Nevertheless, this strategy is prone to two shortcomings. First, +the undistortion flow estimation is rendered inaccurate by merely linear +scaling the flow, due to the complex non-linear motion nature. Second, RS-aware +forward warping often results in unavoidable artifacts. To address these +limitations, we introduce a new framework that directly estimates the +distortion flow and rectifies the RS image with the backward warping operation. +More specifically, we first propose a global correlation-based flow attention +mechanism to estimate the initial distortion flow and GS feature jointly, which +are then refined by the following coarse-to-fine decoder layers. Additionally, +a multi-distortion flow prediction strategy is integrated to mitigate the issue +of inaccurate flow estimation further. Experimental results validate the +effectiveness of the proposed method, which outperforms state-of-the-art +approaches on various benchmarks while maintaining high efficiency. The project +is available at \url{https://github.com/ljzycmd/DFRSC}. + +
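The backward warping operation the method relies on, once a distortion flow is available, can be written generically with grid_sample; the flow estimator itself is not reproduced, and the zero-flow usage below is only a sanity check.

```python
# Generic backward warping with a dense per-pixel flow via grid_sample; the
# GS->RS distortion flow itself would come from the estimation network.
import torch
import torch.nn.functional as F

def backward_warp(image, flow):
    # image: (B, C, H, W); flow: (B, 2, H, W) in pixels, flow[:, 0]=dx, flow[:, 1]=dy
    b, _, h, w = image.shape
    ys, xs = torch.meshgrid(torch.arange(h), torch.arange(w), indexing="ij")
    grid = torch.stack([xs, ys], dim=0).float().unsqueeze(0).to(image)  # (1, 2, H, W)
    coords = grid + flow
    # Normalize sampling coordinates to [-1, 1] as expected by grid_sample.
    coords_x = 2.0 * coords[:, 0] / (w - 1) - 1.0
    coords_y = 2.0 * coords[:, 1] / (h - 1) - 1.0
    norm_grid = torch.stack([coords_x, coords_y], dim=-1)               # (B, H, W, 2)
    return F.grid_sample(image, norm_grid, align_corners=True)

rs_image = torch.rand(1, 3, 64, 64)
distortion_flow = torch.zeros(1, 2, 64, 64)   # zero flow -> identity warp
gs_estimate = backward_warp(rs_image, distortion_flow)
```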
+
+ comment: CVPR2024 +
+
+
+
+
+ + ☆ Matching 2D Images in 3D: Metric Relative Pose from Metric + Correspondences + + +
+ Given two images, we can estimate the relative camera pose between them by +establishing image-to-image correspondences. Usually, correspondences are +2D-to-2D and the pose we estimate is defined only up to scale. Some +applications, aiming at instant augmented reality anywhere, require +scale-metric pose estimates, and hence, they rely on external depth estimators +to recover the scale. We present MicKey, a keypoint matching pipeline that is +able to predict metric correspondences in 3D camera space. By learning to match +3D coordinates across images, we are able to infer the metric relative pose +without depth measurements. Depth measurements are also not required for +training, nor are scene reconstructions or image overlap information. MicKey is +supervised only by pairs of images and their relative poses. MicKey achieves +state-of-the-art performance on the Map-Free Relocalisation benchmark while +requiring less supervision than competing approaches. + +
+
+
+
+
+ + ☆ Audio-Visual Generalized Zero-Shot Learning using Pre-Trained Large + Multi-Modal Models CVPR + + +
+ Audio-visual zero-shot learning methods commonly build on features extracted +from pre-trained models, e.g. video or audio classification models. However, +existing benchmarks predate the popularization of large multi-modal models, +such as CLIP and CLAP. In this work, we explore such large pre-trained models +to obtain features, i.e. CLIP for visual features, and CLAP for audio features. +Furthermore, the CLIP and CLAP text encoders provide class label embeddings +which are combined to boost the performance of the system. We propose a simple +yet effective model that only relies on feed-forward neural networks, +exploiting the strong generalization capabilities of the new audio, visual and +textual features. Our framework achieves state-of-the-art performance on +VGGSound-GZSL, UCF-GZSL, and ActivityNet-GZSL with our new features. Code and +data available at: https://github.com/dkurzend/ClipClap-GZSL. + +
+
+ comment: CVPRw 2024 (L3D-IVU) +
+
+
+
+
+ + ☆ Fortifying Fully Convolutional Generative Adversarial Networks for Image + Super-Resolution Using Divergence Measures + + +
+ Super-Resolution (SR) is a time-hallowed image processing problem that aims +to improve the quality of a Low-Resolution (LR) sample up to the standard of +its High-Resolution (HR) counterpart. We aim to address this by introducing +Super-Resolution Generator (SuRGe), a fully-convolutional Generative +Adversarial Network (GAN)-based architecture for SR. We show that distinct +convolutional features obtained at increasing depths of a GAN generator can be +optimally combined by a set of learnable convex weights to improve the quality +of generated SR samples. In the process, we employ the Jensen-Shannon and the +Gromov-Wasserstein losses respectively between the SR-HR and LR-SR pairs of +distributions to further aid the generator of SuRGe to better exploit the +available information in an attempt to improve SR. Moreover, we train the +discriminator of SuRGe with the Wasserstein loss with gradient penalty, to +primarily prevent mode collapse. The proposed SuRGe, as an end-to-end GAN +workflow tailor-made for super-resolution, offers improved performance while +maintaining low inference time. The efficacy of SuRGe is substantiated by its +superior performance compared to 18 state-of-the-art contenders on 10 benchmark +datasets. + +
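A minimal sketch of the "learnable convex weights" idea follows: features from several generator depths are resized to a common resolution and blended with softmax weights so the combination stays convex. The channel counts, depth choices, and resizing step are assumptions for illustration.

```python
# Sketch of combining multi-depth generator features with learnable convex weights.
import torch
import torch.nn as nn
import torch.nn.functional as F

class ConvexFeatureFusion(nn.Module):
    def __init__(self, num_sources: int):
        super().__init__()
        # Softmax over learned logits keeps the combination convex.
        self.logits = nn.Parameter(torch.zeros(num_sources))

    def forward(self, feats):
        # feats: list of (B, C, Hi, Wi) tensors from increasing generator depths
        target = feats[-1].shape[-2:]
        feats = [F.interpolate(f, size=target, mode="bilinear", align_corners=False)
                 for f in feats]
        w = torch.softmax(self.logits, dim=0)
        return sum(wi * fi for wi, fi in zip(w, feats))

fusion = ConvexFeatureFusion(num_sources=3)
feats = [torch.rand(2, 64, s, s) for s in (16, 32, 64)]
out = fusion(feats)   # (2, 64, 64, 64)
```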
+
+
+
+
+ + ☆ Counterfactual Reasoning for Multi-Label Image Classification via + Patching-Based Training + + +
+ The key to multi-label image classification (MLC) is to improve model +performance by leveraging label correlations. Unfortunately, it has been shown +that overemphasizing co-occurrence relationships can cause the overfitting +issue of the model, ultimately leading to performance degradation. In this +paper, we provide a causal inference framework to show that the correlative +features caused by the target object and its co-occurring objects can be +regarded as a mediator, which has both positive and negative impacts on model +predictions. On the positive side, the mediator enhances the recognition +performance of the model by capturing co-occurrence relationships; on the +negative side, it has the harmful causal effect that causes the model to make +an incorrect prediction for the target object, even when only co-occurring +objects are present in an image. To address this problem, we propose a +counterfactual reasoning method to measure the total direct effect, achieved by +enhancing the direct effect caused only by the target object. Due to the +unknown location of the target object, we propose patching-based training and +inference to accomplish this goal, which divides an image into multiple patches +and identifies the pivot patch that contains the target object. Experimental +results on multiple benchmark datasets with diverse configurations validate +that the proposed method can achieve state-of-the-art performance. + +
+
+
+
+
+ + ☆ NoiseNCA: Noisy Seed Improves Spatio-Temporal Continuity of Neural + Cellular Automata + + +
+ Neural Cellular Automata (NCA) is a class of Cellular Automata where the update +rule is parameterized by a neural network that can be trained using gradient +descent. In this paper, we focus on NCA models used for texture synthesis, where the +update rule is inspired by partial differential equations (PDEs) describing +reaction-diffusion systems. To train the NCA model, the spatio-temporal domain is +discretized, and Euler integration is used to numerically simulate the PDE. However, +whether a trained NCA truly learns the continuous dynamics described by the +corresponding PDE or merely overfits the discretization used in training remains an +open question. We study NCA models at the limit where space-time discretization +approaches continuity. We find that existing NCA models tend to overfit the training +discretization, especially in the proximity of the initial condition, also called +"seed". To address this, we propose a solution that utilizes uniform noise as the +initial condition. We demonstrate the effectiveness of our approach in preserving +the consistency of NCA dynamics across a wide range of spatio-temporal +granularities. Our improved NCA model enables two new test-time interactions by +allowing continuous control over the speed of pattern formation and the scale of the +synthesized patterns. We demonstrate this new NCA feature in our interactive online +demo. Our work reveals that NCA models can learn continuous dynamics and opens new +avenues for NCA research from a dynamical systems perspective. + +
+
+ comment: 9 pages, 12 figures +
+
+
+
+
+ + ☆ Learning Embeddings with Centroid Triplet Loss for Object Identification + in Robotic Grasping + + +
+ Foundation models are a strong trend in deep learning and computer vision. +These models serve as a base for applications as they require minor or no +further fine-tuning by developers to integrate into their applications. +Foundation models for zero-shot object segmentation such as Segment Anything +(SAM) output segmentation masks from images without any further object +information. When they are followed in a pipeline by an object identification +model, they can perform object detection without training. Here, we focus on +training such an object identification model. A crucial practical aspect for an +object identification model is to be flexible in input size. As object +identification is an image retrieval problem, a suitable method should handle +multi-query multi-gallery situations without constraining the number of input +images (e.g. by having fixed-size aggregation layers). The key solution to +train such a model is the centroid triplet loss (CTL), which aggregates image +features to their centroids. CTL yields high accuracy, avoids misleading +training signals and keeps the model input size flexible. In our experiments, +we establish a new state of the art on the ArmBench object identification task, +which shows general applicability of our model. We furthermore demonstrate an +integrated unseen object detection pipeline on the challenging HOPE dataset, +which requires fine-grained detection. There, our pipeline matches and +surpasses related methods which have been trained on dataset-specific data. + +
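Below is a hedged sketch of a centroid triplet loss: each query embedding is pulled toward the centroid of its own class and pushed away from the nearest other-class centroid. The mining strategy, normalization, and margin are assumptions rather than the paper's exact recipe.

```python
# Sketch of a centroid triplet loss (CTL) over query and gallery embeddings.
import torch
import torch.nn.functional as F

def centroid_triplet_loss(query_emb, query_labels, gallery_emb, gallery_labels, margin=0.3):
    # query_emb: (Q, D), gallery_emb: (G, D); labels are integer class ids
    classes = gallery_labels.unique()
    centroids = torch.stack([gallery_emb[gallery_labels == c].mean(dim=0) for c in classes])
    centroids = F.normalize(centroids, dim=1)
    query_emb = F.normalize(query_emb, dim=1)

    dists = torch.cdist(query_emb, centroids)                 # (Q, num_classes)
    pos_idx = torch.stack([(classes == l).nonzero(as_tuple=True)[0][0] for l in query_labels])
    pos_d = dists.gather(1, pos_idx.view(-1, 1)).squeeze(1)   # distance to own centroid
    # Mask out the positive column, then take the hardest (closest) negative centroid.
    neg_d = dists.scatter(1, pos_idx.view(-1, 1), float("inf")).min(dim=1).values
    return F.relu(pos_d - neg_d + margin).mean()

emb = torch.randn(40, 128)
labels = torch.arange(5).repeat(8)            # 40 samples over 5 classes
loss = centroid_triplet_loss(emb[:10], labels[:10], emb[10:], labels[10:])
```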
+
+
+
+
+ + ☆ Robust Confidence Intervals in Stereo Matching using Possibility Theory + + +
+ We propose a method for estimating disparity confidence intervals in stereo +matching problems. Confidence intervals provide complementary information to +usual confidence measures. To the best of our knowledge, this is the first +method creating disparity confidence intervals based on the cost volume. This +method relies on possibility distributions to interpret the epistemic +uncertainty of the cost volume. Our method has the benefit of having a +white-box nature, differing in this respect from current state-of-the-art deep +neural networks approaches. The accuracy and size of confidence intervals are +validated using the Middlebury stereo datasets as well as a dataset of +satellite images. This contribution is freely available on GitHub. + +
+
+
+
+
+ + ☆ 3D Geometry-aware Deformable Gaussian Splatting for Dynamic View + Synthesis CVPR 2024 + + +
+ In this paper, we propose a 3D geometry-aware deformable Gaussian Splatting +method for dynamic view synthesis. Existing neural radiance fields (NeRF) based +solutions learn the deformation in an implicit manner, which cannot incorporate 3D +scene geometry. Therefore, the learned deformation is not necessarily geometrically +coherent, which results in unsatisfactory dynamic view synthesis and 3D dynamic +reconstruction. Recently, 3D Gaussian Splatting provides a new representation of the +3D scene, building upon which the 3D geometry can be exploited in learning the +complex 3D deformation. Specifically, the scenes are represented as a collection of +3D Gaussians, where each 3D Gaussian is optimized to move and rotate over time to +model the deformation. To enforce the 3D scene geometry constraint during +deformation, we explicitly extract 3D geometry features and integrate them in +learning the 3D deformation. In this way, our solution achieves 3D geometry-aware +deformation modeling, which enables improved dynamic view synthesis and 3D dynamic +reconstruction. Extensive experimental results on both synthetic and real datasets +prove the superiority of our solution, which achieves new state-of-the-art +performance. The project is available at https://npucvr.github.io/GaGS/ + +
+
+ comment: Accepted by CVPR 2024. Project page: https://npucvr.github.io/GaGS/ +
+
+
+
+
+ + ☆ Spatial-Temporal Multi-level Association for Video Object Segmentation + + +
+ Existing semi-supervised video object segmentation methods either focus on +temporal feature matching or spatial-temporal feature modeling. However, they +do not address the issues of sufficient target interaction and efficient +parallel processing simultaneously, thereby constraining the learning of +dynamic, target-aware features. To tackle these limitations, this paper +proposes a spatial-temporal multi-level association framework, which jointly +associates reference frame, test frame, and object features to achieve +sufficient interaction and parallel target ID association with a +spatial-temporal memory bank for efficient video object segmentation. +Specifically, we construct a spatial-temporal multi-level feature association +module to learn better target-aware features, which formulates feature +extraction and interaction as the efficient operations of object +self-attention, reference object enhancement, and test reference correlation. +In addition, we propose a spatial-temporal memory to assist feature association +and temporal ID assignment and correlation. We evaluate the proposed method by +conducting extensive experiments on numerous video object segmentation +datasets, including DAVIS 2016/2017 val, DAVIS 2017 test-dev, and YouTube-VOS +2018/2019 val. The favorable performance against the state-of-the-art methods +demonstrates the effectiveness of our approach. All source code and trained +models will be made publicly available. + +
+
+
+
+
+ + ☆ Playing to Vision Foundation Model's Strengths in Stereo Matching + + +
+ Stereo matching has become a key technique for 3D environment perception in +intelligent vehicles. For a considerable time, convolutional neural networks +(CNNs) have remained the mainstream choice for feature extraction in this +domain. Nonetheless, there is a growing consensus that the existing paradigm +should evolve towards vision foundation models (VFM), particularly those +developed based on vision Transformers (ViTs) and pre-trained through +self-supervision on extensive, unlabeled datasets. While VFMs are adept at +extracting informative, general-purpose visual features, specifically for dense +prediction tasks, their performance often lacks in geometric vision tasks. This +study serves as the first exploration of a viable approach for adapting VFMs to +stereo matching. Our ViT adapter, referred to as ViTAS, is constructed upon +three types of modules: spatial differentiation, patch attention fusion, and +cross-attention. The first module initializes feature pyramids, while the +latter two aggregate stereo and multi-scale contextual information into +fine-grained features, respectively. ViTAStereo, which combines ViTAS with cost +volume-based stereo matching back-end processes, achieves the top rank on the +KITTI Stereo 2012 dataset and outperforms the second-best network StereoBase by +approximately 7.9% in terms of the percentage of error pixels, with a tolerance +of 3 pixels. Additional experiments across diverse scenarios further +demonstrate its superior generalizability compared to all other +state-of-the-art approaches. We believe this new paradigm will pave the way for +the next generation of stereo matching networks. + +
+
+
+
+
+ + ☆ Robust feature knowledge distillation for enhanced performance of + lightweight crack segmentation models + + +
+ Vision-based crack detection faces deployment challenges due to the size of +robust models and edge device limitations. These can be addressed with +lightweight models trained with knowledge distillation (KD). However, +state-of-the-art (SOTA) KD methods compromise anti-noise robustness. This paper +develops Robust Feature Knowledge Distillation (RFKD), a framework to improve +robustness while retaining the precision of light models for crack +segmentation. RFKD distils knowledge from a teacher model's logit layers and +intermediate feature maps while leveraging mixed clean and noisy images to +transfer robust patterns to the student model, improving its precision, +generalisation, and anti-noise performance. To validate the proposed RFKD, a +lightweight crack segmentation model, PoolingCrack Tiny (PCT), with only 0.5 M +parameters, is also designed and used as the student to run the framework. The +results show a significant enhancement in noisy images, with RFKD reaching a +62% enhanced mean Dice score (mDS) compared to SOTA KD methods. + +
+
+ comment: 24 pages, 13 figures +
+
+
+
+
+ + ☆ Label-Efficient 3D Object Detection For Road-Side Units + + +
+ Occlusion presents a significant challenge for safety-critical applications +such as autonomous driving. Collaborative perception has recently attracted a +large research interest thanks to the ability to enhance the perception of +autonomous vehicles via deep information fusion with intelligent roadside units +(RSU), thus minimizing the impact of occlusion. While significant advancement +has been made, the data-hungry nature of these methods creates a major hurdle +for their real-world deployment, particularly due to the need for annotated RSU +data. Manually annotating the vast amount of RSU data required for training is +prohibitively expensive, given the sheer number of intersections and the effort +involved in annotating point clouds. We address this challenge by devising a +label-efficient object detection method for RSU based on unsupervised object +discovery. Our paper introduces two new modules: one for object discovery based +on a spatial-temporal aggregation of point clouds, and another for refinement. +Furthermore, we demonstrate that fine-tuning on a small portion of annotated +data allows our object discovery models to narrow the performance gap with, or +even surpass, fully supervised models. Extensive experiments are carried out in +simulated and real-world datasets to evaluate our method. + +
+
+ comment: IV 2024 +
+
+
+
+
+ + ☆ From Barlow Twins to Triplet Training: Differentiating Dementia with + Limited Data + + +
+ Differential diagnosis of dementia is challenging due to overlapping +symptoms, with structural magnetic resonance imaging (MRI) being the primary +method for diagnosis. Despite the clinical value of computer-aided differential +diagnosis, research has been limited, mainly due to the absence of public +datasets that contain diverse types of dementia. This leaves researchers with +small in-house datasets that are insufficient for training deep neural networks +(DNNs). Self-supervised learning shows promise for utilizing unlabeled MRI +scans in training, but small batch sizes for volumetric brain scans make its +application challenging. To address these issues, we propose Triplet Training +for differential diagnosis with limited target data. It consists of three key +stages: (i) self-supervised pre-training on unlabeled data with Barlow Twins, +(ii) self-distillation on task-related data, and (iii) fine-tuning on the +target dataset. Our approach significantly outperforms traditional training +strategies, achieving a balanced accuracy of 75.6%. We further provide insights +into the training process by visualizing changes in the latent space after each +step. Finally, we validate the robustness of Triplet Training in terms of its +individual components in a comprehensive ablation study. Our code is available +at https://github.com/ai-med/TripletTraining. + +
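Stage (i) of the pipeline relies on Barlow Twins self-supervised pre-training; the standard redundancy-reduction loss it uses is sketched below on two augmented views. The lambda weight and embedding size are illustrative, not the paper's settings.

```python
# Standard Barlow Twins redundancy-reduction loss on two views of the same batch.
import torch

def barlow_twins_loss(z1, z2, lam=5e-3, eps=1e-6):
    # z1, z2: (N, D) projector outputs for two augmented views
    n, d = z1.shape
    z1 = (z1 - z1.mean(0)) / (z1.std(0) + eps)
    z2 = (z2 - z2.mean(0)) / (z2.std(0) + eps)
    c = (z1.T @ z2) / n                                          # (D, D) cross-correlation
    on_diag = (torch.diagonal(c) - 1).pow(2).sum()               # push diagonal to 1
    off_diag = (c - torch.diag(torch.diagonal(c))).pow(2).sum()  # push off-diagonal to 0
    return on_diag + lam * off_diag

z1, z2 = torch.randn(64, 256), torch.randn(64, 256)
print(barlow_twins_loss(z1, z2))
```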
+
+ comment: Accepted for presentation at MIDL 2024 +
+
+
+
+
+ + ☆ ColorMNet: A Memory-based Deep Spatial-Temporal Feature Propagation + Network for Video Colorization + + +
+ How to effectively explore spatial-temporal features is important for video +colorization. Instead of stacking multiple frames along the temporal dimension +or recurrently propagating estimated features that will accumulate errors or +cannot explore information from far-apart frames, we develop a memory-based +feature propagation module that can establish reliable connections with +features from far-apart frames and alleviate the influence of inaccurately +estimated features. To extract better features from each frame for the +above-mentioned feature propagation, we explore the features from +large-pretrained visual models to guide the feature estimation of each frame so +that the estimated features can model complex scenarios. In addition, we note +that adjacent frames usually contain similar contents. To explore this property +for better spatial and temporal feature utilization, we develop a local +attention module to aggregate the features from adjacent frames in a +spatial-temporal neighborhood. We formulate our memory-based feature +propagation module, large-pretrained visual model guided feature estimation +module, and local attention module into an end-to-end trainable network (named +ColorMNet) and show that it performs favorably against state-of-the-art methods +on both the benchmark datasets and real-world scenarios. The source code and +pre-trained models will be available at +\url{https://github.com/yyang181/colormnet}. + +
+
+ comment: Project website: \url{https://github.com/yyang181/colormnet} +
+
+
+
+
+ + ☆ LRR: Language-Driven Resamplable Continuous Representation against + Adversarial Tracking Attacks + + +
+ Visual object tracking plays a critical role in visual-based autonomous +systems, as it aims to estimate the position and size of the object of interest +within a live video. Despite significant progress made in this field, +state-of-the-art (SOTA) trackers often fail when faced with adversarial +perturbations in the incoming frames. This can lead to significant robustness +and security issues when these trackers are deployed in the real world. To +achieve high accuracy on both clean and adversarial data, we propose building a +spatial-temporal continuous representation using the semantic text guidance of +the object of interest. This novel continuous representation enables us to +reconstruct incoming frames to maintain semantic and appearance consistency +with the object of interest and its clean counterparts. As a result, our +proposed method successfully defends against different SOTA adversarial +tracking attacks while maintaining high accuracy on clean data. In particular, +our method significantly increases tracking accuracy under adversarial attacks +with around 90% relative improvement on UAV123, which is even higher than the +accuracy on clean data. + +
+
+
+
+
+ + ☆ GHNeRF: Learning Generalizable Human Features with Efficient Neural + Radiance Fields + + +
+ Recent advances in Neural Radiance Fields (NeRF) have demonstrated promising +results in 3D scene representations, including 3D human representations. +However, these representations often lack crucial information on the underlying +human pose and structure, which is crucial for AR/VR applications and games. In +this paper, we introduce a novel approach, termed GHNeRF, designed to address +these limitations by learning 2D/3D joint locations of human subjects with NeRF +representation. GHNeRF uses a pre-trained 2D encoder streamlined to extract +essential human features from 2D images, which are then incorporated into the +NeRF framework in order to encode human biomechanic features. This allows our +network to simultaneously learn biomechanic features, such as joint locations, +along with human geometry and texture. To assess the effectiveness of our +method, we conduct a comprehensive comparison with state-of-the-art human NeRF +techniques and joint estimation algorithms. Our results show that GHNeRF can +achieve state-of-the-art results in near real-time. + +
+
+
+
+
+ + ☆ Anchor-based Robust Finetuning of Vision-Language Models CVPR2024 + + +
+ We aim at finetuning a vision-language model without hurting its +out-of-distribution (OOD) generalization. We address two types of OOD +generalization, i.e., i) domain shift such as natural to sketch images, and ii) +zero-shot capability to recognize categories that were not contained in the finetune +data. Arguably, the diminished OOD generalization after finetuning stems from the +excessively simplified finetuning target, which only provides the class information, +such as ``a photo of a [CLASS]''. This is distinct from the process by which CLIP +was pretrained, where there is abundant text supervision with rich semantic +information. Therefore, we propose to compensate for the finetune process using +auxiliary supervision with rich semantic information, which acts as anchors to +preserve the OOD generalization. Specifically, two types of anchors are elaborated +in our method, including i) the text-compensated anchor, which uses the images from +the finetune set but enriches the text supervision with a pretrained captioner, and +ii) the image-text-pair anchor, which is retrieved from a dataset similar to the +pretraining data of CLIP according to the downstream task and is associated with the +original CLIP text carrying rich semantics. Those anchors are utilized as auxiliary +semantic information to maintain the original feature space of CLIP, thereby +preserving the OOD generalization capabilities. Comprehensive experiments +demonstrate that our method achieves in-distribution performance akin to +conventional finetuning while attaining new state-of-the-art results on domain shift +and zero-shot learning benchmarks. + +
+
+ comment: CVPR2024 +
+
+
+
+
+ + ☆ ActNetFormer: Transformer-ResNet Hybrid Method for Semi-Supervised + Action Recognition in Videos + + +
+ Human action or activity recognition in videos is a fundamental task in +computer vision with applications in surveillance and monitoring, self-driving +cars, sports analytics, human-robot interaction and many more. Traditional +supervised methods require large annotated datasets for training, which are +expensive and time-consuming to acquire. This work proposes a novel approach +using Cross-Architecture Pseudo-Labeling with contrastive learning for +semi-supervised action recognition. Our framework leverages both labeled and +unlabelled data to robustly learn action representations in videos, combining +pseudo-labeling with contrastive learning for effective learning from both +types of samples. We introduce a novel cross-architecture approach where 3D +Convolutional Neural Networks (3D CNNs) and video transformers (VIT) are +utilised to capture different aspects of action representations; hence we call +it ActNetFormer. The 3D CNNs excel at capturing spatial features and local +dependencies in the temporal domain, while VIT excels at capturing long-range +dependencies across frames. By integrating these complementary architectures +within the ActNetFormer framework, our approach can effectively capture both +local and global contextual information of an action. This comprehensive +representation learning enables the model to achieve better performance in +semi-supervised action recognition tasks by leveraging the strengths of each of +these architectures. Experimental results on standard action recognition +datasets demonstrate that our approach performs better than the existing +methods, achieving state-of-the-art performance with only a fraction of labeled +data. The official website of this work is available at: +https://github.com/rana2149/ActNetFormer. + +
+
+ comment: Submitted for peer review +
+
+
+
+
+ + ☆ Hyperparameter-Free Medical Image Synthesis for Sharing Data and + Improving Site-Specific Segmentation + + +
+ Sharing synthetic medical images is a promising alternative to sharing real +images that can improve patient privacy and data security. To get good results, +existing methods for medical image synthesis must be manually adjusted when +they are applied to unseen data. To remove this manual burden, we introduce a +Hyperparameter-Free distributed learning method for automatic medical image +Synthesis, Sharing, and Segmentation called HyFree-S3. For three diverse +segmentation settings (pelvic MRIs, lung X-rays, polyp photos), the use of +HyFree-S3 results in improved performance over training only with site-specific +data (in the majority of cases). The hyperparameter-free nature of the method +should make data synthesis and sharing easier, potentially leading to an +increase in the quantity of available data and consequently the quality of the +models trained that may ultimately be applied in the clinic. Our code is +available at https://github.com/AwesomeLemon/HyFree-S3 + +
+
+ comment: Accepted at MIDL 2024 +
+
+
+
+
+ + ☆ Automatic Defect Detection in Sewer Network Using Deep Learning Based + Object Detector + + +
+ Maintaining sewer systems in large cities is important, but also time- and +effort-consuming, because visual inspections are currently done manually. To reduce +the amount of the aforementioned manual work, defects within sewer pipes should be +located and classified automatically. In the past, multiple works have attempted to +solve this problem using classical image processing, machine learning, or a +combination of both. However, each provided solution only focuses on detecting a +limited set of defect/structure types, such as fissure, root, and/or connection. +Furthermore, due to the use of hand-crafted features and small training datasets, +generalization is also problematic. In order to overcome these deficits, a sizable +dataset covering 14.7 km of various sewer pipes was annotated by sewer maintenance +experts in the scope of this work. On top of that, an object detector +(EfficientDet-D0) was trained for automatic defect detection. From the results of +several experiments, peculiar characteristics of defects in the context of object +detection, which greatly affect the annotation and training process, are identified +and discussed. In the end, the final detector was able to detect 83% of defects in +the test set; out of the missing 17%, only 0.77% are very severe defects. This work +provides an example of applying deep learning-based object detection to an important +but quiet engineering field. It also gives some practical pointers on how to +annotate peculiar "objects", such as defects. + +
+
+
+
+
+ + ☆ OmniFusion Technical Report + + +
+ Last year, multimodal architectures served up a revolution in AI-based +approaches and solutions, extending the capabilities of large language models +(LLM). We propose an \textit{OmniFusion} model based on a pretrained LLM and +adapters for visual modality. We evaluated and compared several architecture +design principles for better text and visual data coupling: MLP and transformer +adapters, various CLIP ViT-based encoders (SigLIP, InternVIT, etc.), and their +fusing approach, image encoding method (whole image or tiles encoding) and two +7B LLMs (the proprietary one and open-source Mistral). Experiments on 8 +visual-language benchmarks show the top score for the best OmniFusion setup in +terms of different VQA tasks in comparison with open-source LLaVA-like +solutions: VizWiz, Pope, MM-Vet, ScienceQA, MMBench, TextVQA, VQAv2, MMMU. We +also propose a variety of situations, where OmniFusion provides highly-detailed +answers in different domains: housekeeping, sightseeing, culture, medicine, +handwritten and scanned equations recognition, etc. Mistral-based OmniFusion +model is an open-source solution with weights, training and inference scripts +available at https://github.com/AIRI-Institute/OmniFusion. + +
+
+ comment: 17 pages, 4 figures, 9 tables, 2 appendices +
+
+
+
+
+ + ☆ Unified Physical-Digital Attack Detection Challenge + + +
+ Face Anti-Spoofing (FAS) is crucial to safeguard Face Recognition (FR) +Systems. In real-world scenarios, FRs are confronted with both physical and +digital attacks. However, existing algorithms often address only one type of +attack at a time, which poses significant limitations in real-world scenarios +where FR systems face hybrid physical-digital threats. To facilitate the +research of Unified Attack Detection (UAD) algorithms, a large-scale +UniAttackData dataset has been collected. UniAttackData is the largest public +dataset for Unified Attack Detection, with a total of 28,706 videos, where each +unique identity encompasses all advanced attack types. Based on this dataset, +we organized a Unified Physical-Digital Face Attack Detection Challenge to +boost the research in Unified Attack Detections. It attracted 136 teams for the +development phase, with 13 qualifying for the final round. The results +re-verified by the organizing team were used for the final ranking. This paper +comprehensively reviews the challenge, detailing the dataset introduction, +protocol definition, evaluation criteria, and a summary of published results. +Finally, we focus on the detailed analysis of the highest-performing algorithms +and offer potential directions for unified physical-digital attack detection +inspired by this competition. Challenge Website: +https://sites.google.com/view/face-anti-spoofing-challenge/welcome/challengecvpr2024. + +
+
+ comment: 11 pages, 10 figures +
+
+
+
+
+ + ☆ Leveraging edge detection and neural networks for better UAV + localization RSS2024 + + +
+ We propose a novel method for geolocalizing Unmanned Aerial Vehicles (UAVs) +in environments lacking Global Navigation Satellite Systems (GNSS). Current +state-of-the-art techniques employ an offline-trained encoder to generate a +vector representation (embedding) of the UAV's current view, which is then +compared with pre-computed embeddings of geo-referenced images to determine the +UAV's position. Here, we demonstrate that the performance of these methods can +be significantly enhanced by preprocessing the images to extract their edges, +which exhibit robustness to seasonal and illumination variations. Furthermore, +we establish that utilizing edges enhances resilience to orientation and +altitude inaccuracies. Additionally, we introduce a confidence criterion for +localization. Our findings are substantiated through synthetic experiments. + +
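The preprocessing idea can be illustrated with a plain gradient-magnitude edge map that replaces raw pixels before embedding, followed by cosine matching against geo-referenced embeddings; the offline-trained encoder and the embedding database are not reproduced here.

```python
# Illustration of the edge-preprocessing step and the embedding lookup it feeds.
import numpy as np

def edge_map(gray: np.ndarray) -> np.ndarray:
    # gray: (H, W) float image; Sobel-style gradient magnitude, normalized to [0, 1]
    kx = np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]], dtype=float)
    ky = kx.T
    pad = np.pad(gray, 1, mode="edge")
    gx = sum(kx[i, j] * pad[i:i + gray.shape[0], j:j + gray.shape[1]]
             for i in range(3) for j in range(3))
    gy = sum(ky[i, j] * pad[i:i + gray.shape[0], j:j + gray.shape[1]]
             for i in range(3) for j in range(3))
    mag = np.hypot(gx, gy)
    return mag / (mag.max() + 1e-8)

def cosine_match(query_emb, reference_embs):
    # Returns the index of the best-matching pre-computed geo-referenced embedding.
    q = query_emb / np.linalg.norm(query_emb)
    r = reference_embs / np.linalg.norm(reference_embs, axis=1, keepdims=True)
    return int(np.argmax(r @ q))

view = np.random.rand(128, 128)
edges = edge_map(view)   # fed to the offline-trained encoder instead of raw pixels
```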
+
+ comment: Accepted for publication in IGARSS2024. 4 pages, 3 figures, 3 tables +
+
+
+
+
+ + ☆ Automated National Urban Map Extraction + + +
+ Developing countries usually lack the proper governance means to generate and +regularly update a national rooftop map. Using traditional photogrammetry and +surveying methods to produce a building map at the federal level is costly and +time consuming. Using earth observation and deep learning methods, we can +bridge this gap and propose an automated pipeline to fetch such national urban +maps. This paper aims to exploit the power of fully convolutional neural +networks for multi-class buildings' instance segmentation to leverage high +object-wise accuracy results. Buildings' instance segmentation from sub-meter +high-resolution satellite images can be achieved with relatively high +pixel-wise metric scores. We detail all engineering steps to replicate this +work and ensure highly accurate results in dense and slum areas witnessed in +regions that lack proper urban planning in the Global South. We applied a case +study of the proposed pipeline to Lebanon and successfully produced the first +comprehensive national building footprint map with approximately 1 Million +units with an 84% accuracy. The proposed architecture relies on advanced +augmentation techniques to overcome dataset scarcity, which is often the case +in developing countries. + +
+
+
+
+
+ + ☆ Exploring the Potential of Large Foundation Models for Open-Vocabulary + HOI Detection + + +
+ Open-vocabulary human-object interaction (HOI) detection, which is concerned +with the problem of detecting novel HOIs guided by natural language, is crucial +for understanding human-centric scenes. However, prior zero-shot HOI detectors +often employ the same levels of feature maps to model HOIs with varying +distances, leading to suboptimal performance in scenes containing human-object +pairs with a wide range of distances. In addition, these detectors primarily +rely on category names and overlook the rich contextual information that +language can provide, which is essential for capturing open vocabulary concepts +that are typically rare and not well-represented by category names alone. In +this paper, we introduce a novel end-to-end open vocabulary HOI detection +framework with conditional multi-level decoding and fine-grained semantic +enhancement (CMD-SE), harnessing the potential of Visual-Language Models +(VLMs). Specifically, we propose to model human-object pairs with different +distances with different levels of feature maps by incorporating a soft +constraint during the bipartite matching process. Furthermore, by leveraging +large language models (LLMs) such as GPT models, we exploit their extensive +world knowledge to generate descriptions of human body part states for various +interactions. Then we integrate the generalizable and fine-grained semantics of +human body parts to improve interaction recognition. Experimental results on +two datasets, SWIG-HOI and HICO-DET, demonstrate that our proposed method +achieves state-of-the-art results in open vocabulary HOI detection. The code +and models are available at https://github.com/ltttpku/CMD-SE-release. + +
+
+
+
+
+ + ☆ EPL: Evidential Prototype Learning for Semi-supervised Medical Image + Segmentation + + +
+ Although current semi-supervised medical segmentation methods can achieve +decent performance, they are still affected by the uncertainty in unlabeled +data and model predictions, and there is currently a lack of effective +strategies that can explore the uncertain aspects of both simultaneously. To +address these issues, we propose Evidential Prototype Learning +(EPL), which utilizes an extended probabilistic framework to effectively fuse +voxel probability predictions from different sources and achieves prototype +fusion of labeled and unlabeled data under a generalized evidential +framework, leveraging voxel-level dual uncertainty masking. The uncertainty not +only enables the model to self-correct predictions but also improves the guided +learning process with pseudo-labels and is able to feed back into the +construction of hidden features. The proposed method has been +evaluated on the LA, Pancreas-CT and TBAD datasets, achieving +state-of-the-art performance at three different labeled ratios, which strongly +demonstrates the effectiveness of our strategy.
+
+
+
+
+ + ☆ YOLC: You Only Look Clusters for Tiny Object Detection in Aerial Images + + +
+ Detecting objects from aerial images poses significant challenges due to the +following factors: 1) Aerial images typically have very large sizes, generally +with millions or even hundreds of millions of pixels, while computational +resources are limited. 2) Small object size leads to insufficient information +for effective detection. 3) Non-uniform object distribution leads to +computational resource wastage. To address these issues, we propose YOLC (You +Only Look Clusters), an efficient and effective framework that builds on an +anchor-free object detector, CenterNet. To overcome the challenges posed by +large-scale images and non-uniform object distribution, we introduce a Local +Scale Module (LSM) that adaptively searches cluster regions for zooming in for +accurate detection. Additionally, we modify the regression loss using Gaussian +Wasserstein distance (GWD) to obtain high-quality bounding boxes. Deformable +convolution and refinement methods are employed in the detection head to +enhance the detection of small objects. We perform extensive experiments on two +aerial image datasets, including Visdrone2019 and UAVDT, to demonstrate the +effectiveness and superiority of our proposed approach. + +
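+ The Gaussian Wasserstein distance used for the regression loss can be sketched for axis-aligned boxes as follows; this is a simplified, hedged reading of the idea (boxes modeled as 2D Gaussians), not necessarily the exact formulation or normalization used in YOLC.
+
+import numpy as np
+
+def gwd2_axis_aligned(box_a, box_b):
+    # Boxes given as (cx, cy, w, h); each is modeled as a 2D Gaussian with
+    # mean (cx, cy) and covariance diag((w/2)^2, (h/2)^2). For axis-aligned
+    # boxes the squared 2-Wasserstein distance reduces to the sum below.
+    cxa, cya, wa, ha = box_a
+    cxb, cyb, wb, hb = box_b
+    mean_term = (cxa - cxb) ** 2 + (cya - cyb) ** 2
+    cov_term = (wa / 2 - wb / 2) ** 2 + (ha / 2 - hb / 2) ** 2
+    return mean_term + cov_term
+
+def gwd_loss(box_a, box_b, tau=1.0):
+    # One common way to map the unbounded distance into (0, 1) for a loss.
+    d = np.sqrt(gwd2_axis_aligned(box_a, box_b))
+    return 1.0 - 1.0 / (tau + np.log1p(d))
+
+print(gwd_loss((10, 10, 4, 4), (12, 10, 4, 6)))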
+
+ comment: accepted to TITS +
+
+
+
+
+ + ☆ Uncertainty-aware Evidential Fusion-based Learning for Semi-supervised + Medical Image Segmentation + + +
+ Although existing uncertainty-based semi-supervised medical segmentation +methods have achieved excellent performance, they usually only consider a +single uncertainty evaluation, which often fails to fully address the problem +of credibility. Therefore, based on the framework of evidential deep +learning, this paper integrates the evidential predictive results in the +cross-region of mixed and original samples to reallocate the confidence degree +and uncertainty measure of each voxel, which is realized by emphasizing +uncertain information in the probability-assignment fusion rule of traditional +evidence theory. Furthermore, we design a voxel-level asymptotic learning +strategy that combines information entropy with the fused +uncertainty measure to estimate voxel predictions more precisely. During +learning, the model gradually pays more attention to predictions with high +uncertainty, in order to learn the features that are difficult to master. +Experimental results on the LA, Pancreas-CT, ACDC and TBAD datasets demonstrate +the superior performance of our proposed method in comparison with the existing +state of the art.
+
+
+
+
+ + ☆ Improving Interpretable Embeddings for Ad-hoc Video Search with + Generative Captions and Multi-word Concept Bank + + +
+ Aligning a user query with video clips in a cross-modal latent space, and +aligning both with semantic concepts, are the two mainstream approaches for +ad-hoc video search (AVS). However, the effectiveness of existing approaches is +bottlenecked by the small sizes of available video-text datasets and the low +quality of concept banks, which results in failures on unseen queries and the +out-of-vocabulary problem. This paper addresses these two problems by +constructing a new dataset and developing a multi-word concept bank. +Specifically, capitalizing on a generative model, we construct a new dataset +consisting of 7 million generated text and video pairs for pre-training. To +tackle the out-of-vocabulary problem, we develop a multi-word concept bank +based on syntax analysis to enhance the capability of a state-of-the-art +interpretable AVS method in modeling relationships between query words. We also +study the impact of current advanced features on the method. Experimental +results show that the integration of the above-proposed elements doubles the +R@1 performance of the AVS method on the MSRVTT dataset and improves the xinfAP +on the TRECVid AVS query sets for 2016-2023 (eight years) by margins ranging +from 2% to 77%, with an average of about 20%.
+
+ comment: Accepted in ICMR2024 +
+
+
+
+
+ + ☆ Enhanced Radar Perception via Multi-Task Learning: Towards Refined Data + for Sensor Fusion Applications + + +
+ Radar and camera fusion yields robustness in perception tasks by leveraging +the strength of both sensors. The typical extracted radar point cloud is 2D +without height information due to insufficient antennas along the elevation +axis, which challenges the network performance. This work introduces a +learning-based approach to infer the height of radar points associated with 3D +objects. A novel robust regression loss is introduced to address the sparse +target challenge. In addition, a multi-task training strategy is employed, +emphasizing important features. The average radar absolute height error +decreases from 1.69 to 0.25 meters compared to the state-of-the-art height +extension method. The estimated target height values are used to preprocess and +enrich radar data for downstream perception tasks. Integrating this refined +radar information further enhances the performance of existing radar camera +fusion models for object detection and depth estimation tasks. + +
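+ The abstract does not spell out the robust regression loss, so the sketch below uses a standard Huber-style loss on sparsely supervised height targets purely as an assumed stand-in; `valid_mask` marks the radar points that actually carry a height label.
+
+import torch
+
+def huber_height_loss(pred_height, gt_height, valid_mask, delta=1.0):
+    # pred_height, gt_height: (N,) per-point heights in meters.
+    # valid_mask: (N,) bool, True where a supervised target exists
+    # (radar supervision is sparse, so many points may be unsupervised).
+    err = (pred_height - gt_height)[valid_mask]
+    abs_err = err.abs()
+    quad = 0.5 * err ** 2                      # quadratic near zero
+    lin = delta * (abs_err - 0.5 * delta)      # linear for large residuals (robust to outliers)
+    loss = torch.where(abs_err <= delta, quad, lin)
+    return loss.mean() if loss.numel() > 0 else pred_height.sum() * 0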
+
+ comment: Accepted by IEEE Intelligent Vehicles Symposium (IV 2024) +
+
+
+
+
+ + ☆ Efficient and Robust Point Cloud Registration via Heuristics-guided + Parameter Search + + +
+ Estimating the rigid transformation with 6 degrees of freedom based on a +putative 3D correspondence set is a crucial procedure in point cloud +registration. Existing correspondence identification methods usually lead to +large outlier ratios ($>$ 95 $\%$ is common), underscoring the significance of +robust registration methods. Many researchers turn to parameter search-based +strategies (e.g., Branch-and-Bound) for robust registration. Although related +methods show high robustness, their efficiency is limited by the +high-dimensional search space. This paper proposes a heuristics-guided +parameter search strategy to accelerate the search while maintaining high +robustness. We first sample some correspondences (i.e., heuristics) and then +just need to sequentially search the feasible regions that make each sample an +inlier. Our strategy largely reduces the search space and can guarantee +accuracy with only a few inlier samples, therefore enjoying an excellent +trade-off between efficiency and robustness. Since directly parameterizing the +6-dimensional nonlinear feasible region for efficient search is intractable, we +construct a three-stage decomposition pipeline to reparameterize the feasible +region, resulting in three lower-dimensional sub-problems that are easily +solvable via our strategy. Besides reducing the search dimension, our +decomposition enables the leverage of 1-dimensional interval stabbing at all +three stages for search acceleration. Moreover, we propose a valid sampling +strategy to guarantee the effectiveness of our sampling, and a compatibility +verification setup to further accelerate our search. Extensive experiments on +both simulated and real-world datasets demonstrate that our approach exhibits +comparable robustness with state-of-the-art methods while achieving a +significant efficiency boost.
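+ The 1-dimensional interval stabbing primitive mentioned above can be sketched generically as follows (a standard consensus-maximization subroutine, not the authors' exact implementation): given per-correspondence feasible intervals for a scalar parameter, find a value contained in as many intervals as possible.
+
+def interval_stabbing(intervals):
+    # intervals: list of (lo, hi) feasible ranges, one per correspondence.
+    # Returns (best_point, max_count): a parameter value consistent with the
+    # largest number of intervals, found by a sweep over sorted endpoints.
+    events = []
+    for lo, hi in intervals:
+        events.append((lo, +1))   # interval opens
+        events.append((hi, -1))   # interval closes
+    # Process openings before closings at equal coordinates so touching
+    # intervals are still counted as stabbed.
+    events.sort(key=lambda e: (e[0], -e[1]))
+    best_point, best_count, count = None, 0, 0
+    for x, delta in events:
+        count += delta
+        if count > best_count:
+            best_count, best_point = count, x
+    return best_point, best_count
+
+print(interval_stabbing([(0, 2), (1, 3), (2.5, 4), (1.5, 2.2)]))  # -> (1.5, 3)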
+
+ comment: 21 pages, 16 figures. Accepted to IEEE Transactions on Pattern + Analysis and Machine Intelligence, 2024 +
+
+
+
+
+ + ☆ Concise Plane Arrangements for Low-Poly Surface and Volume Modelling + + +
+ Plane arrangements are a useful tool for surface and volume modelling. +However, their main drawback is poor scalability. We introduce two key +novelties that enable the construction of plane arrangements for complex +objects and entire scenes: an ordering scheme for the plane insertion and the +direct use of input points during arrangement construction. Both ingredients +reduce the number of unwanted splits, resulting in improved scalability of the +construction mechanism by up to two orders of magnitude compared to existing +algorithms. We further introduce a remeshing and simplification technique that +allows us to extract low-polygon surface meshes and lightweight convex +decompositions of volumes from the arrangement. We show that our approach leads +to state-of-the-art results for the aforementioned tasks by comparing it to +learning-based and traditional approaches on various different datasets. Our +implementation is available at https://github.com/raphaelsulzer/compod . + +
+
+
+
+
+ + ☆ HFNeRF: Learning Human Biomechanic Features with Neural Radiance Fields + + +
+ In recent advancements in novel view synthesis, generalizable Neural Radiance +Fields (NeRF) based methods applied to human subjects have shown remarkable +results in generating novel views from few images. However, this generalization +ability cannot capture the underlying structural features of the skeleton +shared across all instances. Building upon this, we introduce HFNeRF: a novel +generalizable human feature NeRF aimed at generating human biomechanic features +using a pre-trained image encoder. While previous human NeRF methods have shown +promising results in the generation of photorealistic virtual avatars, such +methods lack underlying human structure or biomechanic features such as +skeleton or joint information that are crucial for downstream applications +including Augmented Reality (AR)/Virtual Reality (VR). HFNeRF leverages 2D +pre-trained foundation models to learn human features in 3D using neural +rendering, and then uses volume rendering to generate 2D feature maps. We +evaluate HFNeRF on the skeleton estimation task by predicting heatmaps as +features. The proposed method is fully differentiable, allowing it to learn +color, geometry, and the human skeleton simultaneously. This paper +presents preliminary results of HFNeRF, illustrating its potential in +generating realistic virtual avatars with biomechanic features using NeRF.
+
+
+
+
+ + ☆ DiffHarmony: Latent Diffusion Model Meets Image Harmonization + + +
+ Image harmonization, which involves adjusting the foreground of a composite +image to attain a unified visual consistency with the background, can be +conceptualized as an image-to-image translation task. Diffusion models have +recently promoted the rapid development of image-to-image translation tasks. +However, training diffusion models from scratch is computationally intensive. +Fine-tuning pre-trained latent diffusion models entails dealing with the +reconstruction error induced by the image compression autoencoder, making it +unsuitable for image generation tasks that involve pixel-level evaluation +metrics. To deal with these issues, in this paper, we first adapt a pre-trained +latent diffusion model to the image harmonization task to generate the +harmonious but potentially blurry initial images. Then we implement two +strategies: utilizing higher-resolution images during inference and +incorporating an additional refinement stage, to further enhance the clarity of +the initially harmonized images. Extensive experiments on iHarmony4 datasets +demonstrate the superiority of our proposed method. The code and model will be +made publicly available at https://github.com/nicecv/DiffHarmony.
+
+ comment: Accepted by ICMR 2024 +
+
+
+
+
+ + ☆ Mansformer: Efficient Transformer of Mixed Attention for Image + Deblurring and Beyond + + +
+ The Transformer has achieved enormous success in natural language processing and +high-level vision over the past few years. However, the complexity of +self-attention is quadratic in the image size, which makes it infeasible for +high-resolution vision tasks. In this paper, we propose the Mansformer, a +Transformer of mixed attention that combines multiple self-attentions, gating, +and multi-layer perceptrons (MLPs), to explore and employ more possibilities of +self-attention. Taking efficiency into account, we design four kinds of +self-attention, whose complexities are all linear. By elaborate adjustment of +the tensor shapes and dimensions for the dot product, we split the typical +self-attention of quadratic complexity into four operations of linear +complexity. To adaptively merge these different kinds of self-attention, we +take advantage of an architecture similar to Squeeze-and-Excitation Networks. +Furthermore, we merge the two-stage Transformer design into a single stage via +the proposed gated-dconv MLP. Image deblurring is our main target, +while extensive quantitative and qualitative evaluations show that this method +performs favorably against state-of-the-art methods on tasks well beyond +deblurring. The source codes and trained models will be made available to the +public.
+
+
+
+
+ + ☆ Gaussian Pancakes: Geometrically-Regularized 3D Gaussian Splatting for + Realistic Endoscopic Reconstruction + + +
+ Within colorectal cancer diagnostics, conventional colonoscopy techniques +face critical limitations, including a limited field of view and a lack of +depth information, which can impede the detection of precancerous lesions. +Current methods struggle to provide comprehensive and accurate 3D +reconstructions of the colonic surface, which could help minimize missed +regions and reinspection for pre-cancerous polyps. Addressing this, we +introduce 'Gaussian Pancakes', a method that leverages 3D Gaussian Splatting +(3D GS) combined with a Recurrent Neural Network-based Simultaneous +Localization and Mapping (RNNSLAM) system. By introducing geometric and depth +regularization into the 3D GS framework, our approach ensures more accurate +alignment of Gaussians with the colon surface, resulting in smoother 3D +reconstructions and novel views of detailed textures and structures. +Evaluations across three diverse datasets show that Gaussian Pancakes enhances +novel view synthesis quality, surpassing current leading methods with an 18% +boost in PSNR and a 16% improvement in SSIM. It also delivers over 100X faster +rendering and more than 10X shorter training times, making it a practical tool +for real-time applications. Hence, it holds promise for clinical translation +toward better detection and diagnosis of colorectal cancer.
+
+ comment: 12 pages, 5 figures +
+
+
+
+
+ + ☆ Hierarchical Insights: Exploiting Structural Similarities for Reliable + 3D Semantic Segmentation IROS 2024 + + +
+ Safety-critical applications like autonomous driving call for robust 3D +environment perception algorithms which can withstand highly diverse and +ambiguous surroundings. The predictive performance of any classification model +strongly depends on the underlying dataset and the prior knowledge conveyed by +the annotated labels. While the labels provide a basis for the learning +process, they usually fail to represent inherent relations between the classes +- representations which are a natural element of the human perception system. +We propose a training strategy which enables a 3D LiDAR semantic segmentation +model to learn structural relationships between the different classes through +abstraction. We achieve this by implicitly modeling those relationships through +a learning rule for hierarchical multi-label classification (HMC). With a +detailed analysis, we show how this training strategy not only improves the +model's confidence calibration but also preserves additional information for +downstream tasks like fusion, prediction and planning.
+
+ comment: submitted to IROS 2024 +
+
+
+
+
+ + ☆ DreamView: Injecting View-specific Text Guidance into Text-to-3D + Generation + + +
+ Text-to-3D generation, which synthesizes 3D assets according to an overall +text description, has significantly progressed. However, a challenge arises +when the specific appearances need customizing at designated viewpoints but +referring solely to the overall description for generating 3D objects. For +instance, ambiguity easily occurs when producing a T-shirt with distinct +patterns on its front and back using a single overall text guidance. In this +work, we propose DreamView, a text-to-image approach enabling multi-view +customization while maintaining overall consistency by adaptively injecting the +view-specific and overall text guidance through a collaborative text guidance +injection module, which can also be lifted to 3D generation via score +distillation sampling. DreamView is trained with large-scale rendered +multi-view images and their corresponding view-specific texts to learn to +balance the separate content manipulation in each view and the global +consistency of the overall object, resulting in a dual achievement of +customization and consistency. Consequently, DreamView empowers artists to +design 3D objects creatively, fostering the creation of more innovative and +diverse 3D assets. Code and model will be released at +https://github.com/iSEE-Laboratory/DreamView. + +
+
+
+
+
+ + ☆ Revising Densification in Gaussian Splatting + + +
+ In this paper, we address the limitations of Adaptive Density Control (ADC) +in 3D Gaussian Splatting (3DGS), a scene representation method achieving +high-quality, photorealistic results for novel view synthesis. ADC has been +introduced for automatic 3D point primitive management, controlling +densification and pruning, however, with certain limitations in the +densification logic. Our main contribution is a more principled, pixel-error +driven formulation for density control in 3DGS, leveraging an auxiliary, +per-pixel error function as the criterion for densification. We further +introduce a mechanism to control the total number of primitives generated per +scene and correct a bias in the current opacity handling strategy of ADC during +cloning operations. Our approach leads to consistent quality improvements +across a variety of benchmark scenes, without sacrificing the method's +efficiency. + +
+
+
+
+
+ + ☆ Hash3D: Training-free Acceleration for 3D Generation + + +
+ The evolution of 3D generative modeling has been notably propelled by the +adoption of 2D diffusion models. Despite this progress, the cumbersome +optimization process per se presents a critical hurdle to efficiency. In this +paper, we introduce Hash3D, a universal acceleration for 3D generation without +model training. Central to Hash3D is the insight that feature-map redundancy is +prevalent in images rendered from camera positions and diffusion time-steps in +close proximity. By effectively hashing and reusing these feature maps across +neighboring timesteps and camera angles, Hash3D substantially prevents +redundant calculations, thus accelerating the diffusion model's inference in 3D +generation tasks. We achieve this through an adaptive grid-based hashing. +Surprisingly, this feature-sharing mechanism not only speeds up the generation +but also enhances the smoothness and view consistency of the synthesized 3D +objects. Our experiments, covering 5 text-to-3D and 3 image-to-3D models, +demonstrate Hash3D's versatility in speeding up optimization, enhancing +efficiency by 1.3 to 4 times. Additionally, Hash3D's integration with 3D Gaussian +splatting largely speeds up 3D model creation, reducing text-to-3D processing +to about 10 minutes and image-to-3D conversion to roughly 30 seconds. The +project page is at https://adamdad.github.io/hash3D/.
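+ A toy sketch of the reuse idea, assuming a hypothetical `compute_fn` that runs the expensive diffusion feature extraction: quantize (azimuth, elevation, timestep) onto a grid and return a cached feature map when a nearby key has already been computed. Hash3D's actual adaptive grid and feature blending are more involved.
+
+import math
+
+class FeatureHashCache:
+    """Reuse expensive feature maps computed at nearby camera poses / timesteps."""
+
+    def __init__(self, angle_bin_deg=5.0, step_bin=10):
+        self.angle_bin = angle_bin_deg
+        self.step_bin = step_bin
+        self.cache = {}
+        self.hits = 0
+        self.misses = 0
+
+    def _key(self, azimuth_deg, elevation_deg, timestep):
+        # Nearby views / timesteps fall into the same grid cell.
+        return (math.floor(azimuth_deg / self.angle_bin),
+                math.floor(elevation_deg / self.angle_bin),
+                timestep // self.step_bin)
+
+    def get_or_compute(self, azimuth_deg, elevation_deg, timestep, compute_fn):
+        key = self._key(azimuth_deg, elevation_deg, timestep)
+        if key in self.cache:
+            self.hits += 1
+            return self.cache[key]
+        self.misses += 1
+        value = compute_fn()          # expensive: e.g. run the diffusion U-Net features
+        self.cache[key] = value
+        return value
+
+cache = FeatureHashCache()
+feat = cache.get_or_compute(31.0, 10.0, 487, compute_fn=lambda: "unet-features")
+feat = cache.get_or_compute(33.5, 11.2, 482, compute_fn=lambda: "unet-features")  # cache hit
+print(cache.hits, cache.misses)  # -> 1 1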
+
+ comment: https://adamdad.github.io/hash3D/ +
+
+
+
+
+ + ☆ Using Few-Shot Learning to Classify Primary Lung Cancer and Other + Malignancy with Lung Metastasis in Cytological Imaging via Endobronchial + Ultrasound Procedures + + +
+ This study aims to establish a computer-aided diagnosis system for +endobronchial ultrasound (EBUS) surgery to assist physicians in the preliminary +diagnosis of metastatic cancer. This involves arranging immediate examinations +for other sites of metastatic cancer after EBUS surgery, eliminating the need +to wait for reports, thereby shortening the waiting time by more than half and +enabling patients to detect other cancers earlier, allowing for early planning +and implementation of treatment plans. Unlike previous studies on cell image +classification, which have abundant datasets for training, this study must also +be able to make effective classifications despite the limited amount of case +data for lung metastatic cancer. Among small-dataset classification +methods, few-shot learning (FSL) has become mainstream in recent years. Through +its ability to train on small datasets and its strong generalization +capabilities, FSL shows potential for this task of lung metastatic cell image +classification. This study adopts a few-shot learning approach, +referencing existing models and designing a model architecture for +classifying lung metastasis cell images. Batch Spectral Regularization (BSR) +is incorporated as a loss update parameter, and the Finetune method of PMF +is modified. In terms of test results, the addition of BSR and the +modified Finetune method further increases the accuracy by 8.89% to 65.60%, +outperforming other FSL methods. This study confirms that FSL is superior to +supervised and transfer learning in classifying metastatic cancer and +demonstrates that using BSR as a loss function and modifying Finetune can +enhance the model's capabilities.
+
+
+
+
+ + ☆ LIPT: Latency-aware Image Processing Transformer + + +
+ The Transformer is leading a trend in the field of image processing. Despite the +great success that existing lightweight image processing transformers have +achieved, they are tailored to FLOPs or parameter reduction, rather than +practical inference acceleration. In this paper, we present a latency-aware +image processing transformer, termed LIPT. We devise the low-latency proportion +LIPT block that substitutes memory-intensive operators with the combination of +self-attention and convolutions to achieve practical speedup. Specifically, we +propose a novel non-volatile sparse masking self-attention (NVSM-SA) that +utilizes a pre-computed sparse mask to capture contextual information from a +larger window with no extra computation overhead. Besides, a high-frequency +reparameterization module (HRM) is proposed to make the LIPT block +reparameterization-friendly, which improves the model's detail reconstruction +capability. Extensive experiments on multiple image processing tasks (e.g., +image super-resolution (SR), JPEG artifact reduction, and image denoising) +demonstrate the superiority of LIPT on both latency and PSNR. LIPT achieves +real-time GPU inference with state-of-the-art performance on multiple image SR +benchmarks.
+
+
+
+
+ + ☆ Unified Entropy Optimization for Open-Set Test-Time Adaptation CVPR 2024 + + +
+ Test-time adaptation (TTA) aims at adapting a model pre-trained on the +labeled source domain to the unlabeled target domain. Existing methods usually +focus on improving TTA performance under covariate shifts, while neglecting +semantic shifts. In this paper, we delve into a realistic open-set TTA setting +where the target domain may contain samples from unknown classes. Many +state-of-the-art closed-set TTA methods perform poorly when applied to open-set +scenarios, which can be attributed to the inaccurate estimation of data +distribution and model confidence. To address these issues, we propose a simple +but effective framework called unified entropy optimization (UniEnt), which is +capable of simultaneously adapting to covariate-shifted in-distribution (csID) +data and detecting covariate-shifted out-of-distribution (csOOD) data. +Specifically, UniEnt first mines pseudo-csID and pseudo-csOOD samples from test +data, followed by entropy minimization on the pseudo-csID data and entropy +maximization on the pseudo-csOOD data. Furthermore, we introduce UniEnt+ to +alleviate the noise caused by hard data partition leveraging sample-level +confidence. Extensive experiments on CIFAR benchmarks and Tiny-ImageNet-C show +the superiority of our framework. The code is available at +https://github.com/gaozhengqing/UniEnt + +
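+ A minimal sketch of the entropy-based objective described above, with an assumed normalized-entropy threshold standing in for UniEnt's actual pseudo-csID/csOOD mining: entropy is minimized on pseudo-in-distribution samples and maximized on pseudo-out-of-distribution samples.
+
+import torch
+import torch.nn.functional as F
+
+def unified_entropy_loss(logits, ood_threshold=0.5):
+    # logits: (B, C) test-time predictions from the adapted model.
+    probs = F.softmax(logits, dim=1)
+    log_probs = F.log_softmax(logits, dim=1)
+    entropy = -(probs * log_probs).sum(dim=1)               # per-sample entropy
+    max_entropy = torch.log(torch.tensor(float(logits.size(1))))
+    score = entropy / max_entropy                            # normalized to [0, 1]
+    is_ood = score > ood_threshold                           # crude pseudo-csOOD split
+    loss_id = entropy[~is_ood].mean() if (~is_ood).any() else logits.sum() * 0
+    loss_ood = entropy[is_ood].mean() if is_ood.any() else logits.sum() * 0
+    # Minimize entropy on pseudo-ID samples, maximize it on pseudo-OOD samples.
+    return loss_id - loss_ood
+
+logits = torch.randn(8, 10, requires_grad=True)
+unified_entropy_loss(logits).backward()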
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Unified Multi-modal Diagnostic Framework with Reconstruction + Pre-training and Heterogeneity-combat Tuning + + +
+ Medical multi-modal pre-training has revealed promise in computer-aided +diagnosis by leveraging large-scale unlabeled datasets. However, existing +methods based on masked autoencoders mainly rely on data-level reconstruction +tasks, but lack high-level semantic information. Furthermore, two significant +heterogeneity challenges hinder the transfer of pre-trained knowledge to +downstream tasks, \textit{i.e.}, the distribution heterogeneity between +pre-training data and downstream data, and the modality heterogeneity within +downstream data. To address these challenges, we propose a Unified Medical +Multi-modal Diagnostic (UMD) framework with tailored pre-training and +downstream tuning strategies. Specifically, to enhance the representation +abilities of vision and language encoders, we propose the Multi-level +Reconstruction Pre-training (MR-Pretrain) strategy, including a feature-level +and data-level reconstruction, which guides models to capture the semantic +information from masked inputs of different modalities. Moreover, to tackle two +kinds of heterogeneities during the downstream tuning, we present the +heterogeneity-combat downstream tuning strategy, which consists of a +Task-oriented Distribution Calibration (TD-Calib) and a Gradient-guided +Modality Coordination (GM-Coord). In particular, TD-Calib fine-tunes the +pre-trained model regarding the distribution of downstream datasets, and +GM-Coord adjusts the gradient weights according to the dynamic optimization +status of different modalities. Extensive experiments on five public medical +datasets demonstrate the effectiveness of our UMD framework, which remarkably +outperforms existing approaches on three kinds of downstream tasks. + +
+
+ comment: to be published in IEEE JBHI; Code available at + https://github.com/helenypzhang/UMD +
+
+
+
+
+ + ☆ Incremental Joint Learning of Depth, Pose and Implicit Scene + Representation on Monocular Camera in Large-scale Scenes + + +
+ Dense scene reconstruction for photo-realistic view synthesis has various +applications, such as VR/AR and autonomous vehicles. However, most existing +methods have difficulties in large-scale scenes due to three core challenges: +\textit{(a) inaccurate depth input.} Accurate depth input is impossible to +obtain in real-world large-scale scenes. \textit{(b) inaccurate pose estimation.} Most +existing approaches rely on accurate pre-estimated camera poses. \textit{(c) +insufficient scene representation capability.} A single global radiance field +lacks the capacity to effectively scale to large-scale scenes. To this end, we +propose an incremental joint learning framework, which can achieve accurate +depth, pose estimation, and large-scale scene reconstruction. A vision +transformer-based network is adopted as the backbone to enhance performance in +scale information estimation. For pose estimation, a feature-metric bundle +adjustment (FBA) method is designed for accurate and robust camera tracking in +large-scale scenes. In terms of implicit scene representation, we propose an +incremental scene representation method to construct the entire large-scale +scene as multiple local radiance fields to enhance the scalability of 3D scene +representation. Extensive experiments have been conducted to demonstrate the +effectiveness and accuracy of our method in depth estimation, pose estimation, +and large-scale scene reconstruction.
+
+
+
+
+ + ☆ Object Dynamics Modeling with Hierarchical Point Cloud-based + Representations CVPR 2024 + + +
+ Modeling object dynamics with a neural network is an important problem with +numerous applications. Most recent work has been based on graph neural +networks. However, physics happens in 3D space, where geometric information +potentially plays an important role in modeling physical phenomena. In this +work, we propose a novel U-net architecture based on continuous point +convolution which naturally embeds information from 3D coordinates and allows +for multi-scale feature representations with established downsampling and +upsampling procedures. Bottleneck layers in the downsampled point clouds lead +to better long-range interaction modeling. Besides, the flexibility of point +convolutions allows our approach to generalize to sparsely sampled points from +mesh vertices and dynamically generate features on important interaction points +on mesh faces. Experimental results demonstrate that our approach significantly +improves the state-of-the-art, especially in scenarios that require accurate +gravity or collision reasoning. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Space-Time Video Super-resolution with Neural Operator + + +
+ This paper addresses the task of space-time video super-resolution (ST-VSR). +Existing methods generally suffer from inaccurate motion estimation and motion +compensation (MEMC) problems for large motions. Inspired by recent progress in +physics-informed neural networks, we model the challenges of MEMC in ST-VSR as +a mapping between two continuous function spaces. Specifically, our approach +transforms independent low-resolution representations in the coarse-grained +continuous function space into refined representations with enriched +spatiotemporal details in the fine-grained continuous function space. To +achieve efficient and accurate MEMC, we design a Galerkin-type attention +function to perform frame alignment and temporal interpolation. Due to the +linear complexity of the Galerkin-type attention mechanism, our model avoids +patch partitioning and offers global receptive fields, enabling precise +estimation of large motions. The experimental results show that the proposed +method surpasses state-of-the-art techniques in both fixed-size and continuous +space-time video super-resolution tasks. + +
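+ A minimal softmax-free attention block of the Galerkin type (layer-normalize keys and values, contract keys with values first), which has cost linear in the sequence length; the paper's alignment and temporal-interpolation modules build on this kind of operator but are not reproduced here.
+
+import torch
+import torch.nn as nn
+
+class GalerkinAttention(nn.Module):
+    # Softmax-free attention: out = Q (norm(K)^T norm(V)) / n, linear in sequence length n.
+    def __init__(self, dim):
+        super().__init__()
+        self.q = nn.Linear(dim, dim)
+        self.k = nn.Linear(dim, dim)
+        self.v = nn.Linear(dim, dim)
+        self.norm_k = nn.LayerNorm(dim)
+        self.norm_v = nn.LayerNorm(dim)
+
+    def forward(self, x):                              # x: (B, N, D)
+        q, k, v = self.q(x), self.norm_k(self.k(x)), self.norm_v(self.v(x))
+        context = k.transpose(1, 2) @ v / x.size(1)    # (B, D, D): costs O(N * D^2)
+        return q @ context                             # (B, N, D)
+
+attn = GalerkinAttention(64)
+print(attn(torch.randn(2, 4096, 64)).shape)            # torch.Size([2, 4096, 64])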
+
+
+
+
+ + ☆ Little Strokes Fell Great Oaks: Boosting the Hierarchical Features for + Multi-exposure Image Fusion + + +
+ In recent years, deep learning networks have made remarkable strides in the +domain of multi-exposure image fusion. Nonetheless, prevailing approaches often +involve directly feeding over-exposed and under-exposed images into the +network, which leads to the under-utilization of inherent information present +in the source images. Additionally, unsupervised techniques predominantly +employ rudimentary weighted summation for color channel processing, culminating +in an overall desaturated final image tone. To partially mitigate these issues, +this study proposes a gamma correction module specifically designed to fully +leverage latent information embedded within source images. Furthermore, a +modified transformer block equipped with self-attention mechanisms is +introduced to optimize the fusion process. Ultimately, a novel color +enhancement algorithm is presented to augment image saturation while preserving +intricate details. The source code is available at +https://github.com/ZhiyingDu/BHFMEF.
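+ For reference, plain gamma correction on exposure-normalized inputs looks like the sketch below; the module proposed in the paper is presumably learnable, so the fixed gamma values here are only illustrative.
+
+import numpy as np
+
+def gamma_correct(img, gamma):
+    # img: float array in [0, 1]; gamma < 1 brightens (useful for under-exposed
+    # inputs), gamma > 1 darkens (useful for over-exposed inputs).
+    return np.clip(img, 0.0, 1.0) ** gamma
+
+under_exposed = np.random.rand(256, 256, 3) * 0.3
+over_exposed = 1.0 - np.random.rand(256, 256, 3) * 0.3
+inputs = [gamma_correct(under_exposed, 0.5), gamma_correct(over_exposed, 2.0)]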
+
+
+
+
+ + ☆ Improving Facial Landmark Detection Accuracy and Efficiency with + Knowledge Distillation + + +
+ The domain of computer vision has experienced significant advancements in +facial-landmark detection, becoming increasingly essential across various +applications such as augmented reality, facial recognition, and emotion +analysis. Unlike object detection or semantic segmentation, which focus on +identifying objects and outlining boundaries, facial-landmark detection aims to +precisely locate and track critical facial features. However, deploying deep +learning-based facial-landmark detection models on embedded systems with +limited computational resources poses challenges due to the complexity of +facial features, especially in dynamic settings. Additionally, ensuring +robustness across diverse ethnicities and expressions presents further +obstacles. Existing datasets often lack comprehensive representation of facial +nuances, particularly within populations like those in Taiwan. This paper +introduces a novel approach to address these challenges through the development +of a knowledge distillation method. By transferring knowledge from larger +models to smaller ones, we aim to create lightweight yet powerful deep learning +models tailored specifically for facial-landmark detection tasks. Our goal is +to design models capable of accurately locating facial landmarks under varying +conditions, including diverse expressions, orientations, and lighting +environments. The ultimate objective is to achieve high accuracy and real-time +performance suitable for deployment on embedded systems. This method was +successfully implemented and achieved a 6th-place finish out of 165 +participants in the IEEE ICME 2024 PAIR competition.
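+ A minimal sketch of response-based knowledge distillation for landmark heatmap regression, assuming heatmap outputs of shape (B, K, H, W); the paper's exact losses, weighting, and any feature-level distillation may differ.
+
+import torch
+import torch.nn.functional as F
+
+def distillation_loss(student_heatmaps, teacher_heatmaps, gt_heatmaps, alpha=0.5):
+    # Heatmaps: (B, K, H, W), one channel per facial landmark.
+    # Supervised term against ground truth plus a soft term imitating the teacher.
+    task_loss = F.mse_loss(student_heatmaps, gt_heatmaps)
+    distill_loss = F.mse_loss(student_heatmaps, teacher_heatmaps.detach())
+    return alpha * task_loss + (1.0 - alpha) * distill_loss
+
+student = torch.randn(4, 68, 64, 64, requires_grad=True)
+teacher = torch.randn(4, 68, 64, 64)
+gt = torch.randn(4, 68, 64, 64)
+distillation_loss(student, teacher, gt).backward()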
+
+ comment: technical report. 6th/165 in IEEE ICME 2024 PAIR competition +
+
+
+
+
+ + ☆ Greedy-DiM: Greedy Algorithms for Unreasonably Effective Face Morphs + + +
+ Morphing attacks, which aim to create a single image that contains the +biometric information of multiple identities, are an emerging threat to +state-of-the-art Face Recognition (FR) systems. Diffusion Morphs (DiM) are a recently +proposed morphing attack that has achieved state-of-the-art performance for +representation-based morphing attacks. However, none of the existing research +on DiMs has leveraged their iterative nature, leaving the DiM model as a +black box, treating it no differently than one would a Generative Adversarial +Network (GAN) or Variational AutoEncoder (VAE). We propose a greedy strategy on +the iterative sampling process of DiM models which searches for an optimal step +guided by an identity-based heuristic function. We compare our proposed +algorithm against ten other state-of-the-art morphing algorithms using the +open-source SYN-MAD 2022 competition dataset. We find that our proposed +algorithm is unreasonably effective, fooling all of the tested FR systems with +an MMPMR of 100% and outperforming all other morphing algorithms in the +comparison.
+
+ comment: Initial preprint. Under review +
+
+
+
+
+ + ☆ Band-Attention Modulated RetNet for Face Forgery Detection + + +
+ The transformer networks are extensively utilized in face forgery detection +due to their scalability across large datasets. Despite their success, +transformers face challenges in balancing the capture of global context, which +is crucial for unveiling forgery clues, with computational complexity. To +mitigate this issue, we introduce Band-Attention modulated RetNet (BAR-Net), a +lightweight network designed to efficiently process extensive visual contexts +while avoiding catastrophic forgetting. Our approach empowers the target token +to perceive global information by assigning differential attention levels to +tokens at varying distances. We implement self-attention along both spatial +axes, thereby maintaining spatial priors and easing the computational +burden. Moreover, we present the adaptive frequency Band-Attention Modulation +mechanism, which treats the entire Discrete Cosine Transform spectrogram as a +series of frequency bands with learnable weights. Together, BAR-Net achieves +favorable performance on several face forgery datasets, outperforming current +state-of-the-art methods.
+
+
+
+
+ + ☆ Diffusion-Based Point Cloud Super-Resolution for mmWave Radar Data + + +
+ The millimeter-wave radar sensor maintains stable performance under adverse +environmental conditions, making it a promising solution for all-weather +perception tasks, such as outdoor mobile robotics. However, radar point +clouds are relatively sparse and contain a massive number of ghost points, which +greatly limits the development of mmWave radar technology. In this paper, we propose a +novel point cloud super-resolution approach for 3D mmWave radar data, named +Radar-diffusion. Our approach employs the diffusion model defined by +mean-reverting stochastic differential equations (SDEs). Using our proposed new +objective function with supervision from corresponding LiDAR point clouds, our +approach efficiently handles radar ghost points and enhances the sparse mmWave +radar point clouds to dense LiDAR-like point clouds. We evaluate our approach +on two different datasets, and the experimental results show that our method +outperforms the state-of-the-art baseline methods in 3D radar super-resolution +tasks. Furthermore, we demonstrate that our enhanced radar point clouds are +capable of supporting downstream radar point-based registration tasks.
+
+
+
+
+ + ☆ Concept-Attention Whitening for Interpretable Skin Lesion Diagnosis + + +
+ The black-box nature of deep learning models has raised concerns about their +interpretability for successful deployment in real-world clinical applications. +To address the concerns, eXplainable Artificial Intelligence (XAI) aims to +provide clear and understandable explanations of the decision-making process. +In the medical domain, concepts such as attributes of lesions or abnormalities +serve as key evidence for deriving diagnostic results. However, existing +concept-based models mainly depend on concepts that appear independently and +require fine-grained concept annotations such as bounding boxes. A medical +image usually contains multiple concepts and the fine-grained concept +annotations are difficult to acquire. In this paper, we propose a novel +Concept-Attention Whitening (CAW) framework for interpretable skin lesion +diagnosis. CAW is comprised of a disease diagnosis branch and a concept +alignment branch. In the former branch, we train the CNN with a CAW layer +inserted to perform skin lesion diagnosis. The CAW layer decorrelates features +and aligns image features to conceptual meanings via an orthogonal matrix. In +the latter branch, we calculate the orthogonal matrix under the guidance of the +concept attention mask. We particularly introduce a weakly-supervised concept +mask generator that only leverages coarse concept labels for filtering local +regions that are relevant to certain concepts, improving the optimization of +the orthogonal matrix. Extensive experiments on two public skin lesion +diagnosis datasets demonstrated that CAW not only enhanced interpretability but +also maintained a state-of-the-art diagnostic performance. + +
+
+
+
+
+ + ☆ A Lightweight Measure of Classification Difficulty from Application + Dataset Characteristics + + +
+ Despite accuracy and computation benchmarks being widely available to help +choose among neural network models, these are usually trained on datasets with +many classes, and do not give a precise idea of performance for applications of +few (< 10) classes. The conventional procedure to predict performance is to +train and test repeatedly on the different models and dataset variations of +interest. However, this is computationally expensive. We propose an efficient +classification difficulty measure that is calculated from the number of classes +and intra- and inter-class similarity metrics of the dataset. After a single +stage of training and testing per model family, relative performance for +different datasets and models of the same family can be predicted by comparing +difficulty measures - without further training and testing. We show how this +measure can help a practitioner select a computationally efficient model for a +small dataset 6 to 29x faster than through repeated training and testing. We +give an example of use of the measure for an industrial application in which +options are identified to select a model 42% smaller than the baseline +YOLOv5-nano model, and if class merging from 3 to 2 classes meets requirements, +85% smaller. + +
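+ The abstract does not give the exact formula, so the sketch below is one purely illustrative instantiation of such a measure: difficulty grows with the number of classes and with how close inter-class similarity is to intra-class similarity, computed from class-mean embeddings.
+
+import numpy as np
+
+def difficulty_measure(embeddings, labels):
+    # embeddings: (N, D) feature vectors; labels: (N,) integer class ids.
+    # Illustrative only: difficulty rises with the number of classes and with
+    # inter-class similarity approaching intra-class similarity.
+    classes = np.unique(labels)
+    means = {}
+    for c in classes:
+        m = embeddings[labels == c].mean(axis=0)
+        means[c] = m / (np.linalg.norm(m) + 1e-8)
+    def cos(a, b):
+        return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8))
+    intra = [np.mean([cos(x, means[c]) for x in embeddings[labels == c]]) for c in classes]
+    inter = [cos(means[a], means[b]) for i, a in enumerate(classes) for b in classes[i + 1:]]
+    return np.log(len(classes)) * np.mean(inter) / (np.mean(intra) + 1e-8)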
+
+ comment: 13 pages, 3 figures +
+
+
+
+
+ + ☆ Tackling Structural Hallucination in Image Translation with Local + Diffusion + + +
+ Recent developments in diffusion models have advanced conditioned image +generation, yet they struggle with reconstructing out-of-distribution (OOD) +images, such as unseen tumors in medical images, causing ``image +hallucination'' and risking misdiagnosis. We hypothesize such hallucinations +result from local OOD regions in the conditional images. We verify that +partitioning the OOD region and conducting separate image generations +alleviates hallucinations in several applications. From this, we propose a +training-free diffusion framework that reduces hallucination with multiple +Local Diffusion processes. Our approach involves OOD estimation followed by two +modules: a ``branching'' module generates locally both within and outside OOD +regions, and a ``fusion'' module integrates these predictions into one. Our +evaluation shows our method mitigates hallucination over baseline models +quantitatively and qualitatively, reducing misdiagnosis by 40% and 25% in the +real-world medical and natural image datasets, respectively. It also +demonstrates compatibility with various pre-trained diffusion models. + +
+
+
+
+
+ + ☆ StoryImager: A Unified and Efficient Framework for Coherent Story + Visualization and Completion + + +
+ Story visualization aims to generate a series of realistic and coherent +images based on a storyline. Current models adopt a frame-by-frame architecture +by transforming the pre-trained text-to-image model into an auto-regressive +manner. Although these models have shown notable progress, there are still +three flaws. 1) The unidirectional generation of auto-regressive manner +restricts the usability in many scenarios. 2) The additional introduced story +history encoders bring an extremely high computational cost. 3) The story +visualization and continuation models are trained and inferred independently, +which is not user-friendly. To these ends, we propose a bidirectional, unified, +and efficient framework, namely StoryImager. The StoryImager enhances the +storyboard generative ability inherited from the pre-trained text-to-image +model for a bidirectional generation. Specifically, we introduce a Target Frame +Masking Strategy to extend and unify different story image generation tasks. +Furthermore, we propose a Frame-Story Cross Attention Module that decomposes +the cross attention for local fidelity and global coherence. Moreover, we +design a Contextual Feature Extractor to extract contextual information from +the whole storyline. The extensive experimental results demonstrate the +excellent performance of our StoryImager. The code is available at +https://github.com/tobran/StoryImager. + +
+
+ comment: 17 pages +
+
+
+
+
+ + ☆ JSTR: Judgment Improves Scene Text Recognition + + +
+ In this paper, we present a method for enhancing the accuracy of scene text +recognition tasks by judging whether the image and text match each other. While +previous studies focused on generating the recognition results from input +images, our approach also considers the model's misrecognition results to +understand its error tendencies, thus improving the text recognition pipeline. +This method boosts text recognition accuracy by providing explicit feedback on +the data that the model is likely to misrecognize by predicting correct or +incorrect between the image and text. The experimental results on publicly +available datasets demonstrate that our proposed method outperforms the +baseline and state-of-the-art methods in scene text recognition. + +
+
+ comment: IntelliSys 2024 +
+
+
+
+
+ + ☆ EasyTrack: Efficient and Compact One-stream 3D Point Clouds Tracker + + +
+ Most 3D single object trackers (SOT) in point clouds follow the two-stream +multi-stage 3D Siamese or motion tracking paradigms, which process the template +and search area point clouds with two parallel branches, built on supervised +point cloud backbones. In this work, going beyond typical 3D Siamese or motion +tracking, we propose a neat and compact one-stream transformer 3D SOT paradigm +from a novel perspective, termed \textbf{EasyTrack}, which consists of +three special designs: 1) A 3D point cloud tracking feature pre-training +module is developed to exploit masked autoencoding for learning 3D point +cloud tracking representations. 2) A unified 3D tracking feature learning and +fusion network is proposed to simultaneously learn target-aware 3D features +and extensively capture mutual correlation through the flexible self-attention +mechanism. 3) A target location network in the dense bird's eye view (BEV) +feature space is constructed for target classification and regression. +Moreover, we develop an enhanced version named EasyTrack++, which designs the +center points interaction (CPI) strategy to reduce the ambiguous targets caused +by the noisy point cloud background information. The proposed EasyTrack and +EasyTrack++ set a new state-of-the-art performance ($\textbf{18\%}$, +$\textbf{40\%}$ and $\textbf{3\%}$ success gains) on KITTI, NuScenes, and Waymo +while running at \textbf{52.6fps} with few parameters (\textbf{1.3M}). The code +will be available at https://github.com/KnightApple427/Easytrack.
+
+
+
+
+ + ☆ Prompt-driven Universal Model for View-Agnostic Echocardiography + Analysis + + +
+ Echocardiography segmentation for cardiac analysis is time-consuming and +resource-intensive due to the variability in image quality and the necessity to +process scans from various standard views. While current automated segmentation +methods in echocardiography show promising performance, they are trained on +specific scan views to analyze corresponding data. However, this solution has a +limitation, as the number of required models increases with the number of +standard views. To address this, in this paper, we present a prompt-driven +universal method for view-agnostic echocardiography analysis. Considering the +domain shift between standard views, we first introduce a method called prompt +matching, aimed at learning prompts specific to different views by matching +prompts and querying input embeddings using a pre-trained vision model. Then, +we utilize a pre-trained medical language model to align textual information +with pixel data for accurate segmentation. Extensive experiments on three +standard views showed that our approach significantly outperforms +state-of-the-art universal methods and achieves comparable or even better +performance than segmentation models trained and tested on the same views.
+
+
+
+
+ + ☆ LATUP-Net: A Lightweight 3D Attention U-Net with Parallel Convolutions + for Brain Tumor Segmentation + + +
+ Early-stage 3D brain tumor segmentation from magnetic resonance imaging (MRI) +scans is crucial for prompt and effective treatment. However, this process +faces the challenge of precise delineation due to the tumors' complex +heterogeneity. Moreover, energy sustainability targets and resource +limitations, especially in developing countries, require efficient and +accessible medical imaging solutions. The proposed architecture, a Lightweight +3D ATtention U-Net with Parallel convolutions, LATUP-Net, addresses these +issues. It is specifically designed to reduce computational requirements +significantly while maintaining high segmentation performance. By incorporating +parallel convolutions, it enhances feature representation by capturing +multi-scale information. It further integrates an attention mechanism to refine +segmentation through selective feature recalibration. LATUP-Net achieves +promising segmentation performance: the average Dice scores for the whole +tumor, tumor core, and enhancing tumor on the BraTS2020 dataset are 88.41%, +83.82%, and 73.67%, and on the BraTS2021 dataset, they are 90.29%, 89.54%, and +83.92%, respectively. Hausdorff distance metrics further indicate its improved +ability to delineate tumor boundaries. With its significantly reduced +computational demand using only 3.07 M parameters, about 59 times fewer than +other state-of-the-art models, and running on a single V100 GPU, LATUP-Net +stands out as a promising solution for real-world clinical applications, +particularly in settings with limited resources. Investigations into the +model's interpretability, utilizing gradient-weighted class activation mapping +and confusion matrices, reveal that while attention mechanisms enhance the +segmentation of small regions, their impact is nuanced. Achieving the most +accurate tumor delineation requires carefully balancing local and global +features. + +
+
+
+
+
+ + ☆ Res-U2Net: Untrained Deep Learning for Phase Retrieval and Image + Reconstruction + + +
+ Conventional deep learning-based image reconstruction methods require a large +amount of training data which can be hard to obtain in practice. Untrained deep +learning methods overcome this limitation by training a network to invert a +physical model of the image formation process. Here we present a novel +untrained Res-U2Net model for phase retrieval. We use the extracted phase +information to determine changes in an object's surface and generate a mesh +representation of its 3D structure. We compare the performance of Res-U2Net +phase retrieval against UNet and U2Net using images from the GDXRAY dataset. + +
+
+ comment: 16 pages, 8 figures, 4 Tables +
+
+
+
+
+ + ☆ FlameFinder: Illuminating Obscured Fire through Smoke with Attentive + Deep Metric Learning + + +
+ FlameFinder is a deep metric learning (DML) framework designed to accurately +detect flames, even when obscured by smoke, using thermal images from +firefighter drones during wildfire monitoring. Traditional RGB cameras struggle +in such conditions, but thermal cameras can capture smoke-obscured flame +features. However, they lack absolute thermal reference points, leading to +false positives. To address this issue, FlameFinder utilizes paired thermal-RGB +images for training. By learning latent flame features from smoke-free samples, +the model becomes less biased towards relative thermal gradients. In testing, +it identifies flames in smoky patches by analyzing their equivalent +thermal-domain distribution. This method improves performance using both +supervised and distance-based clustering metrics. The framework incorporates a +flame segmentation method and a DML-aided detection framework. This includes +utilizing center loss (CL), triplet center loss (TCL), and triplet cosine +center loss (TCCL) to identify optimal cluster representatives for +classification. However, the dominance of center loss over the other losses +leads to the model missing features sensitive to them. To address this +limitation, an attention mechanism is proposed. This mechanism allows for +non-uniform feature contribution, amplifying the critical role of cosine and +triplet loss in the DML framework. Additionally, it improves interpretability, +class discrimination, and decreases intra-class variance. As a result, the +proposed model surpasses the baseline by 4.4% in the FLAME2 dataset and 7% in +the FLAME3 dataset for unobscured flame detection accuracy. Moreover, it +demonstrates enhanced class separation in obscured scenarios compared to VGG19, +ResNet18, and three backbone models tailored for flame detection.
+
+ comment: Submitted as a Journal Paper to IEEE Transactions on Geoscience and + Remote Sensing +
+
+
+
+
+ + ☆ SAM-I-Am: Semantic Boosting for Zero-shot Atomic-Scale Electron + Micrograph Segmentation + + +
+ Image segmentation is a critical enabler for tasks ranging from medical +diagnostics to autonomous driving. However, the correct segmentation semantics +- where are boundaries located? what segments are logically similar? - change +depending on the domain, such that state-of-the-art foundation models can +generate meaningless and incorrect results. Moreover, in certain domains, +fine-tuning and retraining techniques are infeasible: obtaining labels is +costly and time-consuming; domain images (micrographs) can be exponentially +diverse; and data sharing (for third-party retraining) is restricted. To enable +rapid adaptation of the best segmentation technology, we propose the concept of +semantic boosting: given a zero-shot foundation model, guide its segmentation +and adjust results to match domain expectations. We apply semantic boosting to +the Segment Anything Model (SAM) to obtain microstructure segmentation for +transmission electron microscopy. Our booster, SAM-I-Am, extracts geometric and +textural features of various intermediate masks to perform mask removal and +mask merging operations. We demonstrate a zero-shot performance increase of +(absolute) +21.35%, +12.6%, +5.27% in mean IoU, and a -9.91%, -18.42%, -4.06% +drop in mean false positive masks across images of three difficulty classes +over vanilla SAM (ViT-L). + +
+
+
+
+
+ + ☆ GeoSynth: Contextually-Aware High-Resolution Satellite Image Synthesis + + +
+ We present GeoSynth, a model for synthesizing satellite images with global +style and image-driven layout control. The global style control is via textual +prompts or geographic location. These enable the specification of scene +semantics or regional appearance respectively, and can be used together. We +train our model on a large dataset of paired satellite imagery, with +automatically generated captions, and OpenStreetMap data. We evaluate various +combinations of control inputs, including different types of layout controls. +Results demonstrate that our model can generate diverse, high-quality images +and exhibits excellent zero-shot generalization. The code and model checkpoints +are available at https://github.com/mvrl/GeoSynth. + +
+
+
+
+
+ + ☆ Calibrating Higher-Order Statistics for Few-Shot Class-Incremental + Learning with Pre-trained Vision Transformers CVPR 2024 + + +
+ Few-shot class-incremental learning (FSCIL) aims to adapt the model to new +classes from very few data (5 samples) without forgetting the previously +learned classes. Recent works in many-shot CIL (MSCIL) (using all available +training data) exploited pre-trained models to reduce forgetting and achieve +better plasticity. In a similar fashion, we use ViT models pre-trained on +large-scale datasets for few-shot settings, which face the critical issue of +low plasticity. FSCIL methods start with a many-shot first task to learn a very +good feature extractor and then move to the few-shot setting from the second +task onwards. While the focus of most recent studies is on how to learn the +many-shot first task so that the model generalizes to all future few-shot +tasks, we explore in this work how to better model the few-shot data using +pre-trained models, irrespective of how the first task is trained. Inspired by +recent works in MSCIL, we explore how using higher-order feature statistics can +influence the classification of few-shot classes. We identify the main +challenge of obtaining a good covariance matrix from few-shot data and propose +to calibrate the covariance matrix for new classes based on semantic similarity +to the many-shot base classes. Using the calibrated feature statistics in +combination with existing methods significantly improves few-shot continual +classification on several FSCIL benchmarks. Code is available at +https://github.com/dipamgoswami/FSCIL-Calibration. + +
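+ A sketch of the calibration idea in the spirit of the abstract, with assumed variable names: the noisy few-shot covariance is mixed with base-class covariances weighted by the semantic (cosine) similarity of class prototypes; the paper's exact weighting scheme may differ.
+
+import numpy as np
+
+def calibrate_covariance(few_shot_feats, base_means, base_covs, alpha=0.5, temp=0.1):
+    # few_shot_feats: (k, D) features of a new class (k is small, e.g. 5).
+    # base_means: (B, D) prototypes of the many-shot base classes.
+    # base_covs:  (B, D, D) covariance matrices of the base classes.
+    proto = few_shot_feats.mean(axis=0)
+    # Semantic similarity between the new prototype and each base prototype.
+    sims = base_means @ proto / (np.linalg.norm(base_means, axis=1) * np.linalg.norm(proto) + 1e-8)
+    w = np.exp(sims / temp)
+    w = w / w.sum()
+    transferred = np.einsum("b,bij->ij", w, base_covs)   # similarity-weighted base covariance
+    own = np.cov(few_shot_feats, rowvar=False)           # noisy few-shot estimate
+    return alpha * own + (1.0 - alpha) * transferred, proto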
+
+ comment: Accepted at CLVision workshop (CVPR 2024) +
+
+
+
+
+ + ☆ RoadBEV: Road Surface Reconstruction in Bird's Eye View + + +
+ Road surface conditions, especially geometry profiles, enormously affect +driving performance of autonomous vehicles. Vision-based online road +reconstruction promisingly captures road information in advance. Existing +solutions like monocular depth estimation and stereo matching suffer from +modest performance. The recent technique of Bird's-Eye-View (BEV) perception +provides immense potential for more reliable and accurate reconstruction. This +paper proposes two simple yet effective models for road elevation +reconstruction in BEV, named RoadBEV-mono and RoadBEV-stereo, which estimate +road elevation with monocular and stereo images, respectively. The former +directly fits elevation values based on voxel features queried from the image +view, while the latter efficiently recognizes road elevation patterns based on +a BEV volume representing the discrepancy between left and right voxel features. +Insightful analyses reveal their consistency with, and differences from, the +perspective view. Experiments on a real-world dataset verify the models' +effectiveness and superiority. The elevation errors of RoadBEV-mono and +RoadBEV-stereo reach 1.83 cm and 0.56 cm, respectively. Estimation performance +improves by 50% in BEV based on monocular images. Our models are promising for +practical applications, providing valuable references for vision-based BEV +perception in autonomous driving. The code is released at +https://github.com/ztsrxh/RoadBEV.
+
+ comment: Dataset page: https://thu-rsxd.com/rsrd Code: + https://github.com/ztsrxh/RoadBEV +
+
+
+
+
+ + ☆ Spatially Optimized Compact Deep Metric Learning Model for Similarity + Search + + +
+ Spatial optimization is often overlooked in many computer vision tasks.
+Filters should be able to recognize the features of an object regardless of
+where it is in the image. Similarity search is a crucial task in which spatial
+features play a decisive role in the output. The capacity of convolution to
+capture visual patterns across various locations is limited. In contrast to
+convolution, the involution kernel is dynamically generated at each pixel
+based on the pixel value and learned parameters. This study demonstrates that
+using a single involution layer as a feature extractor alongside a compact
+convolution model significantly enhances the performance of similarity search.
+Additionally, we improve predictions by using the GELU activation function
+rather than ReLU. The negligible number of weight parameters in involution,
+combined with a compact model that performs better, makes the approach very
+useful in real-world implementations. Our proposed model is below 1 megabyte
+in size. We have evaluated our proposed methodology and other models on the
+CIFAR-10, FashionMNIST, and MNIST datasets. Our proposed method outperforms
+the compared models across all three datasets.
+
+
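+ Because the approach hinges on swapping a convolution for an involution
+layer, a bare-bones PyTorch involution helps make the idea concrete. This
+follows the general recipe of the published involution operator; the layer
+sizes and the ReLU in the kernel-generation branch are illustrative choices,
+not necessarily the paper's exact configuration.
+
+import torch
+import torch.nn as nn
+
+class Involution2d(nn.Module):
+    """Minimal involution: a K x K kernel is generated per spatial location
+    from the local feature, then applied to the unfolded neighborhood."""
+    def __init__(self, channels, kernel_size=3, groups=4, reduction=4):
+        super().__init__()
+        self.k, self.g = kernel_size, groups
+        self.reduce = nn.Conv2d(channels, channels // reduction, 1)
+        self.span = nn.Conv2d(channels // reduction, kernel_size * kernel_size * groups, 1)
+        self.unfold = nn.Unfold(kernel_size, padding=kernel_size // 2)
+
+    def forward(self, x):
+        b, c, h, w = x.shape
+        kernel = self.span(torch.relu(self.reduce(x)))           # (B, K*K*G, H, W)
+        kernel = kernel.view(b, self.g, 1, self.k * self.k, h, w)
+        patches = self.unfold(x).view(b, self.g, c // self.g, self.k * self.k, h, w)
+        out = (kernel * patches).sum(dim=3)                      # weighted sum over each window
+        return out.view(b, c, h, w)
+
+x = torch.randn(2, 16, 32, 32)
+print(Involution2d(16)(x).shape)   # torch.Size([2, 16, 32, 32])
+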
+
+ comment: 5 pages, 3 figures
+
+
+
+
+
+ + ☆ Leveraging Latents for Efficient Thermography Classification and + Segmentation + + +
+ Breast cancer is a prominent health concern worldwide, currently being the
+second most common and second-deadliest type of cancer in women. While current
+breast cancer diagnosis mainly relies on mammography imaging, in recent years
+the use of thermography for breast cancer imaging has been gaining popularity.
+Thermographic imaging relies on infrared cameras to capture body-emitted heat
+distributions. While these heat signatures have proven useful for
+computer-vision systems performing accurate breast cancer segmentation and
+classification, prior work often relies on handcrafted feature engineering or
+complex architectures, potentially limiting the comparability and
+applicability of these methods. In this work, we present a novel algorithm for
+both breast cancer classification and segmentation. Rather than focusing
+efforts on manual feature and architecture engineering, our algorithm
+leverages an informative, learned feature space, making our solution simpler
+to use and extend to other frameworks and downstream tasks, as well as more
+applicable to data-scarce settings. Our classification produces SOTA results,
+and we are the first to produce segmentation results for the regions studied
+in this paper.
+
+
+
+
+
+
+ + ☆ MambaAD: Exploring State Space Models for Multi-class Unsupervised + Anomaly Detection + + +
+ Recent advancements in anomaly detection have seen the efficacy of CNN- and
+transformer-based approaches. However, CNNs struggle with long-range
+dependencies, while transformers are burdened by quadratic computational
+complexity. Mamba-based models, with their superior long-range modeling and
+linear efficiency, have garnered substantial attention. This study pioneers
+the application of Mamba to multi-class unsupervised anomaly detection,
+presenting MambaAD, which consists of a pre-trained encoder and a Mamba
+decoder featuring Locality-Enhanced State Space (LSS) modules at multiple
+scales. The proposed LSS module, integrating parallel cascaded Hybrid State
+Space (HSS) blocks and multi-kernel convolution operations, effectively
+captures both long-range and local information. The HSS block, built on
+Hybrid Scanning (HS) encoders, encodes feature maps with five scanning methods
+and eight directions, thereby strengthening global connections through the
+State Space Model (SSM). The use of Hilbert scanning and eight directions
+significantly improves feature sequence modeling. Comprehensive experiments on
+six diverse anomaly detection datasets with seven metrics demonstrate SoTA
+performance, substantiating the method's effectiveness.
+
+
+
+
+
+
+ + ☆ The Impact of Print-and-Scan in Heterogeneous Morph Evaluation Scenarios + + +
+ Face morphing attacks present an emerging threat to face recognition
+systems. On top of that, printing and scanning the morphed images can obscure
+the artifacts generated during the morphing process, which makes morphed-image
+detection even harder. In this work, we investigate the impact that printing
+and scanning has on morphing attacks through a series of heterogeneous tests.
+Our experiments show that the possibility of a false match increases by up to
+5.64% for DiM and 16.00% for StyleGAN2 when an image that has been printed and
+scanned, regardless of whether it is morphed or bona fide, is provided to a
+Face Recognition (FR) system. Likewise, under the Frechet Inception Distance
+(FID) metric, strictly print-scanned morph attacks performed on average 9.185%
+stronger than non-print-scanned digital morphs.
+
+
+
+ comment: Initial preprint. Under review +
+
+
+
+
+ + ☆ Training-Free Open-Vocabulary Segmentation with Offline + Diffusion-Augmented Prototype Generation CVPR 2024 + + +
+ Open-vocabulary semantic segmentation aims at segmenting arbitrary categories +expressed in textual form. Previous works have trained over large amounts of +image-caption pairs to enforce pixel-level multimodal alignments. However, +captions provide global information about the semantics of a given image but +lack direct localization of individual concepts. Further, training on +large-scale datasets inevitably brings significant computational costs. In this +paper, we propose FreeDA, a training-free diffusion-augmented method for +open-vocabulary semantic segmentation, which leverages the ability of diffusion +models to visually localize generated concepts and local-global similarities to +match class-agnostic regions with semantic classes. Our approach involves an +offline stage in which textual-visual reference embeddings are collected, +starting from a large set of captions and leveraging visual and semantic +contexts. At test time, these are queried to support the visual matching +process, which is carried out by jointly considering class-agnostic regions and +global semantic similarities. Extensive analyses demonstrate that FreeDA +achieves state-of-the-art performance on five datasets, surpassing previous +methods by more than 7.0 average points in terms of mIoU and without requiring +any training. + +
+
+ comment: CVPR 2024. Project page: https://aimagelab.github.io/freeda/ +
+
+
+
+
+ + ♻ ☆ Zero-shot Referring Expression Comprehension via Structural Similarity + Between Images and Captions CVPR 2024 + + +
+ Zero-shot referring expression comprehension aims to localize bounding boxes
+in an image corresponding to provided textual prompts, which requires: (i) a
+fine-grained disentanglement of the complex visual scene and the textual
+context, and (ii) the capacity to understand relationships among the
+disentangled entities. Unfortunately, existing large vision-language alignment
+(VLA) models, e.g., CLIP, struggle with both aspects and so cannot be directly
+used for this task. To mitigate this gap, we leverage large foundation models
+to disentangle both images and texts into triplets of the form (subject,
+predicate, object). After that, grounding is accomplished by calculating the
+structural similarity matrix between visual and textual triplets with a VLA
+model, and subsequently propagating it to an instance-level similarity matrix.
+Furthermore, to equip VLA models with relationship understanding, we design a
+triplet-matching objective to fine-tune the VLA models on a curated collection
+of data containing abundant entity relationships. Experiments demonstrate that
+our approach yields a visual grounding performance increase of up to 19.5%
+over the SOTA zero-shot model on RefCOCO/+/g. On the more challenging Who's
+Waldo dataset, our zero-shot approach achieves accuracy comparable to the
+fully supervised model. Code is available at
+https://github.com/Show-han/Zeroshot_REC.
+
+
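+ To make the triplet-matching step concrete, the sketch below scores
+candidate visual triplets against a textual triplet using already-computed
+embeddings; averaging the per-slot cosine similarities is a simplified
+stand-in for the structural similarity matrix and its instance-level
+propagation described above.
+
+import numpy as np
+
+def structural_similarity(vis_triplets, txt_triplet):
+    """vis_triplets: (N, 3, D) embeddings of (subject, predicate, object) for N
+    candidate regions; txt_triplet: (3, D) embedding of the parsed expression.
+    Returns one score per candidate: mean cosine similarity over the three slots."""
+    v = vis_triplets / np.linalg.norm(vis_triplets, axis=-1, keepdims=True)
+    t = txt_triplet / np.linalg.norm(txt_triplet, axis=-1, keepdims=True)
+    return (v * t[None]).sum(-1).mean(-1)          # (N,)
+
+vis = np.random.randn(5, 3, 512)                   # e.g., CLIP-embedded visual triplets
+txt = np.random.randn(3, 512)                      # embedded textual triplet
+print("best candidate:", int(np.argmax(structural_similarity(vis, txt))))
+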
+
+ comment: CVPR 2024, Code available at https://github.com/Show-han/Zeroshot_REC +
+
+
+
+
+ + ♻ ☆ Multi-person 3D pose estimation from unlabelled data + + +
+ Its numerous applications make multi-human 3D pose estimation a remarkably +impactful area of research. Nevertheless, assuming a multiple-view system +composed of several regular RGB cameras, 3D multi-pose estimation presents +several challenges. First of all, each person must be uniquely identified in +the different views to separate the 2D information provided by the cameras. +Secondly, the 3D pose estimation process from the multi-view 2D information of +each person must be robust against noise and potential occlusions in the +scenario. In this work, we address these two challenges with the help of deep +learning. Specifically, we present a model based on Graph Neural Networks +capable of predicting the cross-view correspondence of the people in the +scenario along with a Multilayer Perceptron that takes the 2D points to yield +the 3D poses of each person. These two models are trained in a self-supervised +manner, thus avoiding the need for large datasets with 3D annotations. + +
+
+
+
+
+ + ♻ ☆ Influencer Backdoor Attack on Semantic Segmentation + + +
+ When a small number of poisoned samples are injected into the training
+dataset of a deep neural network, the network can be induced to exhibit
+malicious behavior during inference, which poses potential threats to
+real-world applications. While backdoor attacks have been intensively studied
+in classification, they have been largely overlooked for semantic
+segmentation. Unlike classification, semantic segmentation aims to classify
+every pixel within a given image. In this work, we explore backdoor attacks on
+segmentation models that misclassify all pixels of a victim class by injecting
+a specific trigger on non-victim pixels during inference, which we dub the
+Influencer Backdoor Attack (IBA). IBA is expected to maintain the
+classification accuracy of non-victim pixels while misleading the
+classification of all victim pixels in every single inference, and it can be
+easily applied to real-world scenes. Based on the context aggregation ability
+of segmentation models, we propose a simple yet effective Nearest-Neighbor
+trigger injection strategy. We also introduce an innovative Pixel Random
+Labeling strategy which maintains optimal performance even when the trigger is
+placed far from the victim pixels. Our extensive experiments reveal that
+current segmentation models do suffer from backdoor attacks, demonstrate IBA's
+real-world applicability, and show that our proposed techniques can further
+increase attack performance.
+
+
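+ A rough sketch of nearest-neighbor trigger placement is given below; the
+exact trigger pattern and placement rule used by IBA may differ, but the core
+step, pasting a small patch on the non-victim pixel closest to the
+victim-class mask, looks roughly like this.
+
+import numpy as np
+from scipy.ndimage import distance_transform_edt
+
+def inject_trigger(image, victim_mask, trigger):
+    """Paste `trigger` (h, w, C) centered on the non-victim pixel that lies
+    closest to the victim-class region (victim pixels stay untouched)."""
+    dist = distance_transform_edt(~victim_mask)      # distance to the victim region
+    dist[victim_mask] = np.inf                       # never place the trigger on victim pixels
+    y, x = np.unravel_index(np.argmin(dist), dist.shape)
+    h, w = trigger.shape[:2]
+    y0 = int(np.clip(y - h // 2, 0, image.shape[0] - h))
+    x0 = int(np.clip(x - w // 2, 0, image.shape[1] - w))
+    out = image.copy()
+    out[y0:y0 + h, x0:x0 + w] = trigger
+    return out
+
+img = np.zeros((64, 64, 3), dtype=np.uint8)
+mask = np.zeros((64, 64), dtype=bool)
+mask[20:40, 20:40] = True                            # victim-class region
+poisoned = inject_trigger(img, mask, np.full((5, 5, 3), 255, dtype=np.uint8))
+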
+
+
+
+
+ + ♻ ☆ An Edit Friendly DDPM Noise Space: Inversion and Manipulations CVPR 2024 + + +
+ Denoising diffusion probabilistic models (DDPMs) employ a sequence of white +Gaussian noise samples to generate an image. In analogy with GANs, those noise +maps could be considered as the latent code associated with the generated +image. However, this native noise space does not possess a convenient +structure, and is thus challenging to work with in editing tasks. Here, we +propose an alternative latent noise space for DDPM that enables a wide range of +editing operations via simple means, and present an inversion method for +extracting these edit-friendly noise maps for any given image (real or +synthetically generated). As opposed to the native DDPM noise space, the +edit-friendly noise maps do not have a standard normal distribution and are not +statistically independent across timesteps. However, they allow perfect +reconstruction of any desired image, and simple transformations on them +translate into meaningful manipulations of the output image (e.g. shifting, +color edits). Moreover, in text-conditional models, fixing those noise maps +while changing the text prompt, modifies semantics while retaining structure. +We illustrate how this property enables text-based editing of real images via +the diverse DDPM sampling scheme (in contrast to the popular non-diverse DDIM +inversion). We also show how it can be used within existing diffusion-based +editing methods to improve their quality and diversity. Webpage: +https://inbarhub.github.io/DDPM_inversion + +
+
+ comment: CVPR 2024. Code and examples are available at + https://github.com/inbarhub/DDPM_inversion +
+
+
+
+
+ + ♻ ☆ Event Data Association via Robust Model Fitting for Event-based Object + Tracking + + +
+ Event-based approaches, which are based on bio-inspired asynchronous event +cameras, have achieved promising performance on various computer vision tasks. +However, the study of the fundamental event data association problem is still +in its infancy. In this paper, we propose a novel Event Data Association +(called EDA) approach to explicitly address the event association and fusion +problem. The proposed EDA seeks for event trajectories that best fit the event +data, in order to perform unifying data association and information fusion. In +EDA, we first asynchronously fuse the event data based on its information +entropy. Then, we introduce a deterministic model hypothesis generation +strategy, which effectively generates model hypotheses from the fused events, +to represent the corresponding event trajectories. After that, we present a +two-stage weighting algorithm, which robustly weighs and selects true models +from the generated model hypotheses, through multi-structural geometric model +fitting. Meanwhile, we also propose an adaptive model selection strategy to +automatically determine the number of the true models. Finally, we use the +selected true models to associate and fuse the event data, without being +affected by sensor noise and irrelevant structures. We evaluate the performance +of the proposed EDA on the object tracking task. The experimental results show +the effectiveness of EDA under challenging scenarios, such as high speed, +motion blur, and high dynamic range conditions. + +
+
+ comment: 32 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ A Spatio-temporal Aligned SUNet Model for Low-light Video Enhancement + + +
+ Distortions caused by low-light conditions are not only visually unpleasant +but also degrade the performance of computer vision tasks. The restoration and +enhancement have proven to be highly beneficial. However, there are only a +limited number of enhancement methods explicitly designed for videos acquired +in low-light conditions. We propose a Spatio-Temporal Aligned SUNet (STA-SUNet) +model using a Swin Transformer as a backbone to capture low light video +features and exploit their spatio-temporal correlations. The STA-SUNet model is +trained on a novel, fully registered dataset (BVI), which comprises dynamic +scenes captured under varying light conditions. It is further analysed +comparatively against various other models over three test datasets. The model +demonstrates superior adaptivity across all datasets, obtaining the highest +PSNR and SSIM values. It is particularly effective in extreme low-light +conditions, yielding fairly good visualisation results. + +
+
+
+
+
+ + ♻ ☆ DIAGNOSIS: Detecting Unauthorized Data Usages in Text-to-image Diffusion + Models ICLR 2024 + + +
+ Recent text-to-image diffusion models have shown surprising performance in +generating high-quality images. However, concerns have arisen regarding the +unauthorized data usage during the training or fine-tuning process. One example +is when a model trainer collects a set of images created by a particular artist +and attempts to train a model capable of generating similar images without +obtaining permission and giving credit to the artist. To address this issue, we +propose a method for detecting such unauthorized data usage by planting the +injected memorization into the text-to-image diffusion models trained on the +protected dataset. Specifically, we modify the protected images by adding +unique contents on these images using stealthy image warping functions that are +nearly imperceptible to humans but can be captured and memorized by diffusion +models. By analyzing whether the model has memorized the injected content +(i.e., whether the generated images are processed by the injected +post-processing function), we can detect models that had illegally utilized the +unauthorized data. Experiments on Stable Diffusion and VQ Diffusion with +different model training or fine-tuning methods (i.e, LoRA, DreamBooth, and +standard training) demonstrate the effectiveness of our proposed method in +detecting unauthorized data usages. Code: +https://github.com/ZhentingWang/DIAGNOSIS. + +
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ DiffusionLight: Light Probes for Free by Painting a Chrome Ball CVPR 2024 + + +
+ We present a simple yet effective technique to estimate lighting in a single +input image. Current techniques rely heavily on HDR panorama datasets to train +neural networks to regress an input with limited field-of-view to a full +environment map. However, these approaches often struggle with real-world, +uncontrolled settings due to the limited diversity and size of their datasets. +To address this problem, we leverage diffusion models trained on billions of +standard images to render a chrome ball into the input image. Despite its +simplicity, this task remains challenging: the diffusion models often insert +incorrect or inconsistent objects and cannot readily generate images in HDR +format. Our research uncovers a surprising relationship between the appearance +of chrome balls and the initial diffusion noise map, which we utilize to +consistently generate high-quality chrome balls. We further fine-tune an LDR +diffusion model (Stable Diffusion XL) with LoRA, enabling it to perform +exposure bracketing for HDR light estimation. Our method produces convincing +light estimates across diverse settings and demonstrates superior +generalization to in-the-wild scenarios. + +
+
+ comment: CVPR 2024 Oral. For more information and code, please visit our + website https://diffusionlight.github.io/ +
+
+
+
+
+ + ♻ ☆ Learning Local and Global Temporal Contexts for Video Semantic + Segmentation TPAMI + + +
+ Contextual information plays a core role in video semantic segmentation
+(VSS). This paper summarizes contexts for VSS in two folds: local temporal
+contexts (LTC), which define the contexts from neighboring frames, and global
+temporal contexts (GTC), which represent the contexts from the whole video.
+LTC includes static and motional contexts, corresponding to static and moving
+content in neighboring frames, respectively. Previously, both static and
+motional contexts have been studied. However, there is no research on
+simultaneously learning static and motional contexts, even though they are
+highly complementary. Hence, we propose a Coarse-to-Fine Feature Mining (CFFM)
+technique to learn a unified representation of LTC. CFFM contains two parts:
+Coarse-to-Fine Feature Assembling (CFFA) and Cross-frame Feature Mining (CFM).
+CFFA abstracts static and motional contexts, and CFM mines useful information
+from nearby frames to enhance target features. To further exploit more
+temporal contexts, we propose CFFM++ by additionally learning GTC from the
+whole video. Specifically, we uniformly sample certain frames from the video
+and extract global contextual prototypes by k-means; the information within
+those prototypes is then mined by CFM to refine target features. Experimental
+results on popular benchmarks demonstrate that CFFM and CFFM++ perform
+favorably against state-of-the-art methods. Our code is available at
+https://github.com/GuoleiSun/VSS-CFFM
+
+
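+ The global-context step of CFFM++, sampling frames and clustering their
+features into prototypes, reduces to a few lines. The sketch below assumes
+per-frame feature maps are already extracted and simply pools them into
+k-means prototypes; the cross-frame mining that consumes these prototypes is
+omitted.
+
+import numpy as np
+from sklearn.cluster import KMeans
+
+def global_context_prototypes(feature_maps, num_prototypes=32, stride=4):
+    """feature_maps: list of (C, H, W) arrays from uniformly sampled frames.
+    Returns (num_prototypes, C) global contextual prototypes via k-means."""
+    tokens = []
+    for f in feature_maps:
+        c = f.shape[0]
+        tokens.append(f[:, ::stride, ::stride].reshape(c, -1).T)   # subsampled spatial tokens
+    tokens = np.concatenate(tokens, axis=0)                        # (N, C)
+    km = KMeans(n_clusters=num_prototypes, n_init=10, random_state=0).fit(tokens)
+    return km.cluster_centers_                                     # prototypes summarizing the video
+
+frames = [np.random.rand(64, 32, 32).astype(np.float32) for _ in range(6)]
+print(global_context_prototypes(frames).shape)   # (32, 64)
+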
+
+ comment: Accepted to TPAMI, an extended version of a paper published in CVPR + 2022 +
+
+
+
+
+ + ♻ ☆ SGV3D:Towards Scenario Generalization for Vision-based Roadside 3D + Object Detection + + +
+ Roadside perception can greatly increase the safety of autonomous vehicles by +extending their perception ability beyond the visual range and addressing blind +spots. However, current state-of-the-art vision-based roadside detection +methods possess high accuracy on labeled scenes but have inferior performance +on new scenes. This is because roadside cameras remain stationary after +installation and can only collect data from a single scene, resulting in the +algorithm overfitting these roadside backgrounds and camera poses. To address +this issue, in this paper, we propose an innovative Scenario Generalization +Framework for Vision-based Roadside 3D Object Detection, dubbed SGV3D. +Specifically, we employ a Background-suppressed Module (BSM) to mitigate +background overfitting in vision-centric pipelines by attenuating background +features during the 2D to bird's-eye-view projection. Furthermore, by +introducing the Semi-supervised Data Generation Pipeline (SSDG) using unlabeled +images from new scenes, diverse instance foregrounds with varying camera poses +are generated, addressing the risk of overfitting specific camera poses. We +evaluate our method on two large-scale roadside benchmarks. Our method +surpasses all previous methods by a significant margin in new scenes, including ++42.57% for vehicle, +5.87% for pedestrian, and +14.89% for cyclist compared to +BEVHeight on the DAIR-V2X-I heterologous benchmark. On the larger-scale Rope3D +heterologous benchmark, we achieve notable gains of 14.48% for car and 12.41% +for large vehicle. We aspire to contribute insights on the exploration of +roadside perception techniques, emphasizing their capability for scenario +generalization. The code will be available at +https://github.com/yanglei18/SGV3D + +
+
+ comment: 13 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Are We on the Right Way for Evaluating Large Vision-Language Models? + + +
+ Large vision-language models (LVLMs) have recently achieved rapid progress, +sparking numerous studies to evaluate their multi-modal capabilities. However, +we dig into current evaluation works and identify two primary issues: 1) Visual +content is unnecessary for many samples. The answers can be directly inferred +from the questions and options, or the world knowledge embedded in LLMs. This +phenomenon is prevalent across current benchmarks. For instance, GeminiPro +achieves 42.9% on the MMMU benchmark without any visual input, and outperforms +the random choice baseline across six benchmarks over 24% on average. 2) +Unintentional data leakage exists in LLM and LVLM training. LLM and LVLM could +still answer some visual-necessary questions without visual content, indicating +the memorizing of these samples within large-scale training data. For example, +Sphinx-X-MoE gets 43.6% on MMMU without accessing images, surpassing its LLM +backbone with 17.9%. Both problems lead to misjudgments of actual multi-modal +gains and potentially misguide the study of LVLM. To this end, we present +MMStar, an elite vision-indispensable multi-modal benchmark comprising 1,500 +samples meticulously selected by humans. MMStar benchmarks 6 core capabilities +and 18 detailed axes, aiming to evaluate LVLMs' multi-modal capacities with +carefully balanced and purified samples. These samples are first roughly +selected from current benchmarks with an automated pipeline, human review is +then involved to ensure each curated sample exhibits visual dependency, minimal +data leakage, and requires advanced multi-modal capabilities. Moreover, two +metrics are developed to measure data leakage and actual performance gain in +multi-modal training. We evaluate 16 leading LVLMs on MMStar to assess their +multi-modal capabilities, and on 7 benchmarks with the proposed metrics to +investigate their data leakage and actual multi-modal gain. + +
+
+ comment: Project page: https://mmstar-benchmark.github.io/ +
+
+
+
+
+ + ♻ ☆ CN-RMA: Combined Network with Ray Marching Aggregation for 3D Indoors + Object Detection from Multi-view Images CVPR2024 + + +
+ This paper introduces CN-RMA, a novel approach for 3D indoor object detection +from multi-view images. We observe the key challenge as the ambiguity of image +and 3D correspondence without explicit geometry to provide occlusion +information. To address this issue, CN-RMA leverages the synergy of 3D +reconstruction networks and 3D object detection networks, where the +reconstruction network provides a rough Truncated Signed Distance Function +(TSDF) and guides image features to vote to 3D space correctly in an end-to-end +manner. Specifically, we associate weights to sampled points of each ray +through ray marching, representing the contribution of a pixel in an image to +corresponding 3D locations. Such weights are determined by the predicted signed +distances so that image features vote only to regions near the reconstructed +surface. Our method achieves state-of-the-art performance in 3D object +detection from multi-view images, as measured by mAP@0.25 and mAP@0.5 on the +ScanNet and ARKitScenes datasets. The code and models are released at +https://github.com/SerCharles/CN-RMA. + +
+
+ comment: CVPR2024 poster paper, 8 pages of main part, and 4 pages of + supplementary material +
+
+
+
+
+ + ♻ ☆ MetaMix: Meta-state Precision Searcher for Mixed-precision Activation + Quantization + + +
+ Mixed-precision quantization of efficient networks often suffers from
+activation instability encountered in the exploration of bit selections. To
+address this problem, we propose a novel method called MetaMix, which consists
+of a bit selection phase and a weight training phase. The bit selection phase
+iterates two steps: (1) the mixed-precision-aware weight update, and (2)
+bit-search training with the fixed mixed-precision-aware weights; together,
+these reduce activation instability in mixed-precision quantization and
+contribute to fast and high-quality bit selection. The weight training phase
+exploits the weights and step sizes trained in the bit selection phase and
+fine-tunes them, thereby offering fast training. Our experiments with
+efficient and hard-to-quantize networks, i.e., MobileNet v2 and v3, and
+ResNet-18 on ImageNet show that our proposed method pushes the boundary of
+mixed-precision quantization, in terms of accuracy vs. operations, by
+outperforming both mixed- and single-precision SOTA methods.
+
+
+
+ comment: Proc. The 38th Annual AAAI Conference on Artificial Intelligence + (AAAI) +
+
+
+
+
+ + ♻ ☆ UltraLight VM-UNet: Parallel Vision Mamba Significantly Reduces + Parameters for Skin Lesion Segmentation + + +
+ Traditionally, to improve the segmentation performance of models, most
+approaches prefer to add more complex modules. This is not suitable for the
+medical field, especially for mobile medical devices, where computationally
+heavy models cannot be deployed in real clinical environments due to resource
+constraints. Recently, state-space models (SSMs), represented by Mamba, have
+become strong competitors to traditional CNNs and Transformers. In this paper,
+we deeply explore the key elements of parameter influence in Mamba and, based
+on this, propose an UltraLight Vision Mamba UNet (UltraLight VM-UNet).
+Specifically, we propose a method for processing features in parallel Vision
+Mamba, named the PVM Layer, which achieves excellent performance with the
+lowest computational load while keeping the overall number of processing
+channels constant. We conducted comparison and ablation experiments with
+several state-of-the-art lightweight models on three public skin lesion
+datasets and demonstrated that UltraLight VM-UNet remains equally competitive
+with only 0.049M parameters and 0.060 GFLOPs. In addition, this study deeply
+explores the key elements of parameter influence in Mamba, laying a
+theoretical foundation for Mamba to possibly become a new mainstream module
+for lightweight models in the future. The code is available at
+https://github.com/wurenkai/UltraLight-VM-UNet.
+
+
+
+
+
+
+ + ♻ ☆ Cross-Silo Federated Learning Across Divergent Domains with Iterative + Parameter Alignment + + +
+ Learning from the collective knowledge of data dispersed across private +sources can provide neural networks with enhanced generalization capabilities. +Federated learning, a method for collaboratively training a machine learning +model across remote clients, achieves this by combining client models via the +orchestration of a central server. However, current approaches face two +critical limitations: i) they struggle to converge when client domains are +sufficiently different, and ii) current aggregation techniques produce an +identical global model for each client. In this work, we address these issues +by reformulating the typical federated learning setup: rather than learning a +single global model, we learn N models each optimized for a common objective. +To achieve this, we apply a weighted distance minimization to model parameters +shared in a peer-to-peer topology. The resulting framework, Iterative Parameter +Alignment, applies naturally to the cross-silo setting, and has the following +properties: (i) a unique solution for each participant, with the option to +globally converge each model in the federation, and (ii) an optional +early-stopping mechanism to elicit fairness among peers in collaborative +learning settings. These characteristics jointly provide a flexible new +framework for iteratively learning from peer models trained on disparate +datasets. We find that the technique achieves competitive results on a variety +of data partitions compared to state-of-the-art approaches. Further, we show +that the method is robust to divergent domains (i.e. disjoint classes across +peers) where existing approaches struggle. + +
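+ The core update, pulling each peer's parameters toward a weighted
+combination of the others while still fitting its own data, can be sketched as
+follows. This is a simplified, hypothetical rendering of the weighted distance
+minimization; the actual framework's weighting scheme and early-stopping
+mechanism are not reproduced.
+
+import torch
+
+def alignment_step(model, peer_params, weights, loss_fn, batch, lam=0.1, lr=1e-3):
+    """One local step: task loss on own data plus weighted squared distance to
+    peers. peer_params: one parameter list per peer; weights: one scalar per peer."""
+    opt = torch.optim.SGD(model.parameters(), lr=lr)
+    x, y = batch
+    loss = loss_fn(model(x), y)
+    for w, params in zip(weights, peer_params):
+        for p, q in zip(model.parameters(), params):
+            loss = loss + lam * w * (p - q.detach()).pow(2).sum()   # pull toward this peer
+    opt.zero_grad()
+    loss.backward()
+    opt.step()
+    return loss.item()
+
+# toy usage with two tiny peers on random data
+torch.manual_seed(0)
+peers = [torch.nn.Linear(8, 2) for _ in range(2)]
+batch = (torch.randn(16, 8), torch.randint(0, 2, (16,)))
+alignment_step(peers[0], [list(peers[1].parameters())], weights=[1.0],
+               loss_fn=torch.nn.CrossEntropyLoss(), batch=batch)
+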
+
+ comment: Published at IEEE Big Data 2023 +
+
+
+
+
+ + ♻ ☆ Coarse-to-Fine Latent Diffusion for Pose-Guided Person Image Synthesis CVPR 2024 + + +
+ Diffusion models are a promising approach to image generation and have been
+employed for Pose-Guided Person Image Synthesis (PGPIS) with competitive
+performance. While existing methods simply align the person appearance to the
+target pose, they are prone to overfitting due to the lack of high-level
+semantic understanding of the source person image. In this paper, we propose a
+novel Coarse-to-Fine Latent Diffusion (CFLD) method for PGPIS. In the absence
+of image-caption pairs and textual prompts, we develop a novel training
+paradigm based purely on images to control the generation process of a
+pre-trained text-to-image diffusion model. A perception-refined decoder is
+designed to progressively refine a set of learnable queries and extract
+semantic understanding of person images as a coarse-grained prompt. This
+allows the controls over fine-grained appearance and pose information to be
+decoupled at different stages, thus circumventing the potential overfitting
+problem. To generate more realistic texture details, a hybrid-granularity
+attention module is proposed to encode multi-scale fine-grained appearance
+features as bias terms that augment the coarse-grained prompt. Both
+quantitative and qualitative experimental results on the DeepFashion benchmark
+demonstrate the superiority of our method over the state of the art for PGPIS.
+Code is available at https://github.com/YanzuoLu/CFLD.
+
+
+
+ comment: Accepted by CVPR 2024 (Highlight) +
+
+
+
+
+ + ♻ ☆ One-Step Late Fusion Multi-view Clustering with Compressed Subspace + + +
+ Late fusion multi-view clustering (LFMVC) has become a rapidly growing class +of methods in the multi-view clustering (MVC) field, owing to its excellent +computational speed and clustering performance. One bottleneck faced by +existing late fusion methods is that they are usually aligned to the average +kernel function, which makes the clustering performance highly dependent on the +quality of datasets. Another problem is that they require subsequent k-means +clustering after obtaining the consensus partition matrix to get the final +discrete labels, and the resulting separation of the label learning and cluster +structure optimization processes limits the integrity of these models. To +address the above issues, we propose an integrated framework named One-Step +Late Fusion Multi-view Clustering with Compressed Subspace (OS-LFMVC-CS). +Specifically, we use the consensus subspace to align the partition matrix while +optimizing the partition fusion, and utilize the fused partition matrix to +guide the learning of discrete labels. A six-step iterative optimization +approach with verified convergence is proposed. Sufficient experiments on +multiple datasets validate the effectiveness and efficiency of our proposed +method. + +
+
+ comment: Accepted by ICASSP2024 +
+
+
+
+
+ + ♻ ☆ Deepfake Generation and Detection: A Benchmark and Survey + + +
+ In addition to the advancements in deepfake generation, corresponding
+detection technologies need to continuously evolve to regulate the potential
+misuse of deepfakes, such as for privacy invasion and phishing attacks. This
+survey comprehensively reviews the latest developments in deepfake generation
+and detection, summarizing and analyzing the current state of the art in this
+rapidly evolving field. We first unify task definitions, comprehensively
+introduce datasets and metrics, and discuss the development of generation and
+detection technology frameworks. Then, we discuss the development of several
+related sub-fields and focus on four mainstream deepfake fields: face swap,
+face reenactment, talking face generation, and facial attribute editing, as
+well as forgery detection. Subsequently, we comprehensively benchmark
+representative methods on popular datasets for each field, fully evaluating
+the latest and most influential works published in top conferences/journals.
+Finally, we analyze the challenges and future research directions of the
+discussed fields. We closely follow the latest developments at
+https://github.com/flyingby/Awesome-Deepfake-Generation-and-Detection.
+
+
+
+
+
+
+ + ♻ ☆ MultIOD: Rehearsal-free Multihead Incremental Object Detector CVPR 2024 + + +
+ Class-Incremental learning (CIL) refers to the ability of artificial agents +to integrate new classes as they appear in a stream. It is particularly +interesting in evolving environments where agents have limited access to memory +and computational resources. The main challenge of incremental learning is +catastrophic forgetting, the inability of neural networks to retain past +knowledge when learning a new one. Unfortunately, most existing +class-incremental methods for object detection are applied to two-stage +algorithms such as Faster-RCNN, and rely on rehearsal memory to retain past +knowledge. We argue that those are not suitable in resource-limited +environments, and more effort should be dedicated to anchor-free and +rehearsal-free object detection. In this paper, we propose MultIOD, a +class-incremental object detector based on CenterNet. Our contributions are: +(1) we propose a multihead feature pyramid and multihead detection architecture +to efficiently separate class representations, (2) we employ transfer learning +between classes learned initially and those learned incrementally to tackle +catastrophic forgetting, and (3) we use a class-wise non-max-suppression as a +post-processing technique to remove redundant boxes. Results show that our +method outperforms state-of-the-art methods on two Pascal VOC datasets, while +only saving the model in its current state, contrary to other +distillation-based counterparts. + +
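+ The class-wise non-maximum suppression used as post-processing is standard
+and easy to sketch; the snippet below runs torchvision's NMS independently per
+predicted class (the IoU threshold is an illustrative value, not necessarily
+the one used by MultIOD).
+
+import torch
+from torchvision.ops import nms
+
+def classwise_nms(boxes, scores, labels, iou_thr=0.5):
+    """boxes: (N, 4) xyxy; scores: (N,); labels: (N,) int. Returns kept indices."""
+    keep = []
+    for c in labels.unique():
+        idx = torch.nonzero(labels == c, as_tuple=True)[0]
+        kept = nms(boxes[idx], scores[idx], iou_thr)      # suppress within this class only
+        keep.append(idx[kept])
+    keep = torch.cat(keep)
+    return keep[scores[keep].argsort(descending=True)]    # descending score order
+
+boxes = torch.tensor([[0, 0, 10, 10], [1, 1, 11, 11], [0, 0, 10, 10]], dtype=torch.float)
+scores = torch.tensor([0.9, 0.8, 0.7])
+labels = torch.tensor([0, 0, 1])
+print(classwise_nms(boxes, scores, labels))               # tensor([0, 2])
+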
+
+ comment: Accepted at the archival track of the Workshop on Continual Learning + in Computer Vision (CVPR 2024) +
+
+
+
+
+ + ♻ ☆ BlockFusion: Expandable 3D Scene Generation using Latent Tri-plane + Extrapolation + + +
+ We present BlockFusion, a diffusion-based model that generates 3D scenes as +unit blocks and seamlessly incorporates new blocks to extend the scene. +BlockFusion is trained using datasets of 3D blocks that are randomly cropped +from complete 3D scene meshes. Through per-block fitting, all training blocks +are converted into the hybrid neural fields: with a tri-plane containing the +geometry features, followed by a Multi-layer Perceptron (MLP) for decoding the +signed distance values. A variational auto-encoder is employed to compress the +tri-planes into the latent tri-plane space, on which the denoising diffusion +process is performed. Diffusion applied to the latent representations allows +for high-quality and diverse 3D scene generation. To expand a scene during +generation, one needs only to append empty blocks to overlap with the current +scene and extrapolate existing latent tri-planes to populate new blocks. The +extrapolation is done by conditioning the generation process with the feature +samples from the overlapping tri-planes during the denoising iterations. Latent +tri-plane extrapolation produces semantically and geometrically meaningful +transitions that harmoniously blend with the existing scene. A 2D layout +conditioning mechanism is used to control the placement and arrangement of +scene elements. Experimental results indicate that BlockFusion is capable of +generating diverse, geometrically consistent and unbounded large 3D scenes with +unprecedented high-quality shapes in both indoor and outdoor scenarios. + +
+
+ comment: Video: https://www.youtube.com/watch?v=PxIBtd6G0mA +
+
+
+
+
+ + ♻ ☆ Learning Zero-Shot Material States Segmentation, by Implanting Natural + Image Patterns in Synthetic Data + + +
+ Visual understanding and segmentation of materials and their states are
+fundamental to understanding the physical world. The myriad textures, shapes,
+and often blurry boundaries formed by materials make this task particularly
+hard to generalize. Whether it is identifying wet regions of a surface,
+minerals in rocks, infected regions in plants, or pollution in water, each
+material state has its own unique form. For neural nets to learn general
+class-agnostic material segmentation, it is necessary to first collect and
+annotate data that captures this complexity. Collecting and manually
+annotating real-world images is limited by the cost and precision of manual
+labor. In contrast, synthetic CGI data is highly accurate and almost
+cost-free, but fails to replicate the vast diversity of the material world.
+This work offers a method to bridge this crucial gap by implanting patterns
+extracted from real-world images in synthetic data. Hence, patterns
+automatically collected from natural images are used to map materials into
+synthetic scenes. This unsupervised approach allows the generated data to
+capture the vast complexity of the real world while maintaining the precision
+and scale of synthetic data. We also present the first general benchmark for
+zero-shot material state segmentation. The benchmark contains a wide range of
+real-world images of material states, such as food, rocks, construction,
+plants, liquids, and many others, each in various states
+(wet/dry/stained/cooked/burned/worn/rusted/sediment/foam, etc.). The
+annotation includes both partial similarity between regions with similar but
+not identical materials, and hard segmentation of only points in the exact
+same material state. We show that networks trained on MatSeg significantly
+outperform existing state-of-the-art methods on this task. The dataset, code,
+and trained model are available
+
+
+
+
+
+
+ + ♻ ☆ Improved Probabilistic Image-Text Representations ICLR 2024 + + +
+ The Image-Text Matching (ITM) task, a fundamental vision-language (VL) task,
+suffers from the inherent ambiguity arising from multiplicity and imperfect
+annotations. Deterministic functions are not sufficiently powerful to capture
+ambiguity, prompting the exploration of probabilistic embeddings to tackle the
+challenge. However, the existing probabilistic ITM approach encounters two key
+shortcomings: the burden of heavy computations due to the Monte Carlo
+approximation, and the loss saturation issue in the face of abundant false
+negatives. To overcome these issues, this paper presents improved
+Probabilistic Cross-Modal Embeddings (named PCME++) by introducing a new
+probabilistic distance with a closed-form solution. In addition, two
+optimization techniques are proposed to enhance PCME++ further: first, the
+incorporation of pseudo-positives to prevent the negative effect of massive
+false negatives; second, mixed sample data augmentation for probabilistic
+matching. Experimental results on MS-COCO Caption and two extended benchmarks,
+CxC and ECCV Caption, demonstrate the effectiveness of PCME++ compared to
+state-of-the-art ITM methods. The robustness of PCME++ is also evaluated under
+noisy image-text correspondences. In addition, the potential applicability of
+PCME++ in automatic prompt-filtering for zero-shot classification is shown.
+The code is available at https://github.com/naver-ai/pcmepp
+
+
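+ For intuition on what a closed-form probabilistic distance buys: if images
+and captions are embedded as diagonal Gaussians, the expected squared
+Euclidean distance between two embeddings has a simple closed form, so no
+Monte Carlo sampling is needed. The sketch below illustrates that general
+idea; it is not the exact probabilistic distance defined by PCME++.
+
+import numpy as np
+
+def expected_sq_distance(mu1, var1, mu2, var2):
+    """E||z1 - z2||^2 for z1 ~ N(mu1, diag(var1)), z2 ~ N(mu2, diag(var2)):
+    closed form ||mu1 - mu2||^2 + sum(var1) + sum(var2)."""
+    return np.sum((mu1 - mu2) ** 2) + np.sum(var1) + np.sum(var2)
+
+# quick check against Monte Carlo sampling
+rng = np.random.default_rng(0)
+mu1, mu2 = rng.normal(size=16), rng.normal(size=16)
+var1, var2 = rng.uniform(0.1, 1.0, 16), rng.uniform(0.1, 1.0, 16)
+z1 = rng.normal(mu1, np.sqrt(var1), size=(100000, 16))
+z2 = rng.normal(mu2, np.sqrt(var2), size=(100000, 16))
+mc = np.mean(np.sum((z1 - z2) ** 2, axis=1))
+print(expected_sq_distance(mu1, var1, mu2, var2), mc)   # the two values agree closely
+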
+
+ comment: ICLR 2024 camera-ready; Code: https://github.com/naver-ai/pcmepp. + Project page: https://naver-ai.github.io/pcmepp/. 30 pages, 2.2 MB +
+
+
+
+
+ + ♻ ☆ Industrial Application of 6D Pose Estimation for Robotic Manipulation in + Automotive Internal Logistics + + +
+ Despite the advances in robotics, a large proportion of parts-handling tasks
+in the automotive industry's internal logistics are not automated but still
+performed by humans. A key component for competitively automating these
+processes is a 6D pose estimation that can handle a large number of different
+parts, is adaptable to new parts with little manual effort, and is
+sufficiently accurate and robust with respect to industry requirements. In
+this context, the question arises as to the current status quo with respect to
+these measures. To address this, we built a representative 6D pose estimation
+pipeline with state-of-the-art components, from economically scalable real and
+synthetic data generation to pose estimators, and evaluated it on automotive
+parts with regard to a realistic sequencing process. We found that, using
+these data generation approaches, the performance of the trained 6D pose
+estimators is promising but does not meet industry requirements. We reveal
+that the reason for this is the inability of the estimators to provide
+reliable uncertainties for their poses, rather than an inability to provide
+sufficiently accurate poses. In this context, we further analyzed how RGB- and
+RGB-D-based approaches compare against this background and show that they are
+differently vulnerable to the domain gap induced by synthetic data.
+
+
+
+ comment: Accepted for publication at IEEE International Conference on + Automation Science and Engineering (CASE 2023) +
+
+
+
+
+ + ♻ ☆ Self-training via Metric Learning for Source-Free Domain Adaptation of + Semantic Segmentation + + +
+ Unsupervised source-free domain adaptation methods aim to train a model for +the target domain utilizing a pretrained source-domain model and unlabeled +target-domain data, particularly when accessibility to source data is +restricted due to intellectual property or privacy concerns. Traditional +methods usually use self-training with pseudo-labeling, which is often +subjected to thresholding based on prediction confidence. However, such +thresholding limits the effectiveness of self-training due to insufficient +supervision. This issue becomes more severe in a source-free setting, where +supervision comes solely from the predictions of the pre-trained source model. +In this study, we propose a novel approach by incorporating a mean-teacher +model, wherein the student network is trained using all predictions from the +teacher network. Instead of employing thresholding on predictions, we introduce +a method to weight the gradients calculated from pseudo-labels based on the +reliability of the teacher's predictions. To assess reliability, we introduce a +novel approach using proxy-based metric learning. Our method is evaluated in +synthetic-to-real and cross-city scenarios, demonstrating superior performance +compared to existing state-of-the-art methods. + +
+
+ comment: This paper is under consideration at Computer Vision and Image + Understanding +
+
+
+
+
+ + ♻ ☆ Fine-grained Action Analysis: A Multi-modality and Multi-task Dataset of + Figure Skating + + +
+ Fine-grained action analysis on existing action datasets is challenged by
+insufficient action categories, low granularity, and limited modalities and
+tasks. In this paper, we propose a Multi-modality and Multi-task dataset of
+Figure Skating (MMFS), collected from the World Figure Skating Championships.
+MMFS supports both action recognition and action quality assessment: it
+captures RGB and skeleton data and collects action scores from 11671 clips
+across 256 categories, including spatial and temporal labels. The key
+contributions of our dataset fall into three aspects. (1) Independent spatial
+and temporal categories are proposed for the first time to further explore
+fine-grained action recognition and quality assessment. (2) MMFS is the first
+to introduce the skeleton modality for complex fine-grained action quality
+assessment. (3) Our multi-modality and multi-task dataset encourages the
+development of more action analysis models. To benchmark our dataset, we adopt
+RGB-based and skeleton-based baseline methods for action recognition and
+action quality assessment.
+
+
+
+
+
+
+ + ♻ ☆ Co-Occ: Coupling Explicit Feature Fusion with Volume Rendering + Regularization for Multi-Modal 3D Semantic Occupancy Prediction + + +
+ 3D semantic occupancy prediction is a pivotal task in the field of autonomous +driving. Recent approaches have made great advances in 3D semantic occupancy +predictions on a single modality. However, multi-modal semantic occupancy +prediction approaches have encountered difficulties in dealing with the +modality heterogeneity, modality misalignment, and insufficient modality +interactions that arise during the fusion of different modalities data, which +may result in the loss of important geometric and semantic information. This +letter presents a novel multi-modal, i.e., LiDAR-camera 3D semantic occupancy +prediction framework, dubbed Co-Occ, which couples explicit LiDAR-camera +feature fusion with implicit volume rendering regularization. The key insight +is that volume rendering in the feature space can proficiently bridge the gap +between 3D LiDAR sweeps and 2D images while serving as a physical +regularization to enhance LiDAR-camera fused volumetric representation. +Specifically, we first propose a Geometric- and Semantic-aware Fusion +(GSFusion) module to explicitly enhance LiDAR features by incorporating +neighboring camera features through a K-nearest neighbors (KNN) search. Then, +we employ volume rendering to project the fused feature back to the image +planes for reconstructing color and depth maps. These maps are then supervised +by input images from the camera and depth estimations derived from LiDAR, +respectively. Extensive experiments on the popular nuScenes and SemanticKITTI +benchmarks verify the effectiveness of our Co-Occ for 3D semantic occupancy +prediction. The project page is available at +https://rorisis.github.io/Co-Occ_project-page/. + +
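+ The GSFusion idea, enriching each LiDAR voxel feature with its K nearest
+camera features, can be sketched with a brute-force KNN; the real module
+operates on voxelized features inside the network and learns the fusion, so
+treat the simple concatenation below as a stand-in.
+
+import torch
+
+def knn_fuse(lidar_xyz, lidar_feat, cam_xyz, cam_feat, k=3):
+    """For every LiDAR point/voxel, gather the k nearest camera features (by 3D
+    distance) and concatenate their mean onto the LiDAR feature."""
+    d = torch.cdist(lidar_xyz, cam_xyz)                   # (N, M) pairwise distances
+    idx = d.topk(k, largest=False).indices                # (N, k) nearest camera neighbors
+    neigh = cam_feat[idx].mean(dim=1)                     # (N, Cc) averaged neighbor features
+    return torch.cat([lidar_feat, neigh], dim=1)          # (N, Cl + Cc) fused features
+
+fused = knn_fuse(torch.randn(100, 3), torch.randn(100, 32),
+                 torch.randn(500, 3), torch.randn(500, 16))
+print(fused.shape)   # torch.Size([100, 48])
+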
+
+
+
+
+ + ♻ ☆ Anchor-based Multi-view Subspace Clustering with Hierarchical Feature + Descent + + +
+ Multi-view clustering has attracted growing attention owing to its
+capability of aggregating information from various sources and its promising
+horizons in public affairs. Many advanced approaches have been proposed in the
+recent literature, yet several difficulties remain to be tackled. One common
+dilemma occurs when attempting to align the features of different views.
+Moreover, because many existing multi-view clustering algorithms stem from
+spectral clustering, they incur cubic time complexity w.r.t. the size of the
+dataset. To address this, we propose Anchor-based Multi-view Subspace
+Clustering with Hierarchical Feature Descent (MVSC-HFD), which tackles the
+discrepancy among views through hierarchical feature descent and projection to
+a common subspace (Stage 1), revealing the dependency of different views. We
+further reduce the computational complexity to linear time through a unified
+sampling strategy in the common subspace (Stage 2), followed by anchor-based
+subspace clustering to learn the bipartite graph collectively (Stage 3).
+Extensive experimental results on public benchmark datasets demonstrate that
+our proposed model consistently outperforms state-of-the-art techniques.
+
+
+
+
+
+
+ + ♻ ☆ Simple Semantic-Aided Few-Shot Learning CVPR 2024 + + +
+ Learning from a limited amount of data, namely Few-Shot Learning, stands out +as a challenging computer vision task. Several works exploit semantics and +design complicated semantic fusion mechanisms to compensate for rare +representative features within restricted data. However, relying on naive +semantics such as class names introduces biases due to their brevity, while +acquiring extensive semantics from external knowledge takes a huge time and +effort. This limitation severely constrains the potential of semantics in +Few-Shot Learning. In this paper, we design an automatic way called Semantic +Evolution to generate high-quality semantics. The incorporation of high-quality +semantics alleviates the need for complex network structures and learning +algorithms used in previous works. Hence, we employ a simple two-layer network +termed Semantic Alignment Network to transform semantics and visual features +into robust class prototypes with rich discriminative features for few-shot +classification. The experimental results show our framework outperforms all +previous methods on six benchmarks, demonstrating a simple network with +high-quality semantics can beat intricate multi-modal modules on few-shot +classification tasks. Code is available at +https://github.com/zhangdoudou123/SemFew. + +
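+ The simple two-layer network described above can be pictured directly; the
+sketch below is a guess at the spirit of such a semantic alignment module
+(concatenate the semantic vector with the averaged support features and pass
+the result through two linear layers to obtain a class prototype), with
+dimensions chosen arbitrarily rather than taken from the paper.
+
+import torch
+import torch.nn as nn
+
+class SemanticAlignment(nn.Module):
+    """Two-layer MLP that fuses a class's semantic vector with its few support
+    features to produce a robust class prototype."""
+    def __init__(self, vis_dim=640, sem_dim=512, hidden=1024):
+        super().__init__()
+        self.net = nn.Sequential(nn.Linear(vis_dim + sem_dim, hidden), nn.ReLU(),
+                                 nn.Linear(hidden, vis_dim))
+
+    def forward(self, support_feats, semantic):
+        # support_feats: (K, vis_dim) few-shot features; semantic: (sem_dim,)
+        v = support_feats.mean(dim=0)                 # naive visual prototype
+        return self.net(torch.cat([v, semantic]))     # semantics-refined prototype
+
+proto = SemanticAlignment()(torch.randn(5, 640), torch.randn(512))
+print(proto.shape)   # torch.Size([640])
+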
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Deep Multi-Threshold Spiking-UNet for Image Processing + + +
+ U-Net, known for its simple yet efficient architecture, is widely utilized +for image processing tasks and is particularly suitable for deployment on +neuromorphic chips. This paper introduces the novel concept of Spiking-UNet for +image processing, which combines the power of Spiking Neural Networks (SNNs) +with the U-Net architecture. To achieve an efficient Spiking-UNet, we face two +primary challenges: ensuring high-fidelity information propagation through the +network via spikes and formulating an effective training strategy. To address +the issue of information loss, we introduce multi-threshold spiking neurons, +which improve the efficiency of information transmission within the +Spiking-UNet. For the training strategy, we adopt a conversion and fine-tuning +pipeline that leverage pre-trained U-Net models. During the conversion process, +significant variability in data distribution across different parts is observed +when utilizing skip connections. Therefore, we propose a connection-wise +normalization method to prevent inaccurate firing rates. Furthermore, we adopt +a flow-based training method to fine-tune the converted models, reducing time +steps while preserving performance. Experimental results show that, on image +segmentation and denoising, our Spiking-UNet achieves comparable performance to +its non-spiking counterpart, surpassing existing SNN methods. Compared with the +converted Spiking-UNet without fine-tuning, our Spiking-UNet reduces inference +time by approximately 90\%. This research broadens the application scope of +SNNs in image processing and is expected to inspire further exploration in the +field of neuromorphic engineering. The code for our Spiking-UNet implementation +is available at https://github.com/SNNresearch/Spiking-UNet. + +
+
+ comment: Accepted in NeuroComputing +
+
+
+
+
+ + ♻ ☆ PASTA: Towards Flexible and Efficient HDR Imaging Via Progressively + Aggregated Spatio-Temporal Alignment + + +
+ Leveraging Transformer attention has led to great advancements in HDR
+deghosting. However, the intricate nature of self-attention introduces
+practical challenges, as existing state-of-the-art methods often demand
+high-end GPUs or exhibit slow inference speeds, especially for high-resolution
+images like 2K. Striking an optimal balance between performance and latency
+remains a critical concern. In response, this work presents PASTA, a novel
+Progressively Aggregated Spatio-Temporal Alignment framework for HDR
+deghosting. Our approach achieves effectiveness and efficiency by harnessing
+hierarchical representations during feature disentanglement. Through the
+utilization of diverse granularities within the hierarchical structure, our
+method substantially boosts computational speed and optimizes the HDR imaging
+workflow. In addition, we explore within-scale feature modeling with local and
+global attention, gradually merging and refining them in a coarse-to-fine
+fashion. Experimental results showcase PASTA's superiority over current SOTA
+methods in both visual quality and performance metrics, accompanied by a
+substantial 3-fold (x3) increase in inference speed.
+
+
+
+
+
+
+ + ♻ ☆ PAT: Pixel-wise Adaptive Training for Long-tailed Segmentation + + +
+ Beyond class frequency, we recognize the impact of class-wise relationships +among various class-specific predictions and the imbalance in label masks on +long-tailed segmentation learning. To address these challenges, we propose an +innovative Pixel-wise Adaptive Training (PAT) technique tailored for +long-tailed segmentation. PAT has two key features: 1) class-wise gradient +magnitude homogenization, and 2) pixel-wise class-specific loss adaptation +(PCLA). First, the class-wise gradient magnitude homogenization helps alleviate +the imbalance among label masks by ensuring equal consideration of the +class-wise impact on model updates. Second, PCLA tackles the detrimental impact +of both rare classes within the long-tailed distribution and inaccurate +predictions from previous training stages by encouraging learning classes with +low prediction confidence and guarding against forgetting classes with high +confidence. This combined approach fosters robust learning while preventing the +model from forgetting previously learned knowledge. PAT exhibits significant +performance improvements, surpassing the current state-of-the-art by 2.2% in +the NyU dataset. Moreover, it enhances overall pixel-wise accuracy by 2.85% and +intersection over union value by 2.07%, with a particularly notable declination +of 0.39% in detecting rare classes compared to Balance Logits Variation, as +demonstrated on the three popular datasets, i.e., OxfordPetIII, CityScape, and +NYU. + +
+
+
+
+
+ + ♻ ☆ Anomaly Score: Evaluating Generative Models and Individual Generated + Images based on Complexity and Vulnerability CVPR 2024 + + +
+ With the advancement of generative models, the assessment of generated
+images becomes increasingly important. Previous methods measure distances
+between features of reference and generated images extracted from trained
+vision models. In this paper, we conduct an extensive investigation into the
+relationship between the representation space and the input space around
+generated images. We first propose two measures related to the presence of
+unnatural elements within images: complexity, which indicates how non-linear
+the representation space is, and vulnerability, which is related to how easily
+the extracted feature changes under adversarial input changes. Based on these,
+we introduce a new metric for evaluating image-generative models, called the
+anomaly score (AS). Moreover, we propose AS-i (anomaly score for individual
+images), which can effectively evaluate generated images individually.
+Experimental results demonstrate the validity of the proposed approach.
+
+
+
+ comment: Accepted in CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Scalable 3D Registration via Truncated Entry-wise Absolute Residuals CVPR 2024 + + +
+ Given an input set of $3$D point pairs, the goal of outlier-robust $3$D +registration is to compute some rotation and translation that align as many +point pairs as possible. This is an important problem in computer vision, for +which many highly accurate approaches have been recently proposed. Despite +their impressive performance, these approaches lack scalability, often +overflowing the $16$GB of memory of a standard laptop to handle roughly +$30,000$ point pairs. In this paper, we propose a $3$D registration approach +that can process more than ten million ($10^7$) point pairs with over $99\%$ +random outliers. Moreover, our method is efficient, entails low memory costs, +and maintains high accuracy at the same time. We call our method TEAR, as it +involves minimizing an outlier-robust loss that computes Truncated Entry-wise +Absolute Residuals. To minimize this loss, we decompose the original +$6$-dimensional problem into two subproblems of dimensions $3$ and $2$, +respectively, solved in succession to global optimality via a customized +branch-and-bound method. While branch-and-bound is often slow and unscalable, +this does not apply to TEAR as we propose novel bounding functions that are +tight and computationally efficient. Experiments on various datasets are +conducted to validate the scalability and efficiency of our method. + +
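+ The loss named in the title is simple to write down: for a candidate
+rotation R and translation t, each residual coordinate is truncated before
+being summed, which caps the influence of outliers. The sketch below evaluates
+that objective for a given (R, t); the paper's contribution, the decomposed
+branch-and-bound search over (R, t), is not reproduced here.
+
+import numpy as np
+
+def tear_loss(src, dst, R, t, c=0.05):
+    """Truncated entry-wise absolute residuals for 3D point pairs.
+    src, dst: (N, 3) corresponding points; c: truncation level per coordinate."""
+    residuals = np.abs(dst - (src @ R.T + t))      # (N, 3) entry-wise absolute residuals
+    return np.minimum(residuals, c).sum()          # truncation caps each entry's influence
+
+rng = np.random.default_rng(0)
+src = rng.normal(size=(1000, 3))
+R, t = np.eye(3), np.array([0.1, -0.2, 0.3])
+dst = src @ R.T + t
+dst[::10] += rng.normal(scale=5.0, size=dst[::10].shape)   # 10% gross outliers
+print(tear_loss(src, dst, R, t))                           # outliers contribute at most 3c each
+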
+
+ comment: 24 pages, 12 figures. Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ CoBra: Complementary Branch Fusing Class and Semantic Knowledge for + Robust Weakly Supervised Semantic Segmentation + + +
+ Leveraging semantically precise pseudo masks derived from image-level class
+knowledge for segmentation, namely image-level Weakly Supervised Semantic
+Segmentation (WSSS), remains challenging. While Class Activation Maps (CAMs)
+using CNNs have steadily been contributing to the success of WSSS, the
+resulting activation maps often narrowly focus on class-specific parts (e.g.,
+only the face of a person). On the other hand, recent works based on vision
+transformers (ViT) have shown promising results thanks to their self-attention
+mechanism, which captures the semantic parts, but they fail to capture complete
+class-specific details (e.g., covering the entire body of a person while also
+including a nearby dog). In this work, we propose Complementary Branch (CoBra),
+a novel dual branch framework consisting of two distinct architectures which
+provide valuable complementary class knowledge (from the CNN) and semantic
+knowledge (from the ViT) to each branch. In particular, we learn Class-Aware
+Projection (CAP) for the CNN branch and Semantic-Aware Projection (SAP) for the
+ViT branch to explicitly fuse their complementary knowledge and facilitate a
+new type of extra patch-level supervision. Our model, through CoBra, fuses the
+CNN's and ViT's complementary outputs to create robust pseudo masks that
+integrate both class and semantic information effectively. Extensive
+experiments qualitatively and quantitatively investigate how the CNN and ViT
+complement each other on the PASCAL VOC 2012 dataset, showing a
+state-of-the-art WSSS result. This includes not only the masks generated by our
+model, but also the segmentation results derived from utilizing these masks as
+pseudo labels.
+
+
+
+
+
+
+ + ♻ ☆ BIVDiff: A Training-Free Framework for General-Purpose Video Synthesis + via Bridging Image and Video Diffusion Models CVPR 2024 + + +
+ Diffusion models have made tremendous progress in text-driven image and video
+generation. Now text-to-image foundation models are widely applied to various
+downstream image synthesis tasks, such as controllable image generation and
+image editing, while downstream video synthesis tasks are less explored for
+several reasons. First, it requires huge memory and computation overhead to
+train a video generation foundation model. Even with video foundation models,
+additional costly training is still required for downstream video synthesis
+tasks. Second, although some works extend image diffusion models into videos in
+a training-free manner, temporal consistency cannot be well preserved. Finally,
+these adaptation methods are specifically designed for one task and fail to
+generalize to different tasks. To mitigate these issues, we propose a
+training-free general-purpose video synthesis framework, coined BIVDiff, via
+bridging specific image diffusion models and general text-to-video foundation
+diffusion models. Specifically, we first use a specific image diffusion model
+(e.g., ControlNet and Instruct Pix2Pix) for frame-wise video generation, then
+perform Mixed Inversion on the generated video, and finally input the inverted
+latents into the video diffusion models (e.g., VidRD and ZeroScope) for
+temporal smoothing. This decoupled framework enables flexible image model
+selection for different purposes with strong task generalization and high
+efficiency. To validate the effectiveness and general use of BIVDiff, we
+perform a wide range of video synthesis tasks, including controllable video
+generation, video editing, video inpainting, and outpainting.
+
+
+
+ comment: Accepted by CVPR 2024. Project page: https://bivdiff.github.io; + GitHub repository: https://github.com/MCG-NJU/BIVDiff +
+
+
+
+
+ 
+ ♻ ☆ Empowering Image Recovery: A Multi-Attention Approach
+
+
+ We propose Diverse Restormer (DART), a novel image restoration method that +effectively integrates information from various sources (long sequences, local +and global regions, feature dimensions, and positional dimensions) to address +restoration challenges. While Transformer models have demonstrated excellent +performance in image restoration due to their self-attention mechanism, they +face limitations in complex scenarios. Leveraging recent advancements in +Transformers and various attention mechanisms, our method utilizes customized +attention mechanisms to enhance overall performance. DART, our novel network +architecture, employs windowed attention to mimic the selective focusing +mechanism of human eyes. By dynamically adjusting receptive fields, it +optimally captures the fundamental features crucial for image resolution +reconstruction. Efficiency and performance balance are achieved through the +LongIR attention mechanism for long sequence image restoration. Integration of +attention mechanisms across feature and positional dimensions further enhances +the recovery of fine details. Evaluation across five restoration tasks +consistently positions DART at the forefront. Upon acceptance, we commit to +providing publicly accessible code and models to ensure reproducibility and +facilitate further research. + +
+
+ comment: 12 pages, 10 figures, 12 tables +
+
+
+
+
+ + ♻ ☆ TriSAM: Tri-Plane SAM for zero-shot cortical blood vessel segmentation + in VEM images + + +
+ While imaging techniques at macro and mesoscales have garnered substantial +attention and resources, microscale VEM imaging, capable of revealing intricate +vascular details, has lacked the necessary benchmarking infrastructure. In this +paper, we address a significant gap in the field of neuroimaging by introducing +the largest-to-date public benchmark, \textbf{BvEM}, designed specifically for +cortical blood vessel segmentation in volume electron microscopy (VEM) images. +Our BvEM benchmark is based on VEM image volumes from three mammal species: +adult mouse, macaque, and human. We standardized the resolution, addressed +imaging variations, and meticulously annotated blood vessels through +semi-automatic, manual, and quality control processes, ensuring high-quality 3D +segmentation. Furthermore, we developed a zero-shot cortical blood vessel +segmentation method named TriSAM, which leverages the powerful segmentation +model SAM for 3D segmentation. To extend SAM from 2D to 3D volume segmentation, +TriSAM employs a multi-seed tracking framework, leveraging the reliability of +certain image planes for tracking while using others to identify potential +turning points. This approach effectively achieves long-term 3D blood vessel +segmentation without model training or fine-tuning. Experimental results show +that TriSAM achieved superior performances on the BvEM benchmark across three +species. + +
+
+ comment: BvEM-Mouse can be visualized at: https://tinyurl.com/yc2s38x9 +
+
+
+
+
+ + ♻ ☆ GeRM: A Generalist Robotic Model with Mixture-of-experts for Quadruped + Robot + + +
+ Multi-task robot learning holds significant importance in tackling diverse
+and complex scenarios. However, current approaches are hindered by performance
+issues and difficulties in collecting training datasets. In this paper, we
+propose GeRM (Generalist Robotic Model). We utilize offline reinforcement
+learning to optimize data utilization strategies to learn from both
+demonstrations and sub-optimal data, thus surpassing the limitations of human
+demonstrations. Thereafter, we employ a transformer-based VLA network to
+process multi-modal inputs and output actions. By introducing the
+Mixture-of-Experts structure, GeRM allows faster inference with higher total
+model capacity, and thus resolves the issue of limited RL parameters, enhancing
+model performance in multi-task learning while controlling computational costs.
+Through a series of experiments, we demonstrate that GeRM outperforms other
+methods across all tasks, while also validating its efficiency in both training
+and inference processes. Additionally, we uncover its potential to acquire
+emergent skills. We also contribute the QUARD-Auto dataset, collected
+automatically to support our training approach and foster advancements in
+multi-task quadruped robot learning. This work presents a new paradigm for
+reducing the cost of collecting robot data and driving progress in the
+multi-task learning community. You can reach our project and video through the
+link: https://songwxuan.github.io/GeRM/.
+
+
+
+
+
+
+ + ♻ ☆ Exploring Recurrent Long-term Temporal Fusion for Multi-view 3D + Perception + + +
+ Long-term temporal fusion is a crucial but often overlooked technique in
+camera-based Bird's-Eye-View (BEV) 3D perception. Existing methods mostly fuse
+frames in a parallel manner. While parallel fusion can benefit from long-term
+information, it suffers from increasing computational and memory overheads as
+the fusion window size grows. Alternatively, BEVFormer adopts a recurrent
+fusion pipeline so that history information can be efficiently integrated, yet
+it fails to benefit from longer temporal frames. In this paper, we explore an
+embarrassingly simple long-term recurrent fusion strategy built upon the
+LSS-based methods and find that it already enjoys the merits of both sides,
+i.e., rich long-term information and an efficient fusion pipeline. A temporal
+embedding module is further proposed to improve the model's robustness against
+occasionally missed frames in practical scenarios. We name this simple but
+effective fusion pipeline VideoBEV. Experimental results on the nuScenes
+benchmark show that VideoBEV obtains strong performance on various camera-based
+3D perception tasks, including object detection (55.4\% mAP and 62.9\% NDS),
+segmentation (48.6\% vehicle mIoU), tracking (54.8\% AMOTA), and motion
+prediction (0.80m minADE and 0.463 EPA).
+
+
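+
+ A toy PyTorch sketch of one recurrent BEV fusion step is given below: the
+ history BEV feature is warped into the current ego frame and merged with the
+ current feature, so memory stays constant regardless of the temporal horizon.
+ The affine warp, the convolutional fusion layer, and the omission of the
+ temporal embedding are simplifying assumptions, not the VideoBEV architecture.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class RecurrentBEVFusion(nn.Module):
+    """Toy recurrent fusion of BEV features: the history map is warped to the
+    current ego frame and merged, keeping memory O(1) in the window length."""
+    def __init__(self, channels=64):
+        super().__init__()
+        self.fuse = nn.Conv2d(2 * channels, channels, kernel_size=3, padding=1)
+
+    def forward(self, cur_bev, hist_bev, ego_theta):
+        # ego_theta: (B, 2, 3) affine approximation of ego motion in BEV space
+        grid = F.affine_grid(ego_theta, cur_bev.shape, align_corners=False)
+        hist_warped = F.grid_sample(hist_bev, grid, align_corners=False)
+        return self.fuse(torch.cat([cur_bev, hist_warped], dim=1))
+
+# usage over a stream of frames
+fusion = RecurrentBEVFusion()
+hist = torch.zeros(1, 64, 128, 128)
+identity = torch.eye(2, 3).unsqueeze(0)       # no ego motion in this toy loop
+for _ in range(5):                            # arbitrarily long horizon
+    cur = torch.randn(1, 64, 128, 128)
+    hist = fusion(cur, hist, identity)
+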
+
+
+
+
+ + ♻ ☆ Ranni: Taming Text-to-Image Diffusion for Accurate Instruction Following + + +
+ Existing text-to-image (T2I) diffusion models usually struggle in +interpreting complex prompts, especially those with quantity, object-attribute +binding, and multi-subject descriptions. In this work, we introduce a semantic +panel as the middleware in decoding texts to images, supporting the generator +to better follow instructions. The panel is obtained through arranging the +visual concepts parsed from the input text by the aid of large language models, +and then injected into the denoising network as a detailed control signal to +complement the text condition. To facilitate text-to-panel learning, we come up +with a carefully designed semantic formatting protocol, accompanied by a +fully-automatic data preparation pipeline. Thanks to such a design, our +approach, which we call Ranni, manages to enhance a pre-trained T2I generator +regarding its textual controllability. More importantly, the introduction of +the generative middleware brings a more convenient form of interaction (i.e., +directly adjusting the elements in the panel or using language instructions) +and further allows users to finely customize their generation, based on which +we develop a practical system and showcase its potential in continuous +generation and chatting-based editing. Our project page is at +https://ranni-t2i.github.io/Ranni. + +
+
+
+
+
+ + ♻ ☆ TIM: A Time Interval Machine for Audio-Visual Action Recognition CVPR 2024 + + +
+ Diverse actions give rise to rich audio-visual signals in long videos. Recent +works showcase that the two modalities of audio and video exhibit different +temporal extents of events and distinct labels. We address the interplay +between the two modalities in long videos by explicitly modelling the temporal +extents of audio and visual events. We propose the Time Interval Machine (TIM) +where a modality-specific time interval poses as a query to a transformer +encoder that ingests a long video input. The encoder then attends to the +specified interval, as well as the surrounding context in both modalities, in +order to recognise the ongoing action. + We test TIM on three long audio-visual video datasets: EPIC-KITCHENS, +Perception Test, and AVE, reporting state-of-the-art (SOTA) for recognition. On +EPIC-KITCHENS, we beat previous SOTA that utilises LLMs and significantly +larger pre-training by 2.9% top-1 action recognition accuracy. Additionally, we +show that TIM can be adapted for action detection, using dense multi-scale +interval queries, outperforming SOTA on EPIC-KITCHENS-100 for most metrics, and +showing strong performance on the Perception Test. Our ablations show the +critical role of integrating the two modalities and modelling their time +intervals in achieving this performance. Code and models at: +https://github.com/JacobChalk/TIM + +
+
+ comment: Accepted to CVPR 2024. Project Webpage: + https://jacobchalk.github.io/TIM-Project +
+
+
+
+
+ + ♻ ☆ BOTH2Hands: Inferring 3D Hands from Both Text Prompts and Body Dynamics + + +
+ The recently emerging text-to-motion advances have spurred numerous attempts
+for convenient and interactive human motion generation. Yet, existing methods
+are largely limited to generating body motions only without considering the
+rich two-hand motions, let alone handling various conditions like body dynamics
+or texts. To break the data bottleneck, we propose BOTH57M, a novel multi-modal
+dataset for two-hand motion generation. Our dataset includes accurate motion
+tracking for the human body and hands and provides pair-wise finger-level hand
+annotations and body descriptions. We further provide a strong baseline method,
+BOTH2Hands, for the novel task: generating vivid two-hand motions from both
+implicit body dynamics and explicit text prompts. We first warm up two parallel
+body-to-hand and text-to-hand diffusion models and then utilize a
+cross-attention transformer for motion blending. Extensive experiments and
+cross-validations demonstrate the effectiveness of our approach and dataset for
+generating convincing two-hand motions from the hybrid body-and-textual
+conditions. Our dataset and code will be disseminated to the community for
+future research.
+
+
+
+
+
+
+ + ♻ ☆ Enhancing Breast Cancer Diagnosis in Mammography: Evaluation and + Integration of Convolutional Neural Networks and Explainable AI + + +
+ The study introduces an integrated framework combining Convolutional Neural
+Networks (CNNs) and Explainable Artificial Intelligence (XAI) for the enhanced
+diagnosis of breast cancer using the CBIS-DDSM dataset. Utilizing a fine-tuned
+ResNet50 architecture, our investigation not only provides effective
+differentiation of mammographic images into benign and malignant categories but
+also addresses the opaque "black-box" nature of deep learning models by
+employing XAI methodologies, namely Grad-CAM, LIME, and SHAP, to interpret CNN
+decision-making processes for healthcare professionals. Our methodology
+encompasses an elaborate data preprocessing pipeline and advanced data
+augmentation techniques to counteract dataset limitations; transfer learning
+using pre-trained networks such as VGG-16, DenseNet, and ResNet was also
+employed. A focal point of our study is the evaluation of XAI's effectiveness
+in interpreting model predictions, highlighted by utilising the Hausdorff
+measure to assess the alignment between AI-generated explanations and expert
+annotations quantitatively. This approach plays a critical role for XAI in
+promoting trustworthiness and ethical fairness in AI-assisted diagnostics. The
+findings from our research illustrate the effective collaboration between CNNs
+and XAI in advancing diagnostic methods for breast cancer, thereby facilitating
+a more seamless integration of advanced AI technologies within clinical
+settings. By enhancing the interpretability of AI-driven decisions, this work
+lays the groundwork for improved collaboration between AI systems and medical
+practitioners, ultimately enriching patient care. Furthermore, the implications
+of our research extend well beyond the current methodologies, advocating for
+subsequent inquiries into the integration of multimodal data and the refinement
+of AI explanations to satisfy the needs of clinical practice.
+
+
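+
+ For reference, a minimal Grad-CAM implementation with PyTorch hooks on a
+ ResNet50-style backbone is sketched below; the chosen target layer (layer4),
+ the two-class output, and the normalization are standard but illustrative
+ choices rather than the exact setup of this study.
+
+import torch
+import torch.nn.functional as F
+from torchvision.models import resnet50
+
+model = resnet50(weights=None, num_classes=2).eval()  # stand-in for the fine-tuned ResNet50
+acts, grads = {}, {}
+
+def save_activation(module, inputs, output):
+    acts["a"] = output                                  # feature maps of the block
+    output.register_hook(lambda g: grads.update(g=g))   # their gradient on backward
+
+model.layer4.register_forward_hook(save_activation)     # last conv block
+
+def grad_cam(x, class_idx):
+    """Coarse heatmap of the evidence for class_idx, Grad-CAM style."""
+    model.zero_grad()
+    model(x)[0, class_idx].backward()
+    weights = grads["g"].mean(dim=(2, 3), keepdim=True)  # GAP of the gradients
+    cam = F.relu((weights * acts["a"]).sum(dim=1, keepdim=True))
+    cam = F.interpolate(cam, size=x.shape[-2:], mode="bilinear",
+                        align_corners=False)
+    return (cam - cam.min()) / (cam.max() - cam.min() + 1e-8)
+
+x = torch.randn(1, 3, 224, 224)               # a preprocessed mammogram patch
+heatmap = grad_cam(x, class_idx=1)            # e.g., the "malignant" class
+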
+
+
+
+
+ + ♻ ☆ Learning Invariant Inter-pixel Correlations for Superpixel Generation + + +
+ Deep superpixel algorithms have made remarkable strides by substituting +hand-crafted features with learnable ones. Nevertheless, we observe that +existing deep superpixel methods, serving as mid-level representation +operations, remain sensitive to the statistical properties (e.g., color +distribution, high-level semantics) embedded within the training dataset. +Consequently, learnable features exhibit constrained discriminative capability, +resulting in unsatisfactory pixel grouping performance, particularly in +untrainable application scenarios. To address this issue, we propose the +Content Disentangle Superpixel (CDS) algorithm to selectively separate the +invariant inter-pixel correlations and statistical properties, i.e., style +noise. Specifically, We first construct auxiliary modalities that are +homologous to the original RGB image but have substantial stylistic variations. +Then, driven by mutual information, we propose the local-grid correlation +alignment across modalities to reduce the distribution discrepancy of +adaptively selected features and learn invariant inter-pixel correlations. +Afterwards, we perform global-style mutual information minimization to enforce +the separation of invariant content and train data styles. The experimental +results on four benchmark datasets demonstrate the superiority of our approach +to existing state-of-the-art methods, regarding boundary adherence, +generalization, and efficiency. Code and pre-trained model are available at +https://github.com/rookiie/CDSpixel. + +
+
+ comment: Accepted by AAAI24 +
+
+
+
+
+ + ♻ ☆ SDFR: Synthetic Data for Face Recognition Competition + + +
+ Large-scale face recognition datasets are collected by crawling the Internet
+and without individuals' consent, raising legal, ethical, and privacy concerns.
+With the recent advances in generative models, several works have recently
+proposed generating synthetic face recognition datasets to mitigate the
+concerns with web-crawled face recognition datasets. This paper presents the
+summary of the Synthetic Data for Face Recognition (SDFR) Competition held in
+conjunction with the 18th IEEE International Conference on Automatic Face and
+Gesture Recognition (FG 2024) and established to investigate the use of
+synthetic data for training face recognition models. The SDFR competition was
+split into two tasks, allowing participants to train face recognition systems
+using new synthetic datasets and/or existing ones. In the first task, the face
+recognition backbone was fixed and the dataset size was limited, while the
+second task provided almost complete freedom on the model backbone, the
+dataset, and the training pipeline. The submitted models were trained on
+existing and also new synthetic datasets and used clever methods to improve
+training with synthetic data. The submissions were evaluated and ranked on a
+diverse set of seven benchmarking datasets. The paper gives an overview of the
+submitted face recognition models and reports the achieved performance compared
+to baseline models trained on real and synthetic datasets. Furthermore, the
+evaluation of submissions is extended to bias assessment across different
+demographic groups. Lastly, an outlook on the current state of the research in
+training face recognition models using synthetic data is presented, and
+existing problems as well as potential future directions are also discussed.
+
+
+
+ comment: The 18th IEEE International Conference on Automatic Face and Gesture + Recognition (FG 2024) +
+
+
+
+
+ + ♻ ☆ PhysAvatar: Learning the Physics of Dressed 3D Avatars from Visual + Observations + + +
+ Modeling and rendering photorealistic avatars is of crucial importance in +many applications. Existing methods that build a 3D avatar from visual +observations, however, struggle to reconstruct clothed humans. We introduce +PhysAvatar, a novel framework that combines inverse rendering with inverse +physics to automatically estimate the shape and appearance of a human from +multi-view video data along with the physical parameters of the fabric of their +clothes. For this purpose, we adopt a mesh-aligned 4D Gaussian technique for +spatio-temporal mesh tracking as well as a physically based inverse renderer to +estimate the intrinsic material properties. PhysAvatar integrates a physics +simulator to estimate the physical parameters of the garments using +gradient-based optimization in a principled manner. These novel capabilities +enable PhysAvatar to create high-quality novel-view renderings of avatars +dressed in loose-fitting clothes under motions and lighting conditions not seen +in the training data. This marks a significant advancement towards modeling +photorealistic digital humans using physically based inverse rendering with +physics in the loop. Our project website is at: +https://qingqing-zhao.github.io/PhysAvatar + +
+
+ comment: Project Page: https://qingqing-zhao.github.io/PhysAvatar +
+
+
+
+
+ + ♻ ☆ Dense Video Object Captioning from Disjoint Supervision + + +
+ We propose a new task and model for dense video object captioning -- +detecting, tracking and captioning trajectories of objects in a video. This +task unifies spatial and temporal localization in video, whilst also requiring +fine-grained visual understanding that is best described by natural language. +We propose a unified model, and demonstrate how our end-to-end approach is more +accurate and temporally coherent than a multi-stage pipeline combining +state-of-the-art detection, tracking, and captioning models. Moreover, we +propose a training strategy based on a mixture of disjoint tasks, which allows +us to leverage diverse, large-scale datasets which supervise different parts of +our model. Although each pretraining task only provides weak supervision, they +are complementary and, when combined, result in noteworthy zero-shot ability +and serve as strong initialization for additional finetuning to further improve +accuracy. We carefully design new metrics capturing all components of our task, +and show how we can repurpose existing video grounding datasets (e.g. VidSTG +and VLN) for our new task. We show that our model improves upon a number of +strong baselines for this new task. Furthermore, we can apply our model to the +task of spatial grounding, outperforming prior state-of-the-art on VidSTG and +VLN, without explicitly training for it. Code is available at +https://github.com/google-research/scenic/tree/main/scenic/projects/densevoc. + +
+
+ comment: Code is available at + https://github.com/google-research/scenic/tree/main/scenic/projects/densevoc +
+
+
+
+
+ + ♻ ☆ Oriented Object Detection in Optical Remote Sensing Images using Deep + Learning: A Survey + + +
+ Oriented object detection is one of the most fundamental and challenging +tasks in remote sensing, aiming to locate and classify objects with arbitrary +orientations. Recent years have witnessed remarkable progress in oriented +object detection using deep learning techniques. Given the rapid development of +this field, this paper aims to provide a comprehensive survey of recent +advances in oriented object detection. To be specific, we first review the +technical evolution from horizontal object detection to oriented object +detection and summarize the specific challenges, including feature +misalignment, spatial misalignment, and periodicity of angle. Subsequently, we +further categorize existing methods into detection framework, oriented bounding +box (OBB) regression, and feature representations, and discuss how these +methods address the above challenges in detail. In addition, we cover several +publicly available datasets and performance evaluation protocols. Furthermore, +we provide a comprehensive comparison and analysis of state-of-the-art oriented +object detection methods. Toward the end of this paper, we discuss several +future directions for oriented object detection. + +
+
+
+
+
+ + ♻ ☆ PeerAiD: Improving Adversarial Distillation from a Specialized Peer + Tutor CVPR 2024 + + +
+ Adversarial robustness of the neural network is a significant concern when it
+is applied to security-critical domains. In this situation, adversarial
+distillation is a promising option which aims to distill the robustness of the
+teacher network to improve the robustness of a small student network. Previous
+works pretrain the teacher network to make it robust to the adversarial
+examples aimed at itself. However, the adversarial examples are dependent on
+the parameters of the target network. The fixed teacher network inevitably
+degrades its robustness against the unseen transferred adversarial examples
+which target the parameters of the student network in the adversarial
+distillation process. We propose PeerAiD to make a peer network learn the
+adversarial examples of the student network instead of adversarial examples
+aimed at itself. PeerAiD is an adversarial distillation method that trains the
+peer network and the student network simultaneously in order to make the peer
+network specialized for defending the student network. We observe that such
+peer networks surpass the robustness of a pretrained robust teacher network
+against student-attacked adversarial samples. With this peer network and
+adversarial distillation, PeerAiD achieves significantly higher robustness of
+the student network, improving AutoAttack (AA) accuracy by up to 1.66%p and the
+natural accuracy of the student network by up to 4.72%p with ResNet-18 on the
+TinyImageNet dataset.
+
+
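+
+ A condensed PyTorch sketch of the training loop implied above: adversarial
+ examples are crafted against the student, the peer is trained to classify
+ exactly those student-attacked samples, and the student is then distilled from
+ the peer on the same samples. The PGD settings, temperature, and loss
+ weighting are assumptions, not the official PeerAiD recipe.
+
+import torch
+import torch.nn.functional as F
+
+def pgd_against(model, x, y, eps=8 / 255, alpha=2 / 255, steps=10):
+    """Adversarial examples targeting `model` (here: the student)."""
+    x_adv = x + torch.empty_like(x).uniform_(-eps, eps)
+    for _ in range(steps):
+        x_adv = x_adv.detach().requires_grad_(True)
+        loss = F.cross_entropy(model(x_adv), y)
+        grad, = torch.autograd.grad(loss, x_adv)
+        x_adv = x_adv + alpha * grad.sign()
+        x_adv = (x + (x_adv - x).clamp(-eps, eps)).clamp(0, 1)
+    return x_adv.detach()
+
+def peeraid_step(student, peer, opt_s, opt_p, x, y, T=1.0):
+    x_adv = pgd_against(student, x, y)            # student-attacked samples
+    # peer specializes in defending the student's adversarial examples
+    peer_loss = F.cross_entropy(peer(x_adv), y)
+    opt_p.zero_grad(); peer_loss.backward(); opt_p.step()
+    # student learns from the (now specialized) peer on the same samples
+    with torch.no_grad():
+        target = F.softmax(peer(x_adv) / T, dim=1)
+    s_logits = student(x_adv)
+    student_loss = (F.cross_entropy(s_logits, y)
+                    + F.kl_div(F.log_softmax(s_logits / T, dim=1),
+                               target, reduction="batchmean"))
+    opt_s.zero_grad(); student_loss.backward(); opt_s.step()
+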
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Object Detectors in the Open Environment: Challenges, Solutions, and + Outlook + + +
+ With the emergence of foundation models, deep learning-based object detectors +have shown practical usability in closed set scenarios. However, for real-world +tasks, object detectors often operate in open environments, where crucial +factors (e.g., data distribution, objective) that influence model learning are +often changing. The dynamic and intricate nature of the open environment poses +novel and formidable challenges to object detectors. Unfortunately, current +research on object detectors in open environments lacks a comprehensive +analysis of their distinctive characteristics, challenges, and corresponding +solutions, which hinders their secure deployment in critical real-world +scenarios. This paper aims to bridge this gap by conducting a comprehensive +review and analysis of object detectors in open environments. We initially +identified limitations of key structural components within the existing +detection pipeline and propose the open environment object detector challenge +framework that includes four quadrants (i.e., out-of-domain, out-of-category, +robust learning, and incremental learning) based on the dimensions of the data +/ target changes. For each quadrant of challenges in the proposed framework, we +present a detailed description and systematic analysis of the overarching goals +and core difficulties, systematically review the corresponding solutions, and +benchmark their performance over multiple widely adopted datasets. In addition, +we engage in a discussion of open problems and potential avenues for future +research. This paper aims to provide a fresh, comprehensive, and systematic +understanding of the challenges and solutions associated with open-environment +object detectors, thus catalyzing the development of more solid applications in +real-world scenarios. A project related to this survey can be found at +https://github.com/LiangSiyuan21/OEOD_Survey. + +
+
+ comment: 37 pages, 17 figures +
+
+
+
+
+ + ♻ ☆ Carve3D: Improving Multi-view Reconstruction Consistency for Diffusion + Models with RL Finetuning CVPR 2024 + + +
+ Multi-view diffusion models, obtained by applying Supervised Finetuning (SFT) +to text-to-image diffusion models, have driven recent breakthroughs in +text-to-3D research. However, due to the limited size and quality of existing +3D datasets, they still suffer from multi-view inconsistencies and Neural +Radiance Field (NeRF) reconstruction artifacts. We argue that multi-view +diffusion models can benefit from further Reinforcement Learning Finetuning +(RLFT), which allows models to learn from the data generated by themselves and +improve beyond their dataset limitations during SFT. To this end, we introduce +Carve3D, an improved RLFT algorithm coupled with a novel Multi-view +Reconstruction Consistency (MRC) metric, to enhance the consistency of +multi-view diffusion models. To measure the MRC metric on a set of multi-view +images, we compare them with their corresponding NeRF renderings at the same +camera viewpoints. The resulting model, which we denote as Carve3DM, +demonstrates superior multi-view consistency and NeRF reconstruction quality +than existing models. Our results suggest that pairing SFT with Carve3D's RLFT +is essential for developing multi-view-consistent diffusion models, mirroring +the standard Large Language Model (LLM) alignment pipeline. Our code, training +and testing data, and video results are available at: +https://desaixie.github.io/carve-3d. + +
+
+ comment: 22 pages, 16 figures. Our code, training and testing data, and video + results are available at: https://desaixie.github.io/carve-3d. This paper has + been accepted to CVPR 2024. v2: incorporated changes from the CVPR 2024 + camera-ready version +
+
+
+
+
+ + ♻ ☆ Surface Reconstruction from Point Clouds via Grid-based Intersection + Prediction + + +
+ Surface reconstruction from point clouds is a crucial task in the fields of
+computer vision and computer graphics. SDF-based methods excel at
+reconstructing smooth meshes with minimal error and artefacts but struggle with
+representing open surfaces. On the other hand, UDF-based methods can
+effectively represent open surfaces but often introduce noise, leading to
+artefacts in the mesh. In this work, we propose a novel approach that directly
+predicts the intersection points between the line segments of point pairs and
+the implicit surface. To achieve this, we propose two modules, the Relative
+Intersection Module and the Sign Module, which take point-pair features as
+input. To preserve the continuity of the surface, we also integrate symmetry
+into the two modules, which means the position of the predicted intersection
+will not change even if the input order of the point pair changes. This method
+not only preserves the ability to represent open surfaces but also eliminates
+most artefacts on the mesh. Our approach demonstrates state-of-the-art
+performance on three datasets: ShapeNet, MGN, and ScanNet. The code will be
+made available upon acceptance.
+
+
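+
+ The order-invariance property can be illustrated with a small PyTorch sketch:
+ a symmetric branch decides whether the segment crosses the surface, while an
+ antisymmetric branch places the crossing along the segment, so swapping the
+ two points leaves the predicted intersection unchanged. The network sizes and
+ the specific parameterization are illustrative assumptions, not the paper's
+ modules.
+
+import torch
+import torch.nn as nn
+
+class PairIntersectionHead(nn.Module):
+    """Toy order-invariant point-pair head."""
+    def __init__(self, feat_dim=32):
+        super().__init__()
+        self.embed = nn.Sequential(nn.Linear(3 + feat_dim, 64), nn.ReLU(),
+                                   nn.Linear(64, 64))
+        self.sign_head = nn.Linear(64, 1)                # symmetric feature
+        self.offset_head = nn.Linear(64, 1, bias=False)  # odd in (e_a - e_b)
+
+    def forward(self, p_a, p_b, f_a, f_b):
+        e_a = self.embed(torch.cat([p_a, f_a], dim=-1))
+        e_b = self.embed(torch.cat([p_b, f_b], dim=-1))
+        crosses = torch.sigmoid(self.sign_head(0.5 * (e_a + e_b)))
+        offset = torch.tanh(self.offset_head(e_a - e_b))  # flips with order
+        point = 0.5 * (p_a + p_b) + 0.5 * offset * (p_b - p_a)
+        return point, crosses            # `point` is invariant to input order
+
+# quick check of the symmetry
+head = PairIntersectionHead()
+pa, pb = torch.randn(4, 3), torch.randn(4, 3)
+fa, fb = torch.randn(4, 32), torch.randn(4, 32)
+q1, _ = head(pa, pb, fa, fb)
+q2, _ = head(pb, pa, fb, fa)
+print(torch.allclose(q1, q2, atol=1e-5))   # True
+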
+
+
+
+
+ + ♻ ☆ Background Noise Reduction of Attention Map for Weakly Supervised + Semantic Segmentation + + +
+ In weakly-supervised semantic segmentation (WSSS) using only image-level +class labels, a problem with CNN-based Class Activation Maps (CAM) is that they +tend to activate the most discriminative local regions of objects. On the other +hand, methods based on Transformers learn global features but suffer from the +issue of background noise contamination. This paper focuses on addressing the +issue of background noise in attention weights within the existing WSSS method +based on Conformer, known as TransCAM. The proposed method successfully reduces +background noise, leading to improved accuracy of pseudo labels. Experimental +results demonstrate that our model achieves segmentation performance of 70.5% +on the PASCAL VOC 2012 validation data, 71.1% on the test data, and 45.9% on MS +COCO 2014 data, outperforming TransCAM in terms of segmentation performance. + +
+
+
+
+
+ + ♻ ☆ Improving the Accuracy-Robustness Trade-Off of Classifiers via Adaptive + Smoothing + + +
+ While prior research has proposed a plethora of methods that build neural
+classifiers robust against adversarial perturbations, practitioners are still
+reluctant to adopt them due to their unacceptably severe clean accuracy
+penalties. This paper significantly alleviates this accuracy-robustness
+trade-off by mixing the output probabilities of a standard classifier and a
+robust classifier, where the standard network is optimized for clean accuracy
+and is not robust in general. We show that the robust base classifier's
+confidence difference for correct and incorrect examples is the key to this
+improvement. In addition to providing intuitions and empirical evidence, we
+theoretically certify the robustness of the mixed classifier under realistic
+assumptions. Furthermore, we adapt an adversarial input detector into a mixing
+network that adaptively adjusts the mixture of the two base models, further
+reducing the accuracy penalty of achieving robustness. The proposed flexible
+method, termed "adaptive smoothing", can work in conjunction with existing or
+even future methods that improve clean accuracy, robustness, or adversary
+detection. Our empirical evaluation considers strong attack methods, including
+AutoAttack and adaptive attacks. On the CIFAR-100 dataset, our method achieves
+an 85.21% clean accuracy while maintaining a 38.72% $\ell_\infty$-AutoAttacked
+($\epsilon = 8/255$) accuracy, becoming the second most robust method on the
+RobustBench CIFAR-100 benchmark as of submission, while improving the clean
+accuracy by ten percentage points compared with all listed models. The code
+that implements our method is available at
+https://github.com/Bai-YT/AdaptiveSmoothing.
+
+
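+
+ The mixing rule itself is simple; a PyTorch sketch is shown below, where the
+ output probabilities of a standard and a robust classifier are convexly
+ combined and the mixing weight is produced per input by a small network (a
+ constant weight recovers the plain mixture). The mixing-network architecture
+ and the stand-in classifiers are assumptions for illustration.
+
+import torch
+import torch.nn as nn
+
+class MixedClassifier(nn.Module):
+    """Convex combination of a standard (accurate) and a robust classifier.
+    A small mixing network produces alpha per input, mimicking the adaptive
+    variant; a constant alpha recovers the simplest mixing rule."""
+    def __init__(self, std_model, robust_model):
+        super().__init__()
+        self.std, self.rob = std_model, robust_model
+        self.mixer = nn.Sequential(nn.Flatten(), nn.LazyLinear(64), nn.ReLU(),
+                                   nn.Linear(64, 1), nn.Sigmoid())
+
+    def forward(self, x):
+        p_std = torch.softmax(self.std(x), dim=1)
+        p_rob = torch.softmax(self.rob(x), dim=1)
+        alpha = self.mixer(x)                        # (B, 1), in (0, 1)
+        return (1 - alpha) * p_std + alpha * p_rob   # mixed class probabilities
+
+# toy usage with stand-in CIFAR-100-sized classifiers
+std = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 100))
+rob = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 100))
+probs = MixedClassifier(std, rob)(torch.rand(8, 3, 32, 32))  # rows sum to 1
+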
+
+
+
+
+ + ♻ ☆ SIR: Multi-view Inverse Rendering with Decomposable Shadow for Indoor + Scenes + + +
+ We propose SIR, an efficient method to decompose differentiable shadows for +inverse rendering on indoor scenes using multi-view data, addressing the +challenges in accurately decomposing the materials and lighting conditions. +Unlike previous methods that struggle with shadow fidelity in complex lighting +environments, our approach explicitly learns shadows for enhanced realism in +material estimation under unknown light positions. Utilizing posed HDR images +as input, SIR employs an SDF-based neural radiance field for comprehensive +scene representation. Then, SIR integrates a shadow term with a three-stage +material estimation approach to improve SVBRDF quality. Specifically, SIR is +designed to learn a differentiable shadow, complemented by BRDF regularization, +to optimize inverse rendering accuracy. Extensive experiments on both synthetic +and real-world indoor scenes demonstrate the superior performance of SIR over +existing methods in both quantitative metrics and qualitative analysis. The +significant decomposing ability of SIR enables sophisticated editing +capabilities like free-view relighting, object insertion, and material +replacement. The code and data are available at +https://xiaokangwei.github.io/SIR/. + +
+
+
+
+
+ + ♻ ☆ Toward Tiny and High-quality Facial Makeup with Data Amplify Learning + + +
+ Contemporary makeup approaches primarily hinge on unpaired learning
+paradigms, yet they grapple with the challenges of inaccurate supervision
+(e.g., face misalignment) and sophisticated facial prompts (including face
+parsing and landmark detection). These challenges prohibit low-cost deployment
+of facial makeup models, especially on mobile devices. To solve the above
+problems, we propose a brand-new learning paradigm, termed "Data Amplify
+Learning (DAL)," alongside a compact makeup model named "TinyBeauty." The core
+idea of DAL lies in employing a Diffusion-based Data Amplifier (DDA) to
+"amplify" limited images for model training, thereby enabling accurate
+pixel-to-pixel supervision with merely a handful of annotations. Two pivotal
+innovations in DDA facilitate the above training approach: (1) A Residual
+Diffusion Model (RDM) is designed to generate high-fidelity detail and
+circumvent the detail vanishing problem in the vanilla diffusion models; (2) A
+Fine-Grained Makeup Module (FGMM) is proposed to achieve precise makeup control
+and combination while retaining face identity. Coupled with DAL, TinyBeauty
+necessitates merely 80K parameters to achieve a state-of-the-art performance
+without intricate face prompts. Meanwhile, TinyBeauty achieves a remarkable
+inference speed of up to 460 fps on the iPhone 13. Extensive experiments show
+that DAL can produce highly competitive makeup models using only 5 image pairs.
+
+
+
+
+
+
+ + ♻ ☆ Harnessing Meta-Learning for Improving Full-Frame Video Stabilization CVPR 2024 + + +
+ Video stabilization is a longstanding computer vision problem; pixel-level
+synthesis solutions, which stabilize videos by synthesizing full frames, add
+further complexity to this task. The distinct mix of unique motion profiles and
+visual content present in each video sequence intensifies this complexity and
+makes robust generalization with fixed parameters difficult. In our study, we
+introduce a novel approach to enhance the performance of pixel-level synthesis
+solutions for video stabilization by adapting these models to individual input
+video sequences. The proposed adaptation exploits low-level visual cues
+accessible during test-time to improve both the stability and quality of
+resulting videos. We highlight the efficacy of our methodology of "test-time
+adaptation" through simple fine-tuning of one of these models, followed by
+significant stability gain via the integration of meta-learning techniques.
+Notably, significant improvement is achieved with only a single adaptation
+step. The versatility of the proposed algorithm is demonstrated by consistently
+improving the performance of various pixel-level synthesis models for video
+stabilization in real-world scenarios.
+
+
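+
+ A minimal PyTorch sketch of the test-time adaptation loop is given below: a
+ copy of the stabilizer is fine-tuned for a few steps on a self-supervised loss
+ computed from the test video itself and then used for synthesis. The
+ placeholder temporal-smoothness loss, step count, and learning rate are
+ assumptions; the meta-learning stage would additionally learn an
+ initialization from which such adaptation is effective.
+
+import copy
+import torch
+import torch.nn.functional as F
+
+def adapt_then_stabilize(model, frames, steps=1, lr=1e-5):
+    """frames: (T, C, H, W) frames of one test video. Fine-tunes a copy of
+    the stabilizer on a self-supervised loss computed from the test video
+    itself, then synthesizes the stabilized frames."""
+    adapted = copy.deepcopy(model)            # keep the base weights untouched
+    opt = torch.optim.Adam(adapted.parameters(), lr=lr)
+    for _ in range(steps):                    # a single step already helps
+        out = adapted(frames)                 # synthesized full frames
+        loss = F.l1_loss(out[1:], out[:-1])   # placeholder smoothness loss
+        opt.zero_grad()
+        loss.backward()
+        opt.step()
+    with torch.no_grad():
+        return adapted(frames)
+
+# toy usage with an identity-like stand-in "stabilizer"
+model = torch.nn.Conv2d(3, 3, kernel_size=3, padding=1)
+stabilized = adapt_then_stabilize(model, torch.rand(16, 3, 64, 64))
+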
+
+ comment: CVPR 2024, Code will be made available on:
+ http://github.com/MKashifAli/MetaVideoStab
+
+
+
+
+ + ♻ ☆ Detecting and Mitigating System-Level Anomalies of Vision-Based + Controllers + + +
+ Autonomous systems, such as self-driving cars and drones, have made +significant strides in recent years by leveraging visual inputs and machine +learning for decision-making and control. Despite their impressive performance, +these vision-based controllers can make erroneous predictions when faced with +novel or out-of-distribution inputs. Such errors can cascade to catastrophic +system failures and compromise system safety. In this work, we introduce a +run-time anomaly monitor to detect and mitigate such closed-loop, system-level +failures. Specifically, we leverage a reachability-based framework to +stress-test the vision-based controller offline and mine its system-level +failures. This data is then used to train a classifier that is leveraged online +to flag inputs that might cause system breakdowns. The anomaly detector +highlights issues that transcend individual modules and pertain to the safety +of the overall system. We also design a fallback controller that robustly +handles these detected anomalies to preserve system safety. We validate the +proposed approach on an autonomous aircraft taxiing system that uses a +vision-based controller for taxiing. Our results show the efficacy of the +proposed approach in identifying and handling system-level anomalies, +outperforming methods such as prediction error-based detection, and ensembling, +thereby enhancing the overall safety and robustness of autonomous systems. + +
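+
+ The run-time logic can be summarized with a short PyTorch sketch: a binary
+ monitor, trained offline on mined closed-loop failures, scores each
+ observation online, and a fallback controller takes over when the score
+ exceeds a threshold. The monitor architecture, threshold, and toy controllers
+ are placeholders for illustration.
+
+import torch
+import torch.nn as nn
+
+class AnomalyMonitor(nn.Module):
+    """Binary classifier over raw observations, trained offline on mined
+    closed-loop failures (label 1) versus safe runs (label 0)."""
+    def __init__(self):
+        super().__init__()
+        self.net = nn.Sequential(nn.Conv2d(3, 8, 3, stride=2, padding=1),
+                                 nn.ReLU(), nn.AdaptiveAvgPool2d(1),
+                                 nn.Flatten(), nn.Linear(8, 1))
+
+    def failure_prob(self, obs):
+        return torch.sigmoid(self.net(obs)).squeeze(1)
+
+def control_step(obs, vision_controller, fallback_controller, monitor, tau=0.5):
+    """Use the vision-based controller unless the monitor flags the
+    observation as likely to cause a system-level failure."""
+    with torch.no_grad():
+        if monitor.failure_prob(obs).item() > tau:
+            return fallback_controller(obs)        # conservative safe action
+        return vision_controller(obs)
+
+# toy usage
+monitor = AnomalyMonitor()
+vision_ctrl = lambda o: torch.tensor([0.3, 0.5])   # e.g., steering, throttle
+fallback = lambda o: torch.tensor([0.0, -1.0])     # e.g., slow down / stop
+action = control_step(torch.rand(1, 3, 96, 96), vision_ctrl, fallback, monitor)
+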
+
+
+
+
+ + ♻ ☆ Rich Human Feedback for Text-to-Image Generation CVPR'24 + + +
+ Recent Text-to-Image (T2I) generation models such as Stable Diffusion and +Imagen have made significant progress in generating high-resolution images +based on text descriptions. However, many generated images still suffer from +issues such as artifacts/implausibility, misalignment with text descriptions, +and low aesthetic quality. Inspired by the success of Reinforcement Learning +with Human Feedback (RLHF) for large language models, prior works collected +human-provided scores as feedback on generated images and trained a reward +model to improve the T2I generation. In this paper, we enrich the feedback +signal by (i) marking image regions that are implausible or misaligned with the +text, and (ii) annotating which words in the text prompt are misrepresented or +missing on the image. We collect such rich human feedback on 18K generated +images (RichHF-18K) and train a multimodal transformer to predict the rich +feedback automatically. We show that the predicted rich human feedback can be +leveraged to improve image generation, for example, by selecting high-quality +training data to finetune and improve the generative models, or by creating +masks with predicted heatmaps to inpaint the problematic regions. Notably, the +improvements generalize to models (Muse) beyond those used to generate the +images on which human feedback data were collected (Stable Diffusion variants). +The RichHF-18K data set will be released in our GitHub repository: +https://github.com/google-research/google-research/tree/master/richhf_18k. + +
+
+ comment: CVPR'24 +
+
+
+
+
+ + ♻ ☆ Feature Re-Embedding: Towards Foundation Model-Level Performance in + Computational Pathology CVPR2024 + + +
+ Multiple instance learning (MIL) is the most widely used framework in
+computational pathology, encompassing sub-typing, diagnosis, prognosis, and
+more. However, the existing MIL paradigm typically requires an offline instance
+feature extractor, such as a pre-trained ResNet or a foundation model. This
+approach lacks the capability for feature fine-tuning within the specific
+downstream tasks, limiting its adaptability and performance. To address this
+issue, we propose a Re-embedded Regional Transformer (R$^2$T) for re-embedding
+the instance features online, which captures fine-grained local features and
+establishes connections across different regions. Unlike existing works that
+focus on pre-training a powerful feature extractor or designing a sophisticated
+instance aggregator, R$^2$T is tailored to re-embed instance features online.
+It serves as a portable module that can seamlessly integrate into mainstream
+MIL models. Extensive experimental results on common computational pathology
+tasks validate that: 1) feature re-embedding improves the performance of MIL
+models based on ResNet-50 features to the level of foundation model features,
+and further enhances the performance of foundation model features; 2) the
+R$^2$T can introduce more significant performance improvements to various MIL
+models; 3) R$^2$T-MIL, as an R$^2$T-enhanced AB-MIL, outperforms other latest
+methods by a large margin. The code is available at:
+https://github.com/DearCaat/RRT-MIL.
+
+
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ Full-dose Whole-body PET Synthesis from Low-dose PET Using + High-efficiency Denoising Diffusion Probabilistic Model: PET Consistency + Model + + +
+ Objective: Positron Emission Tomography (PET) has been a commonly used
+imaging modality in broad clinical applications. One of the most important
+tradeoffs in PET imaging is between image quality and radiation dose: high
+image quality comes with high radiation exposure. Improving image quality is
+desirable for all clinical applications while minimizing radiation exposure is
+needed to reduce risk to patients. Approach: We introduce PET Consistency Model
+(PET-CM), an efficient diffusion-based method for generating high-quality
+full-dose PET images from low-dose PET images. It employs a two-step process,
+adding Gaussian noise to full-dose PET images in the forward diffusion, and
+then denoising them using a PET Shifted-window Vision Transformer (PET-VIT)
+network in the reverse diffusion. The PET-VIT network learns a consistency
+function that enables direct denoising of Gaussian noise into clean full-dose
+PET images. PET-CM achieves state-of-the-art image quality while requiring
+significantly less computation time than other methods. Results: In experiments
+comparing eighth-dose to full-dose images, PET-CM demonstrated impressive
+performance with NMAE of 1.278+/-0.122%, PSNR of 33.783+/-0.824dB, SSIM of
+0.964+/-0.009, NCC of 0.968+/-0.011, HRS of 4.543, and SUV Error of
+0.255+/-0.318%, with an average generation time of 62 seconds per patient. This
+is a significant improvement compared to the state-of-the-art diffusion-based
+model, with PET-CM reaching this result 12x faster. Similarly, in the
+quarter-dose to full-dose image experiments, PET-CM delivered competitive
+outcomes, achieving an NMAE of 0.973+/-0.066%, PSNR of 36.172+/-0.801dB, SSIM
+of 0.984+/-0.004, NCC of 0.990+/-0.005, HRS of 4.428, and SUV Error of
+0.151+/-0.192% using the same generation process, underlining its high
+quantitative and clinical precision in both denoising scenarios.
+
+
+
+
+
+
+ + ♻ ☆ SocialCounterfactuals: Probing and Mitigating Intersectional Social + Biases in Vision-Language Models with Counterfactual Examples CVPR 2024 + + +
+ While vision-language models (VLMs) have achieved remarkable performance
+improvements recently, there is growing evidence that these models also possess
+harmful biases with respect to social attributes such as gender and race. Prior
+studies have primarily focused on probing such bias attributes individually
+while ignoring biases associated with intersections between social attributes.
+This could be due to the difficulty of collecting an exhaustive set of
+image-text pairs for various combinations of social attributes. To address this
+challenge, we employ text-to-image diffusion models to produce counterfactual
+examples for probing intersectional social biases at scale. Our approach
+utilizes Stable Diffusion with cross attention control to produce sets of
+counterfactual image-text pairs that are highly similar in their depiction of a
+subject (e.g., a given occupation) while differing only in their depiction of
+intersectional social attributes (e.g., race & gender). Through our
+over-generate-then-filter methodology, we produce SocialCounterfactuals, a
+high-quality dataset containing 171k image-text pairs for probing
+intersectional biases related to gender, race, and physical characteristics. We
+conduct extensive experiments to demonstrate the usefulness of our generated
+dataset for probing and mitigating intersectional social biases in
+state-of-the-art VLMs.
+
+
+
+ comment: Accepted to CVPR 2024. arXiv admin note: text overlap with + arXiv:2310.02988 +
+
+
+
+
+ + ♻ ☆ Better Monocular 3D Detectors with LiDAR from the Past ICRA 2024 + + +
+ Accurate 3D object detection is crucial to autonomous driving. Though +LiDAR-based detectors have achieved impressive performance, the high cost of +LiDAR sensors precludes their widespread adoption in affordable vehicles. +Camera-based detectors are cheaper alternatives but often suffer inferior +performance compared to their LiDAR-based counterparts due to inherent depth +ambiguities in images. In this work, we seek to improve monocular 3D detectors +by leveraging unlabeled historical LiDAR data. Specifically, at inference time, +we assume that the camera-based detectors have access to multiple unlabeled +LiDAR scans from past traversals at locations of interest (potentially from +other high-end vehicles equipped with LiDAR sensors). Under this setup, we +proposed a novel, simple, and end-to-end trainable framework, termed +AsyncDepth, to effectively extract relevant features from asynchronous LiDAR +traversals of the same location for monocular 3D detectors. We show consistent +and significant performance gain (up to 9 AP) across multiple state-of-the-art +models and datasets with a negligible additional latency of 9.66 ms and a small +storage cost. + +
+
+ comment: Accepted by ICRA 2024. The code can be found at + https://github.com/YurongYou/AsyncDepth +
+
+
+
+
+ + ♻ ☆ $λ$-ECLIPSE: Multi-Concept Personalized Text-to-Image Diffusion + Models by Leveraging CLIP Latent Space + + +
+ Despite the recent advances in personalized text-to-image (P-T2I) generative +models, it remains challenging to perform finetuning-free multi-subject-driven +T2I in a resource-efficient manner. Predominantly, contemporary approaches, +involving the training of Hypernetworks and Multimodal Large Language Models +(MLLMs), require heavy computing resources that range from 600 to 12300 GPU +hours of training. These subject-driven T2I methods hinge on Latent Diffusion +Models (LDMs), which facilitate T2I mapping through cross-attention layers. +While LDMs offer distinct advantages, P-T2I methods' reliance on the latent +space of these diffusion models significantly escalates resource demands, +leading to inconsistent results and necessitating numerous iterations for a +single desired image. In this paper, we present $\lambda$-ECLIPSE, an +alternative prior-training strategy that works in the latent space of a +pre-trained CLIP model without relying on the diffusion UNet models. +$\lambda$-ECLIPSE leverages the image-text interleaved pre-training for fast +and effective multi-subject-driven P-T2I. Through extensive experiments, we +establish that $\lambda$-ECLIPSE surpasses existing baselines in composition +alignment while preserving concept alignment performance, even with +significantly lower resource utilization. $\lambda$-ECLIPSE performs +multi-subject driven P-T2I with just 34M parameters and is trained on a mere 74 +GPU hours. Additionally, $\lambda$-ECLIPSE demonstrates the unique ability to +perform multi-concept interpolations. + +
+
+ comment: Project page: https://eclipse-t2i.github.io/Lambda-ECLIPSE/ +
+
+
+
+
+ + ♻ ☆ Quilt-LLaVA: Visual Instruction Tuning by Extracting Localized + Narratives from Open-Source Histopathology Videos + + +
+ Diagnosis in histopathology requires global analysis of whole slide images
+(WSIs), requiring pathologists to compound evidence from different WSI patches.
+The gigapixel scale of WSIs poses a challenge for histopathology multi-modal
+models. Training multi-modal models for histopathology requires instruction
+tuning datasets, which currently contain information for individual image
+patches, without a spatial grounding of the concepts within each patch and
+without a wider view of the WSI. Therefore, they lack sufficient diagnostic
+capacity for histopathology. To bridge this gap, we introduce Quilt-Instruct, a
+large-scale dataset of 107,131 histopathology-specific instruction
+question/answer pairs, grounded within diagnostically relevant image patches
+that make up the WSI. Our dataset is collected by leveraging educational
+histopathology videos from YouTube, which provides spatial localization of
+narrations by automatically extracting the narrators' cursor positions.
+Quilt-Instruct supports contextual reasoning by extracting diagnosis and
+supporting facts from the entire WSI. Using Quilt-Instruct, we train
+Quilt-LLaVA, which can reason beyond the given single image patch, enabling
+diagnostic reasoning across patches. To evaluate Quilt-LLaVA, we propose a
+comprehensive evaluation dataset created from 985 images and 1283
+human-generated question-answers. We also thoroughly evaluate Quilt-LLaVA using
+public histopathology datasets, where Quilt-LLaVA significantly outperforms
+SOTA by over 10% on relative GPT-4 score and 4% and 9% on open and closed set
+VQA. Our code, data, and model are publicly accessible at
+quilt-llava.github.io.
+
+
+
+
+
+
+ + ♻ ☆ Mitigating the Impact of Attribute Editing on Face Recognition + + +
+ Through a large-scale study over diverse face images, we show that facial +attribute editing using modern generative AI models can severely degrade +automated face recognition systems. This degradation persists even with +identity-preserving generative models. To mitigate this issue, we propose two +novel techniques for local and global attribute editing. We empirically ablate +twenty-six facial semantic, demographic and expression-based attributes that +have been edited using state-of-the-art generative models, and evaluate them +using ArcFace and AdaFace matchers on CelebA, CelebAMaskHQ and LFW datasets. +Finally, we use LLaVA, an emerging visual question-answering framework for +attribute prediction to validate our editing techniques. Our methods outperform +the current state-of-the-art at facial editing (BLIP, InstantID) while +improving identity retention by a significant extent. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Diffusion based Zero-shot Medical Image-to-Image Translation for Cross + Modality Segmentation + + +
+ Cross-modality image segmentation aims to segment the target modalities using
+a method designed in the source modality. Deep generative models can translate
+the target modality images into the source modality, thus enabling
+cross-modality segmentation. However, a vast body of existing cross-modality
+image translation methods relies on supervised learning. In this work, we aim
+to address the challenge of zero-shot learning-based image translation tasks
+(an extreme scenario in which the target modality is unseen during the training
+phase). To leverage generative learning for zero-shot cross-modality image
+segmentation, we propose a novel unsupervised image translation method. The
+framework learns to translate the unseen source image to the target modality
+for image segmentation by leveraging the inherent statistical consistency
+between different modalities for diffusion guidance. Our framework captures
+identical cross-modality features in the statistical domain, offering diffusion
+guidance without relying on direct mappings between the source and target
+domains. This advantage allows our method to adapt to changing source domains
+without the need for retraining, making it highly practical when sufficient
+labeled source domain data is not available. The proposed framework is
+validated in zero-shot cross-modality image segmentation tasks through
+empirical comparisons with influential generative models, including
+adversarial-based and diffusion-based models.
+
+
+
+ comment: Neurips 2023 Diffusion Workshop +
+
+
+
+
+ + ♻ ☆ Local Neighborhood Features for 3D Classification + + +
+ With advances in deep learning model training strategies, the training of
+point cloud classification methods is significantly improving. For example,
+PointNeXt, which adopts prominent training techniques and InvResNet layers into
+PointNet++, achieves over 7% improvement on the real-world ScanObjectNN
+dataset. However, most of these models use point coordinate features of
+neighborhood points mapped to a higher-dimensional space, while ignoring the
+neighborhood point features computed before being fed to the network layers. In
+this paper, we revisit the PointNeXt model to study the usage and benefit of
+such neighborhood point features. We train and evaluate PointNeXt on ModelNet40
+(synthetic), ScanObjectNN (real-world), and a recent large-scale, real-world
+grocery dataset, i.e., 3DGrocery100. In addition, we provide an additional
+inference strategy of weight averaging the top two checkpoints of PointNeXt to
+improve classification accuracy. Together with the abovementioned ideas, we
+gain 0.5%, 1%, 4.8%, 3.4%, and 1.6% overall accuracy on the PointNeXt model
+with real-world datasets, ScanObjectNN (hardest variant), 3DGrocery100's
+Apple10, Fruits, Vegetables, and Packages subsets, respectively. We also
+achieve a comparable 0.2% accuracy gain on ModelNet40.
+
+
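+
+ The checkpoint weight-averaging inference strategy mentioned above amounts to
+ element-wise averaging of two saved state dicts, as in the PyTorch sketch
+ below; the checkpoint paths and the model constructor are hypothetical
+ placeholders.
+
+import torch
+
+def average_checkpoints(model, ckpt_path_a, ckpt_path_b):
+    """Load two checkpoints and install the element-wise mean of their
+    weights, following the 'average the top two checkpoints' strategy."""
+    sd_a = torch.load(ckpt_path_a, map_location="cpu")
+    sd_b = torch.load(ckpt_path_b, map_location="cpu")
+    averaged = {k: (sd_a[k].float() + sd_b[k].float()) / 2 for k in sd_a}
+    model.load_state_dict(averaged)
+    return model
+
+# usage (constructor and paths are hypothetical placeholders)
+# model = build_pointnext_model()
+# model = average_checkpoints(model, "ckpt_best_1.pth", "ckpt_best_2.pth")
+# model.eval()
+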
+
+
+
+
+ + ♻ ☆ Two-Person Interaction Augmentation with Skeleton Priors + + +
+ Close and continuous interaction with rich contacts is a crucial aspect of +human activities (e.g. hugging, dancing) and of interest in many domains like +activity recognition, motion prediction, character animation, etc. However, +acquiring such skeletal motion is challenging. While direct motion capture is +expensive and slow, motion editing/generation is also non-trivial, as complex +contact patterns with topological and geometric constraints have to be +retained. To this end, we propose a new deep learning method for two-body +skeletal interaction motion augmentation, which can generate variations of +contact-rich interactions with varying body sizes and proportions while +retaining the key geometric/topological relations between two bodies. Our +system can learn effectively from a relatively small amount of data and +generalize to drastically different skeleton sizes. Through exhaustive +evaluation and comparison, we show it can generate high-quality motions, has +strong generalizability and outperforms traditional optimization-based methods +and alternative deep learning solutions. + +
+
+
+
+
+ + ♻ ☆ A dataset of over one thousand computed tomography scans of battery + cells + + +
+ Battery technology is increasingly important for global electrification +efforts. However, batteries are highly sensitive to small manufacturing +variations that can induce reliability or safety issues. An important +technology for battery quality control is computed tomography (CT) scanning, +which is widely used for non-destructive 3D inspection across a variety of +clinical and industrial applications. Historically, however, the utility of CT +scanning for high-volume manufacturing has been limited by its low throughput +as well as the difficulty of handling its large file sizes. In this work, we +present a dataset of over one thousand CT scans of as-produced commercially +available batteries. The dataset spans various chemistries (lithium-ion and +sodium-ion) as well as various battery form factors (cylindrical, pouch, and +prismatic). We evaluate seven different battery types in total. The +manufacturing variability and the presence of battery defects can be observed +via this dataset. This dataset may be of interest to scientists and engineers +working on battery technology, computer vision, or both. + +
+
+
+
+
+ + ♻ ☆ TAM-VT: Transformation-Aware Multi-scale Video Transformer for + Segmentation and Tracking + + +
+ Video Object Segmentation (VOS) has emerged as an increasingly important +problem with the availability of larger datasets and more complex and realistic +settings, which involve long videos with global motion (e.g., in egocentric +settings), depicting small objects undergoing both rigid and non-rigid +(including state) deformations. While a number of recent approaches have been +explored for this task, these data characteristics still present challenges. In +this work we propose a novel, clip-based DETR-style encoder-decoder +architecture, which focuses on systematically analyzing and addressing the +aforementioned challenges. Specifically, we propose a novel +transformation-aware loss that focuses learning on portions of the video where +an object undergoes significant deformations -- a form of "soft" hard example +mining. Further, we propose a multiplicative time-coded memory, beyond vanilla +additive positional encoding, which helps propagate context across long videos. +Finally, we incorporate these in our proposed holistic multi-scale video +transformer for tracking via multi-scale memory matching and decoding to ensure +sensitivity and accuracy for long videos and small objects. Our model enables +on-line inference with long videos in a windowed fashion, by breaking the video +into clips and propagating context among them. We illustrate that short clip +length and longer memory with learned time-coding are important design choices +for improved performance. Collectively, these technical contributions enable +our model to achieve new state-of-the-art (SoTA) performance on two complex +egocentric datasets -- VISOR and VOST, while achieving results comparable to SoTA +on the conventional VOS benchmark, DAVIS'17. A series of detailed +ablations validate our design choices as well as provide insights into the +importance of parameter choices and their impact on performance. + +
+
+
+
+
+ + ♻ ☆ Lane Change Classification and Prediction with Action Recognition + Networks ECCV2022 + + +
+ Anticipating lane change intentions of surrounding vehicles is crucial for +efficient and safe driving decision making in an autonomous driving system. +Previous works often adopt physical variables such as driving speed, +acceleration and so forth for lane change classification. However, physical +variables do not contain semantic information. Although 3D CNNs have been +developing rapidly, few methods utilise action recognition models +and appearance features for lane change recognition, and they all require +additional information to pre-process data. In this work, we propose an +end-to-end framework including two action recognition methods for lane change +recognition, using video data collected by cameras. Our method achieves the +best lane change classification results using only the RGB video data of the +PREVENTION dataset. Class activation maps demonstrate that action recognition +models can efficiently extract lane change motions. A method to better extract +motion clues is also proposed in this paper. + +
+
+ comment: Accepted to ECCV2022 AVVISION +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Robotics 57 + +
+
+
+ + ☆ Humanoid-Gym: Reinforcement Learning for Humanoid Robot with Zero-Shot + Sim2Real Transfer + + +
+ Humanoid-Gym is an easy-to-use reinforcement learning (RL) framework based on +Nvidia Isaac Gym, designed to train locomotion skills for humanoid robots, +emphasizing zero-shot transfer from simulation to the real-world environment. +Humanoid-Gym also integrates a sim-to-sim framework from Isaac Gym to Mujoco +that allows users to verify the trained policies in different physical +simulations to ensure the robustness and generalization of the policies. This +framework is verified by RobotEra's XBot-S (1.2-meter tall humanoid robot) and +XBot-L (1.65-meter tall humanoid robot) in a real-world environment with +zero-shot sim-to-real transfer. The project website and source code can be +found at: https://sites.google.com/view/humanoid-gym/. + +
+
+
+
+
+ + ☆ OtterROS: Picking and Programming an Uncrewed Surface Vessel for + Experimental Field Robotics Research with ROS 2 ICRA 2024 + + +
+ There exist a wide range of options for field robotics research using ground +and aerial mobile robots, but there are comparatively few robust and +research-ready uncrewed surface vessels (USVs). This workshop paper starts with +a snapshot of USVs currently available to the research community and then +describes "OtterROS", an open source ROS 2 solution for the Otter USV. Field +experiments using OtterROS are described, which highlight the utility of the +Otter USV and the benefits of using ROS 2 in aquatic robotics research. For +those interested in USV research, the paper details recommended hardware to run +OtterROS and includes an example ROS 2 package using OtterROS, removing +unnecessary non-recurring engineering from field robotics research activities. + +
+
+ comment: 8 pages, 6 figures. Complete package details at + https://github.com/offroad-robotics/otter_ros. Submitted to Workshop on Field + Robotics at ICRA 2024 +
+
+
+
+
+ + ☆ Robust Control using Control Lyapunov Function and Hamilton-Jacobi + Reachability + + +
+ The paper presents a robust control technique that combines the Control +Lyapunov function and Hamilton-Jacobi Reachability to compute a controller and +its Region of Attraction (ROA). The Control Lyapunov function uses a linear +system model with an assumed additive uncertainty to calculate a control gain +and the level sets of the ROA as a function of the uncertainty. Next, +Hamilton-Jacobi reachability uses the nonlinear model with the modeled +uncertainty, which need not be additive, to compute the backward reachable set +(BRS). Finally, by juxtaposing the level sets of the ROA with the BRS, we can +calculate the worst-case additive disturbance and the ROA of the nonlinear +model. We illustrate our approach on a 2D quadcopter tracking a trajectory and a +2D quadcopter with height and velocity regulation in simulation. + +
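To make the Lyapunov-function ingredient concrete, here is a minimal sketch (not the paper's pipeline) that computes a quadratic Lyapunov candidate for a made-up linearized model via LQR and numerically verifies its decrease; combining the resulting sublevel sets with a Hamilton-Jacobi backward reachable set is the part specific to the paper.

```python
import numpy as np
from scipy.linalg import solve_continuous_are

# Placeholder linearized model x_dot = A x + B u (a double integrator).
A = np.array([[0.0, 1.0],
              [0.0, 0.0]])
B = np.array([[0.0],
              [1.0]])
Q, R = np.eye(2), np.eye(1)

# LQR: P solves the continuous-time algebraic Riccati equation, and
# V(x) = x^T P x is a Lyapunov function for the closed loop u = -K x.
P = solve_continuous_are(A, B, Q, R)
K = np.linalg.solve(R, B.T @ P)
A_cl = A - B @ K

# Check V_dot = x^T (A_cl^T P + P A_cl) x < 0 for all x != 0.
M = A_cl.T @ P + P @ A_cl
assert np.all(np.linalg.eigvalsh(M) < 0)

# Sublevel sets {x : x^T P x <= c} give candidate ROA level sets; accounting
# for the additive disturbance bound and the nonlinear dynamics is what the
# paper adds on top via Hamilton-Jacobi reachability.
print("P =\n", P)
```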
+
+
+
+
+ + ☆ Learning Prehensile Dexterity by Imitating and Emulating State-only + Observations RA-L + + +
+ When humans learn physical skills (e.g., learn to play tennis), we tend to +first observe and learn what an expert is doing. But this is often +insufficient. Therefore, we subsequently engage in practice, where we try to +emulate the expert. Inspired by this observation, we introduce Combining +IMitation and Emulation for Motion Refinement (CIMER) -- a two-stage framework +to learn dexterous prehensile manipulation skills from state-only observations. +CIMER's first stage involves imitation: simultaneously encode the complex +interdependent motions of the robot hand and the object in a structured +dynamical system. This results in a reactive motion generation policy that +provides a reasonable motion prior, but lacks the ability to reason about +contact effects due to the lack of action labels. The second stage involves +emulation: learn a motion refinement policy to make adjustments to the motion +prior of the robot hand such that the desired object motion is reenacted. CIMER +is both task-agnostic (no task-specific reward design or shaping) and +intervention-free (no need for additional teleoperated or labeled +demonstrations). Detailed experiments reveal that i) Imitation alone is +insufficient, but adding emulation drastically improves performance, ii) CIMER +outperforms existing methods in terms of sample efficiency and the ability to +generate realistic and stable motions, iii) CIMER can either zero-shot +generalize or learn to adapt to novel objects from the YCB dataset, even +outperforming expert policies trained with action labels in most cases. + +
+
+ comment: Under review by RA-L +
+
+
+
+
+ + ☆ Design and Simulation of Time-energy Optimal Anti-swing Trajectory + Planner for Autonomous Tower Cranes + + +
+ For autonomous crane lifting, optimal trajectories of the crane are required +as reference inputs to the crane controller to facilitate feedforward control. +Reducing the unactuated payload motion is a crucial issue for under-actuated +tower cranes with spherical pendulum dynamics. The planned trajectory should be +optimal in terms of both operating time and energy consumption, to facilitate +optimum output spending optimum effort. This article proposes an anti-swing +tower crane trajectory planner that can provide time-energy optimal solutions +for the Computer-Aided Lift Planning (CALP) system developed at Nanyang +Technological University, which facilitates collision-free lifting path +planning of robotized tower cranes in autonomous construction sites. The +current work introduces a trajectory planning module to the system that +utilizes the geometric outputs from the path planning module and optimally +scales them with time information. Firstly, analyzing the non-linear dynamics +of the crane operations, the tower crane is established as differentially flat. +Subsequently, the multi-objective trajectory optimization problems for all the +crane operations are formulated in the flat output space through consideration +of the mechanical and safety constraints. Two multi-objective evolutionary +algorithms, namely Non-dominated Sorting Genetic Algorithm (NSGA-II) and +Generalized Differential Evolution 3 (GDE3), are extensively compared via +statistical measures based on the closeness of solutions to the Pareto front, +distribution of solutions in the solution space and the runtime, to select the +optimization engine of the planner. Finally, the crane operation trajectories +are obtained via the corresponding planned flat output trajectories. Studies +simulating real-world lifting scenarios are conducted to verify the +effectiveness and reliability of the proposed module of the lift planning +system. + +
+
+ comment: 18 pages, 12 figures, 9 tables +
+
+
+
+
+ + ☆ Robust STL Control Synthesis under Maximal Disturbance Sets + + +
+ This work addresses maximally robust control synthesis under unknown +disturbances. We consider a general nonlinear system, subject to a Signal +Temporal Logic (STL) specification, and wish to jointly synthesize the maximal +possible disturbance bounds and the corresponding controllers that ensure the +STL specification is satisfied under these bounds. Many works have considered +STL satisfaction under given bounded disturbances. Yet, to the authors' best +knowledge, this is the first work that aims to maximize the permissible +disturbance set and find the corresponding controllers that ensure satisfying +the STL specification with maximum disturbance robustness. We extend the notion +of disturbance-robust semantics for STL, which is a property of a +specification, dynamical system, and controller, and provide an algorithm to +get the maximal disturbance robust controllers satisfying an STL specification +using Hamilton-Jacobi reachability. We show its soundness and provide a +simulation example with an Autonomous Underwater Vehicle (AUV). + +
+
+ comment: 8 pages, 3 figures +
+
+
+
+
+ + ☆ Taming Transformers for Realistic Lidar Point Cloud Generation + + +
+ Diffusion Models (DMs) have achieved State-Of-The-Art (SOTA) results in the +Lidar point cloud generation task, benefiting from their stable training and +iterative refinement during sampling. However, DMs often fail to realistically +model Lidar raydrop noise due to their inherent denoising process. To retain +the strength of iterative sampling while enhancing the generation of raydrop +noise, we introduce LidarGRIT, a generative model that uses auto-regressive +transformers to iteratively sample the range images in the latent space rather +than image space. Furthermore, LidarGRIT utilises VQ-VAE to separately decode +range images and raydrop masks. Our results show that LidarGRIT achieves +superior performance compared to SOTA models on KITTI-360 and KITTI odometry +datasets. Code available at: https://github.com/hamedhaghighi/LidarGRIT. + +
+
+
+
+
+ + ☆ A Hessian for Gaussian Mixture Likelihoods in Nonlinear Least Squares + + +
+ This paper proposes a novel Hessian approximation for Maximum a Posteriori +estimation problems in robotics involving Gaussian mixture likelihoods. The +proposed Hessian leads to better convergence properties. Previous approaches +manipulate the Gaussian mixture likelihood into a form that allows the problem +to be represented as a nonlinear least squares (NLS) problem. However, they +result in an inaccurate Hessian approximation due to additional nonlinearities +that are not accounted for in NLS solvers. The proposed Hessian approximation +is derived by setting the Hessians of the Gaussian mixture component errors to +zero, which is the same starting point as for the Gauss-Newton Hessian +approximation for NLS, and using the chain rule to account for additional +nonlinearities. The proposed Hessian approximation is more accurate, resulting +in improved convergence properties that are demonstrated on simulated and +real-world experiments. A method to maintain compatibility with existing +solvers, such as ceres, is also presented. Accompanying software and +supplementary material can be found at +https://github.com/decargroup/hessian_sum_mixtures. + +
+
+ comment: 8 pages, 2 figures. Submitted to IEEE Robotics and Automation Letters +
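As background for the abstract above, the objective whose curvature is being approximated is a Gaussian-mixture negative log-likelihood. A toy NumPy version for a scalar error, with a finite-difference gradient check, is shown below; it does not reproduce the paper's derivation or its proposed Hessian.

```python
import numpy as np
from scipy.special import logsumexp

def gm_negative_log_likelihood(e, means, sigmas, weights):
    """Negative log of sum_k w_k N(e; mu_k, sigma_k^2) for scalar error(s) e."""
    e = np.atleast_1d(e)
    log_comps = (np.log(weights)
                 - 0.5 * np.log(2.0 * np.pi * sigmas ** 2)
                 - 0.5 * ((e[:, None] - means) ** 2) / sigmas ** 2)
    return -logsumexp(log_comps, axis=1)

# Toy two-component mixture.
means = np.array([0.0, 2.0])
sigmas = np.array([1.0, 0.5])
weights = np.array([0.7, 0.3])

# Curvature information is what Gauss-Newton-style approximations target;
# here we just finite-difference the gradient of the NLL at a test error e0.
e0, h = 0.5, 1e-5
grad = (gm_negative_log_likelihood(e0 + h, means, sigmas, weights)
        - gm_negative_log_likelihood(e0 - h, means, sigmas, weights)) / (2 * h)
print("NLL:", gm_negative_log_likelihood(e0, means, sigmas, weights)[0],
      "dNLL/de:", grad[0])
```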
+
+
+
+
+ + ☆ The Open Autonomy Safety Case Framework + + +
+ A system safety case is a compelling, comprehensible, and valid argument +about the satisfaction of the safety goals of a given system operating in a +given environment supported by convincing evidence. Since the publication of UL +4600 in 2020, safety cases have become a best practice for measuring, managing, +and communicating the safety of autonomous vehicles (AVs). Although UL 4600 +provides guidance on how to build the safety case for an AV, the complexity of +AVs and their operating environments, the novelty of the used technology, the +need for complying with various regulations and technical standards, and for +addressing cybersecurity concerns and ethical considerations make the +development of safety cases for AVs challenging. To this end, safety case +frameworks have been proposed that bring strategies, argument templates, and +other guidance together to support the development of a safety case. This paper +introduces the Open Autonomy Safety Case Framework, developed over years of +work with the autonomous vehicle industry, as a roadmap for how AVs can be +deployed safely and responsibly. + +
+
+
+
+
+ + Residual Chain Prediction for Autonomous Driving Path Planning + + +
+ In the rapidly evolving field of autonomous driving systems, the refinement +of path planning algorithms is paramount for navigating vehicles through +dynamic environments, particularly in complex urban scenarios. Traditional path +planning algorithms, which are heavily reliant on static rules and manually +defined parameters, often fall short in such contexts, highlighting the need +for more adaptive, learning-based approaches. Among these, behavior cloning +emerges as a noteworthy strategy for its simplicity and efficiency, especially +within the realm of end-to-end path planning. However, behavior cloning faces +challenges, such as covariate shift when employing the traditional Manhattan +distance as the metric. Addressing this, our study introduces the novel concept +of Residual Chain Loss. Residual Chain Loss dynamically adjusts the loss +calculation process to enhance the temporal dependency and accuracy of +predicted path points, significantly improving the model's performance without +additional computational overhead. Through testing on the nuScenes dataset, we +underscore the method's substantial advancements in addressing covariate shift, +facilitating dynamic loss adjustments, and ensuring seamless integration with +end-to-end path planning frameworks. Our findings highlight the potential of +Residual Chain Loss to revolutionize the planning component of autonomous driving +systems, marking a significant step forward in the quest for level 5 autonomous +driving systems. + +
+
+ comment: 6 pages, 2 figures +
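The abstract does not spell out the loss formula. Purely as an illustration of what a "chained" waypoint loss could look like (an assumption, not the paper's definition), one might predict per-step displacements, accumulate them so that early errors propagate to later waypoints, and penalize the accumulated path:

```python
import torch
import torch.nn.functional as F

def chained_waypoint_loss(pred_residuals: torch.Tensor,
                          gt_waypoints: torch.Tensor,
                          start: torch.Tensor) -> torch.Tensor:
    """Illustrative chained path loss (hypothetical, not the paper's formula).

    pred_residuals: (B, T, 2) predicted per-step displacements
    gt_waypoints:   (B, T, 2) ground-truth future path points
    start:          (B, 2)    current ego position
    """
    # Chain the residuals: waypoint t is the running sum of displacements up
    # to t, so the loss couples each prediction to all earlier steps.
    pred_waypoints = start.unsqueeze(1) + torch.cumsum(pred_residuals, dim=1)
    return F.l1_loss(pred_waypoints, gt_waypoints)

# Toy call with random tensors standing in for network output and labels.
loss = chained_waypoint_loss(torch.randn(4, 6, 2), torch.randn(4, 6, 2), torch.zeros(4, 2))
print(loss.item())
```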
+
+
+
+
+ + ☆ Semi-Supervised Novelty Detection for Precise Ultra-Wideband Error + Signal Prediction + + +
+ Ultra-Wideband (UWB) technology is an emerging low-cost solution for +localization in a generic environment. However, UWB signal can be affected by +signal reflections and non-line-of-sight (NLoS) conditions between anchors; +hence, in a broader sense, the specific geometry of the environment and the +disposition of obstructing elements in the map may drastically hinder the +reliability of UWB for precise robot localization. This work aims to mitigate +this problem by learning a map-specific characterization of the UWB quality +signal with a fingerprint semi-supervised novelty detection methodology. An +unsupervised autoencoder neural network is trained on nominal UWB map +conditions, and then it is used to predict errors derived from the introduction +of perturbing novelties in the environment. This work poses a step change in +the understanding of UWB localization and its reliability in evolving +environmental conditions. The resulting performance of the proposed method is +proved by fine-grained experiments obtained with a visual tracking ground +truth. + +
+
+
+
+
+ + ☆ Non-linear Model Predictive Control for Multi-task GPS-free Autonomous + Navigation in Vineyards + + +
+ Autonomous navigation is the foundation of agricultural robots. This paper +focuses on developing an advanced autonomous navigation system for a rover +operating within row-based crops. A position-agnostic system is proposed to +address the challenging situation when standard localization methods, like GPS, +fail due to unfavorable weather or obstructed signals. This breakthrough is +especially vital in densely vegetated regions, including areas covered by thick +tree canopies or pergola vineyards. This work proposes a novel system that +leverages a single RGB-D camera and a Non-linear Model Predictive Control +strategy to navigate through entire rows, adapting to various crop spacing. The +presented solution demonstrates versatility in handling diverse crop densities, +environmental factors, and multiple navigation tasks to support agricultural +activities with an extremely cost-effective implementation. Experimental +validation in simulated and real vineyards underscores the system's robustness +and competitiveness in both standard row traversal and target object approach. + +
+
+
+
+
+ + ☆ GPS-free Autonomous Navigation in Cluttered Tree Rows with Deep Semantic + Segmentation + + +
+ Segmentation-based autonomous navigation has recently been presented as an +appealing approach to guiding robotic platforms through crop rows without +requiring perfect GPS localization. Nevertheless, current techniques are +restricted to situations where the distinct separation between the plants and +the sky allows for the identification of the row's center. However, tall, dense +vegetation, such as high tree rows and orchards, is the primary cause of GPS +signal blockage. In this study, we increase the overall robustness and +adaptability of the control algorithm by extending the segmentation-based +robotic guidance to those cases where canopies and branches occlude the sky and +prevent the utilization of GPS and earlier approaches. An efficient Deep Neural +Network architecture has been used to address semantic segmentation, performing +the training with synthetic data only. Numerous vineyards and tree fields have +undergone extensive testing in both simulation and the real world to show the +solution's competitive benefits. + +
+
+
+
+
+ + ☆ Stochastic Online Optimization for Cyber-Physical and Robotic Systems + + +
+ We propose a novel gradient-based online optimization framework for solving +stochastic programming problems that frequently arise in the context of +cyber-physical and robotic systems. Our problem formulation accommodates +constraints that model the evolution of a cyber-physical system, which has, in +general, a continuous state and action space, is nonlinear, and where the state +is only partially observed. We also incorporate an approximate model of the +dynamics as prior knowledge into the learning process and show that even rough +estimates of the dynamics can significantly improve the convergence of our +algorithms. Our online optimization framework encompasses both gradient descent +and quasi-Newton methods, and we provide a unified convergence analysis of our +algorithms in a non-convex setting. We also characterize the impact of modeling +errors in the system dynamics on the convergence rate of the algorithms. +Finally, we evaluate our algorithms in simulations of a flexible beam, a +four-legged walking robot, and in real-world experiments with a ping-pong +playing robot. + +
+
+ comment: 46 pages, 16 figures +
+
+
+
+
+ + ☆ CLIPping the Limits: Finding the Sweet Spot for Relevant Images in + Automated Driving Systems Perception Testing + + +
+ Perception systems, especially cameras, are the eyes of automated driving +systems. Ensuring that they function reliably and robustly is therefore an +important building block in the automation of vehicles. There are various +approaches to test the perception of automated driving systems. Ultimately, +however, it always comes down to the investigation of the behavior of +perception systems under specific input data. Camera images are a crucial part +of the input data. Image data sets are therefore collected for the testing of +automated driving systems, but it is non-trivial to find specific images in +these data sets. Thanks to recent developments in neural networks, there are +now methods for sorting the images in a data set according to their similarity +to a prompt in natural language. In order to further automate the provision of +search results, we make a contribution by automating the threshold definition +in these sorted results and returning only the images relevant to the prompt as +a result. Our focus is on preventing false positives and false negatives +equally. It is also important that our method is robust and in the case that +our assumptions are not fulfilled, we provide a fallback solution. + +
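The automatic-threshold step can be illustrated independently of the underlying vision-language model: given one similarity score per image for a prompt, split the scores into a "relevant" cluster and a "rest" cluster, for example with an Otsu-style split. This is an illustrative stand-in; the paper's actual criterion and fallback logic are not reproduced here.

```python
import numpy as np

def otsu_threshold(scores: np.ndarray, bins: int = 256) -> float:
    """Pick the cut that maximizes between-class variance of the score histogram."""
    hist, edges = np.histogram(scores, bins=bins)
    hist = hist.astype(float) / hist.sum()
    centers = 0.5 * (edges[:-1] + edges[1:])
    best_t, best_var = edges[0], -1.0
    for i in range(1, bins):
        w0, w1 = hist[:i].sum(), hist[i:].sum()
        if w0 == 0 or w1 == 0:
            continue
        mu0 = (hist[:i] * centers[:i]).sum() / w0
        mu1 = (hist[i:] * centers[i:]).sum() / w1
        var_between = w0 * w1 * (mu0 - mu1) ** 2
        if var_between > best_var:
            best_var, best_t = var_between, edges[i]
    return best_t

# Toy similarity scores: a relevant cluster near 0.32, a background cluster near 0.18.
rng = np.random.default_rng(0)
scores = np.concatenate([rng.normal(0.32, 0.02, 50), rng.normal(0.18, 0.03, 500)])
t = otsu_threshold(scores)
relevant = np.where(scores >= t)[0]
print(f"threshold={t:.3f}, {len(relevant)} images kept")
```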
+
+
+
+
+ + ☆ Human Detection from 4D Radar Data in Low-Visibility Field Conditions ICRA 2024 + + +
+ Autonomous driving technology is increasingly being used on public roads and +in industrial settings such as mines. While it is essential to detect +pedestrians, vehicles, or other obstacles, adverse field conditions negatively +affect the performance of classical sensors such as cameras or lidars. Radar, +on the other hand, is a promising modality that is less affected by, e.g., +dust, smoke, water mist or fog. In particular, modern 4D imaging radars provide +target responses across the range, vertical angle, horizontal angle and Doppler +velocity dimensions. We propose TMVA4D, a CNN architecture that leverages this +4D radar modality for semantic segmentation. The CNN is trained to distinguish +between the background and person classes based on a series of 2D projections +of the 4D radar data that include the elevation, azimuth, range, and Doppler +velocity dimensions. We also outline the process of compiling a novel dataset +consisting of data collected in industrial settings with a car-mounted 4D radar +and describe how the ground-truth labels were generated from reference thermal +images. Using TMVA4D on this dataset, we achieve an mIoU score of 78.2% and an +mDice score of 86.1%, evaluated on the two classes, background and person. + +
+
+ comment: Submitted to Radar in Robotics workshop at ICRA 2024 +
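The reported mIoU and mDice are standard segmentation metrics; for reference, a minimal NumPy implementation for the two-class case (background vs. person) looks like this:

```python
import numpy as np

def iou_and_dice(pred: np.ndarray, gt: np.ndarray, num_classes: int = 2):
    """Per-class IoU and Dice, averaged, for integer label maps of identical shape."""
    ious, dices = [], []
    for c in range(num_classes):
        p, g = pred == c, gt == c
        inter = np.logical_and(p, g).sum()
        union = np.logical_or(p, g).sum()
        ious.append(inter / union if union else np.nan)
        denom = p.sum() + g.sum()
        dices.append(2 * inter / denom if denom else np.nan)
    return np.nanmean(ious), np.nanmean(dices)

# Toy example on a 4x4 label map (0 = background, 1 = person).
pred = np.array([[0, 0, 1, 1]] * 4)
gt   = np.array([[0, 1, 1, 1]] * 4)
miou, mdice = iou_and_dice(pred, gt)
print(f"mIoU={miou:.3f}, mDice={mdice:.3f}")
```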
+
+
+
+
+ + ☆ Online Learning of Joint-Muscle Mapping Using Vision in Tendon-driven + Musculoskeletal Humanoids + + +
+ The body structures of tendon-driven musculoskeletal humanoids are complex, +and accurate modeling is difficult, because they are made by imitating the body +structures of human beings. For this reason, we have not been able to move them +accurately like ordinary humanoids driven by actuators in each axis, and large +internal muscle tension and slack of tendon wires have emerged due to the model +error between the geometric model and the actual robot. Therefore, we construct +a joint-muscle mapping (JMM) using a neural network (NN), which expresses a +nonlinear relationship between joint angles and muscle lengths, and aim to move +tendon-driven musculoskeletal humanoids accurately by updating the JMM online +from data of the actual robot. In this study, the JMM is updated online by +using the vision of the robot so that it moves to the correct position (Vision +Updater). Also, we execute another update to modify muscle antagonisms +correctly (Antagonism Updater). By using these two updaters, the error between +the target and actual joint angles decreases to about 40% in 5 minutes, and we +show through a manipulation experiment that the tendon-driven musculoskeletal +humanoid Kengoro becomes able to move as intended. This novel system can adapt +to the state change and growth of robots, because it updates the JMM online +successively. + +
+
+ comment: Accepted at IEEE Robotics and Automation Letters, 2018 +
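A stripped-down sketch of the joint-muscle mapping idea — a small network regressing muscle lengths from joint angles, updated online from measurements — might look like the following. The layer sizes, joint/muscle counts, and optimizer are placeholders, and the vision- and antagonism-based updaters from the paper are not modeled.

```python
import torch
import torch.nn as nn

class JointMuscleMapping(nn.Module):
    """Tiny MLP mapping joint angles -> muscle lengths (illustrative sizes)."""
    def __init__(self, n_joints: int = 3, n_muscles: int = 8):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_joints, 64), nn.ReLU(),
            nn.Linear(64, 64), nn.ReLU(),
            nn.Linear(64, n_muscles),
        )

    def forward(self, q: torch.Tensor) -> torch.Tensor:
        return self.net(q)

jmm = JointMuscleMapping()
opt = torch.optim.Adam(jmm.parameters(), lr=1e-3)

def online_update(q_measured: torch.Tensor, lengths_measured: torch.Tensor) -> float:
    """One online step from a batch of (joint angle, muscle length) samples."""
    opt.zero_grad()
    loss = nn.functional.mse_loss(jmm(q_measured), lengths_measured)
    loss.backward()
    opt.step()
    return loss.item()

# Toy update with random 'sensor' data standing in for the real robot.
print(online_update(torch.randn(16, 3), torch.randn(16, 8)))
```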
+
+
+
+
+ + ☆ Long-time Self-body Image Acquisition and its Application to the Control + of Musculoskeletal Structures + + +
+ The tendon-driven musculoskeletal humanoid has many benefits that human +beings have, but the modeling of its complex muscle and bone structures is +difficult and conventional model-based controls cannot realize intended +movements. Therefore, a learning control mechanism that acquires nonlinear +relationships between joint angles, muscle tensions, and muscle lengths from +the actual robot is necessary. In this study, we propose a system which runs +the learning control mechanism for a long time to keep the self-body image of +the musculoskeletal humanoid correct at all times. Also, we show that the +musculoskeletal humanoid can conduct position control, torque control, and +variable stiffness control using this self-body image. We conduct a long-time +self-body image acquisition experiment lasting 3 hours, evaluate variable +stiffness control using the self-body image, etc., and discuss the superiority +and practicality of the self-body image acquisition of musculoskeletal +structures, comprehensively. + +
+
+ comment: Accepted at IEEE Robotics and Automation Letters, 2019 +
+
+
+
+
+ + ☆ Long-horizon Locomotion and Manipulation on a Quadrupedal Robot with + Large Language Models + + +
+ We present a large language model (LLM) based system to empower quadrupedal +robots with problem-solving abilities for long-horizon tasks beyond short-term +motions. Long-horizon tasks for quadrupeds are challenging since they require +both a high-level understanding of the semantics of the problem for task +planning and a broad range of locomotion and manipulation skills to interact +with the environment. Our system builds a high-level reasoning layer with large +language models, which generates hybrid discrete-continuous plans as robot code +from task descriptions. It comprises multiple LLM agents: a semantic planner +for sketching a plan, a parameter calculator for predicting arguments in the +plan, and a code generator to convert the plan into executable robot code. At +the low level, we adopt reinforcement learning to train a set of motion +planning and control skills to unleash the flexibility of quadrupeds for rich +environment interactions. Our system is tested on long-horizon tasks that are +infeasible to complete with one single skill. Simulation and real-world +experiments show that it successfully figures out multi-step strategies and +demonstrates non-trivial behaviors, including building tools or notifying a +human for help. + +
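The three-agent structure (semantic planner, parameter calculator, code generator) can be sketched with a placeholder LLM call; everything below — the `call_llm` stub, the prompts, and skill names such as `walk_to` — is hypothetical glue, not the paper's implementation.

```python
def call_llm(prompt: str) -> str:
    """Placeholder for a real LLM client call; returns a canned string here."""
    return f"<LLM response to: {prompt[:40]}...>"

def plan_long_horizon_task(task_description: str) -> str:
    # 1) Semantic planner: sketch a step-by-step plan from the task description.
    plan = call_llm(f"Break this quadruped task into high-level steps:\n{task_description}")
    # 2) Parameter calculator: fill in continuous arguments (targets, forces, gaits).
    params = call_llm(f"For the plan below, propose numeric parameters per step:\n{plan}")
    # 3) Code generator: emit robot code that calls the learned low-level skills.
    robot_code = call_llm(
        "Convert the plan and parameters into Python calls to skills such as "
        f"walk_to(x, y), push(object), press(button):\n{plan}\n{params}"
    )
    return robot_code

print(plan_long_horizon_task("Press the door button and push the box into the next room."))
```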
+
+
+
+
+ + ☆ Online Self-body Image Acquisition Considering Changes in Muscle Routes + Caused by Softness of Body Tissue for Tendon-driven Musculoskeletal Humanoids IROS2018 + + +
+ Tendon-driven musculoskeletal humanoids have many benefits in terms of the +flexible spine, multiple degrees of freedom, and variable stiffness. At the +same time, because of its body complexity, there are problems in +controllability. First, due to the large difference between the actual robot +and its geometric model, it cannot move as intended and large internal muscle +tension may emerge. Second, movements which do not appear as changes in muscle +lengths may emerge, because of the muscle route changes caused by softness of +body tissue. To solve these problems, we construct two models: ideal +joint-muscle model and muscle-route change model, using a neural network. We +initialize these models by a man-made geometric model and update them online +using the sensor information of the actual robot. We validate that the +tendon-driven musculoskeletal humanoid Kengoro is able to obtain a correct +self-body image through several experiments. + +
+
+ comment: Accepted at IROS2018 +
+
+
+
+
+ + ☆ Robust Anthropomorphic Robotic Manipulation through Biomimetic + Distributed Compliance + + +
+ The impressive capabilities of humans to robustly perform manipulation rely +on compliant interactions, enabled through the structure and materials +spatially distributed in our hands. We propose that by mimicking this distributed +compliance in an anthropomorphic robotic hand, the open-loop manipulation +robustness increases, and we observe the emergence of human-like behaviours. To +achieve this, we introduce the ADAPT Hand equipped with tunable compliance +throughout the skin, fingers, and the wrist. Through extensive automated +pick-and-place tests, we show the grasping robustness closely mirrors an +estimated geometric theoretical limit, while `stress-testing' the robot hand to +perform 800+ grasps. Finally, 24 items with largely varying geometries are +grasped in a constrained environment with a success rate of 93\%. We +demonstrate that the hand-object self-organization behavior underlies this extreme +robustness, where the hand automatically exhibits different grasp types +depending on object geometries. Furthermore, the robot grasp type mimics a +natural human grasp with a direct similarity of 68\%. + +
+
+
+
+
+ + ☆ SAFE-GIL: SAFEty Guided Imitation Learning + + +
+ Behavior Cloning is a popular approach to Imitation Learning, in which a +robot observes an expert supervisor and learns a control policy. However, +behavior cloning suffers from the "compounding error" problem - the policy +errors compound as it deviates from the expert demonstrations and might lead to +catastrophic system failures, limiting its use in safety-critical applications. +On-policy data aggregation methods are able to address this issue at the cost +of rolling out and repeated training of the imitation policy, which can be +tedious and computationally prohibitive. We propose SAFE-GIL, an off-policy +behavior cloning method that guides the expert via adversarial disturbance +during data collection. The algorithm abstracts the imitation error as an +adversarial disturbance in the system dynamics, injects it during data +collection to expose the expert to safety critical states, and collects +corrective actions. Our method biases training to more closely replicate expert +behavior in safety-critical states and allows more variance in less critical +states. We compare our method with several behavior cloning techniques and +DAgger on autonomous navigation and autonomous taxiing tasks and show higher +task success and safety, especially in low data regimes where the likelihood of +error is higher, at a slight drop in the performance. + +
+
+
+
+
+ + ☆ Collision-Free Trajectory Optimization in Cluttered Environments with + Sums-of-Squares Programming + + +
+ In this work, we propose a trajectory optimization approach for robot +navigation in cluttered 3D environments. We represent the robot's geometry as a +semialgebraic set defined by polynomial inequalities such that robots with +general shapes can be suitably characterized. To address the robot navigation +task in obstacle-dense environments, we exploit the free space directly to +construct a sequence of free regions, and allocate each waypoint on the +trajectory to a specific region. Then, we incorporate a uniform scaling factor +for each free region, and formulate a Sums-of-Squares (SOS) optimization +problem that renders the containment relationship between the robot and the +free space computationally tractable. The SOS optimization problem is further +reformulated to a semidefinite program (SDP), and the collision-free +constraints are shown to be equivalent to limiting the scaling factor along the +entire trajectory. In this context, the robot at a specific configuration is +tailored to stay within the free region. Next, to solve the trajectory +optimization problem with the proposed safety constraints (which are implicitly +dependent on the robot configurations), we derive the analytical solution to +the gradient of the minimum scaling factor with respect to the robot +configuration. As a result, this seamlessly facilitates the use of +gradient-based methods in efficient solving of the trajectory optimization +problem. Through a series of simulations and real-world experiments, the +proposed trajectory optimization approach is validated in various challenging +scenarios, and the results demonstrate its effectiveness in generating +collision-free trajectories in dense and intricate environments populated with +obstacles. + +
+
+
+
+
+ + ☆ MeSA-DRL: Memory-Enhanced Deep Reinforcement Learning for Advanced + Socially Aware Robot Navigation in Crowded Environments + + +
+ Autonomous navigation capabilities play a critical role in service robots +operating in environments where human interactions are pivotal, due to the +dynamic and unpredictable nature of these environments. However, the +variability in human behavior presents a substantial challenge for robots in +predicting and anticipating movements, particularly in crowded scenarios. To +address this issue, a memory-enabled deep reinforcement learning framework is +proposed for autonomous robot navigation in diverse pedestrian scenarios. The +proposed framework leverages long-term memory to retain essential information +about the surroundings and model sequential dependencies effectively. The +importance of human-robot interactions is also encoded to assign higher +attention to these interactions. A global planning mechanism is incorporated +into the memory-enabled architecture. Additionally, a multi-term reward system +is designed to prioritize and encourage long-sighted robot behaviors by +incorporating dynamic warning zones. Simultaneously, it promotes smooth +trajectories and minimizes the time taken to reach the robot's desired goal. +Extensive simulation experiments show that the suggested approach outperforms +representative state-of-the-art methods, showcasing its ability to improve navigation +efficiency and safety in real-world scenarios. + +
+
+
+
+
+ + ☆ LGSDF: Continual Global Learning of Signed Distance Fields Aided by + Local Updating + + +
+ Implicit reconstruction of ESDF (Euclidean Signed Distance Field) involves +training a neural network to regress the signed distance from any point to the +nearest obstacle, which has the advantages of lightweight storage and +continuous querying. However, existing algorithms usually rely on conflicting +raw observations as training data, resulting in poor map performance. In this +paper, we propose LGSDF, an ESDF continual Global learning algorithm aided by +Local updating. At the front end, axis-aligned grids are dynamically updated by +pre-processed sensor observations, where incremental fusion alleviates +estimation error caused by limited viewing directions. At the back end, a +randomly initialized implicit ESDF neural network performs continual +self-supervised learning guided by these grids to generate smooth and +continuous maps. The results on multiple scenes show that LGSDF can construct +more accurate ESDF maps and meshes compared with SOTA (State Of The Art) +explicit and implicit mapping algorithms. The source code of LGSDF is publicly +available at https://github.com/BIT-DYN/LGSDF. + +
+
+
+
+
+ + ☆ Rendering-Enhanced Automatic Image-to-Point Cloud Registration for + Roadside Scenes + + +
+ Prior point cloud provides 3D environmental context, which enhances the +capabilities of a monocular camera in downstream vision tasks, such as 3D object +detection, via data fusion. However, the absence of accurate and automated +registration methods for estimating camera extrinsic parameters in roadside +scene point clouds notably constrains the potential applications of roadside +cameras. This paper proposes a novel approach for the automatic registration +between prior point clouds and images from roadside scenes. The main idea +involves rendering photorealistic grayscale views taken at specific +perspectives from the prior point cloud with the help of their features like +RGB or intensity values. These generated views can reduce the modality +differences between images and prior point clouds, thereby improving the +robustness and accuracy of the registration results. Particularly, we specify +an efficient algorithm, named neighbor rendering, for the rendering process. +Then we introduce a method for automatically estimating the initial guess using +only rough guesses of the camera's position. Finally, we propose a procedure for +iteratively refining the extrinsic parameters by minimizing the reprojection +error for line features extracted from both generated and camera images using +the Segment Anything Model (SAM). We assess our method using a self-collected +dataset, comprising eight cameras strategically positioned throughout the +university campus. Experiments demonstrate our method's capability to +automatically align the prior point cloud with the roadside camera image, achieving a +rotation accuracy of 0.202 degrees and a translation precision of 0.079m. +Furthermore, we validate our approach's effectiveness in visual applications by +substantially improving monocular 3D object detection performance. + +
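The refinement objective above is a reprojection error. A minimal version for point features is sketched below; the paper uses line features, which would need a point-to-line pixel distance instead, and the intrinsics and poses here are toy placeholders.

```python
import numpy as np

def reprojection_error(points_3d, pixels_2d, K, R, t):
    """Mean pixel error of 3D points projected with intrinsics K and extrinsics (R, t)."""
    cam = R @ points_3d.T + t[:, None]     # 3 x N points in the camera frame
    proj = K @ cam
    uv = (proj[:2] / proj[2:]).T           # N x 2 projected pixel coordinates
    return np.linalg.norm(uv - pixels_2d, axis=1).mean()

# Toy check: points projected with the true pose reproject with (near) zero error.
K = np.array([[800.0, 0.0, 320.0], [0.0, 800.0, 240.0], [0.0, 0.0, 1.0]])
R, t = np.eye(3), np.array([0.0, 0.0, 0.0])
pts = np.array([[0.1, 0.2, 5.0], [-0.3, 0.1, 7.0]])
px = (K @ pts.T / pts.T[2]).T[:, :2]
print(reprojection_error(pts, px, K, R, t))   # ~0.0
```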
+
+
+
+
+ + ☆ STITCH: Augmented Dexterity for Suture Throws Including Thread + Coordination and Handoffs + + +
+ We present STITCH: an augmented dexterity pipeline that performs Suture +Throws Including Thread Coordination and Handoffs. STITCH iteratively performs +needle insertion, thread sweeping, needle extraction, suture cinching, needle +handover, and needle pose correction with failure recovery policies. We +introduce a novel visual 6D needle pose estimation framework using a stereo +camera pair and new suturing motion primitives. We compare STITCH to baselines, +including a proprioception-only and a policy without visual servoing. In +physical experiments across 15 trials, STITCH achieves an average of 2.93 +sutures without human intervention and 4.47 sutures with human intervention. +See https://sites.google.com/berkeley.edu/stitch for code and supplemental +materials. + +
+
+
+
+
+ + ☆ Better Monocular 3D Detectors with LiDAR from the Past ICRA 2022 + + +
+ Accurate 3D object detection is crucial to autonomous driving. Though +LiDAR-based detectors have achieved impressive performance, the high cost of +LiDAR sensors precludes their widespread adoption in affordable vehicles. +Camera-based detectors are cheaper alternatives but often suffer inferior +performance compared to their LiDAR-based counterparts due to inherent depth +ambiguities in images. In this work, we seek to improve monocular 3D detectors +by leveraging unlabeled historical LiDAR data. Specifically, at inference time, +we assume that the camera-based detectors have access to multiple unlabeled +LiDAR scans from past traversals at locations of interest (potentially from +other high-end vehicles equipped with LiDAR sensors). Under this setup, we +proposed a novel, simple, and end-to-end trainable framework, termed +AsyncDepth, to effectively extract relevant features from asynchronous LiDAR +traversals of the same location for monocular 3D detectors. We show consistent +and significant performance gain (up to 9 AP) across multiple state-of-the-art +models and datasets with a negligible additional latency of 9.66 ms and a small +storage cost. + +
+
+ comment: Accepted by ICRA 2022. The code can be found at + https://github.com/YurongYou/AsyncDepth +
+
+
+
+
+ + ☆ LLM-BT: Performing Robotic Adaptive Tasks based on Large Language Models + and Behavior Trees ICRA 2024 + + +
+ Large Language Models (LLMs) have been widely utilized to perform complex +robotic tasks. However, handling external disturbances during tasks is still an +open challenge. This paper proposes a novel method to achieve robotic adaptive +tasks based on LLMs and Behavior Trees (BTs). It utilizes ChatGPT to reason about the +descriptive steps of tasks. In order to enable ChatGPT to understand the +environment, semantic maps are constructed by an object recognition algorithm. +Then, we design a Parser module based on Bidirectional Encoder Representations +from Transformers (BERT) to parse these steps into initial BTs. Subsequently, a +BTs Update algorithm is proposed to expand the initial BTs dynamically to +control robots to perform adaptive tasks. Different from other LLM-based +methods for complex robotic tasks, our method outputs variable BTs that can add +and execute new actions according to environmental changes, which is robust to +external disturbances. Our method is validated with simulation in different +practical scenarios. + +
+
+ comment: 7 pages, 11 figures, to be published at ICRA 2024 +
+
+
+
+
+ + ☆ Rollbot: a Spherical Robot Driven by a Single Actuator IROS 2024 + + +
+ Here we present Rollbot, the first spherical robot capable of controllably +maneuvering on a 2D plane with a single actuator. Rollbot rolls on the ground in a +circular pattern and controls its motion by changing the curvature of the +trajectory through accelerating and decelerating its single motor and attached +mass. We present the theoretical analysis, design, and control of Rollbot, and +demonstrate its ability to move in a controllable circular pattern and follow +waypoints. + +
+
+ comment: Submission to IROS 2024 +
+
+
+
+
+ + ☆ A Realistic Surgical Simulator for Non-Rigid and Contact-Rich + Manipulation in Surgeries with the da Vinci Research Kit + + +
+ Realistic real-time surgical simulators play an increasingly important role +in surgical robotics research, such as surgical robot learning and automation, +and surgical skills assessment. Although there are a number of existing +surgical simulators for research, they generally lack the ability to simulate +the diverse types of objects and contact-rich manipulation tasks typically +present in surgeries, such as tissue cutting and blood suction. In this work, +we introduce CRESSim, a realistic surgical simulator based on PhysX 5 for the +da Vinci Research Kit (dVRK) that enables simulating various contact-rich +surgical tasks involving different surgical instruments, soft tissue, and body +fluids. The real-world dVRK console and the master tool manipulator (MTM) +robots are incorporated into the system to allow for teleoperation through +virtual reality (VR). To showcase the advantages and potentials of the +simulator, we present three examples of surgical tasks, including tissue +grasping and deformation, blood suction, and tissue cutting. These tasks are +performed using the simulated surgical instruments, including the large needle +driver, suction irrigator, and curved scissor, through VR-based teleoperation. + +
+
+ comment: 7 pages, 21st International Conference on Ubiquitous Robots (UR + 2024), accepted +
+
+
+
+
+ + ☆ On the Fly Robotic-Assisted Medical Instrument Planning and Execution + Using Mixed Reality ICRA 2024 + + +
+ Robotic-assisted medical systems (RAMS) have gained significant attention for +their advantages in alleviating surgeons' fatigue and improving patients' +outcomes. These systems comprise a range of human-computer interactions, +including medical scene monitoring, anatomical target planning, and robot +manipulation. However, despite its versatility and effectiveness, RAMS demands +expertise in robotics, leading to a high learning cost for the operator. In +this work, we introduce a novel framework using mixed reality technologies to +ease the use of RAMS. The proposed framework achieves real-time planning and +execution of medical instruments by providing 3D anatomical image overlay, +human-robot collision detection, and robot programming interface. These +features, integrated with an easy-to-use calibration method for head-mounted +display, improve the effectiveness of human-robot interactions. To assess the +feasibility of the framework, two medical applications are presented in this +work: 1) coil placement during transcranial magnetic stimulation and 2) drill +and injector device positioning during femoroplasty. Results from these use +cases demonstrate its potential to extend to a wider range of medical +scenarios. + +
+
+ comment: This paper has been accepted to IEEE ICRA 2024 as a contributed paper +
+
+
+
+
+ + ☆ GBEC: Geometry-Based Hand-Eye Calibration ICRA 2024 + + +
+ Hand-eye calibration is the problem of solving the transformation from the +end-effector of a robot to the sensor attached to it. Commonly employed +techniques, such as AXXB or AXZB formulations, rely on regression methods that +require collecting pose data from different robot configurations, which can +produce low accuracy and repeatability. However, the derived transformation +should solely depend on the geometry of the end-effector and the sensor +attachment. We propose Geometry-Based End-Effector Calibration (GBEC) that +enhances the repeatability and accuracy of the derived transformation compared +to traditional hand-eye calibrations. To demonstrate improvements, we apply the +approach to two different robot-assisted procedures: Transcranial Magnetic +Stimulation (TMS) and femoroplasty. We also discuss the generalizability of +GBEC for camera-in-hand and marker-in-hand sensor mounting methods. In the +experiments, we perform GBEC between the robot end-effector and an optical +tracker's rigid body marker attached to the TMS coil or femoroplasty drill +guide. Previous research documents low repeatability and accuracy of the +conventional methods for robot-assisted TMS hand-eye calibration. When compared +to some existing methods, the proposed method relies solely on the geometry of +the flange and the pose of the rigid-body marker, making it independent of +workspace constraints or robot accuracy, without sacrificing the orthogonality +of the rotation matrix. Our results validate the accuracy and applicability of +the approach, providing a new and generalizable methodology for obtaining the +transformation from the end-effector to a sensor. + +
+
+ comment: This paper has been accepted to IEEE ICRA 2024 as a contributed paper +
+
+
+
+
+ + ☆ CoBT: Collaborative Programming of Behaviour Trees from One + Demonstration for Robot Manipulation ICRA 2024 + + +
+ Mass customization and shorter manufacturing cycles are becoming more +important among small and medium-sized companies. However, classical industrial +robots struggle to cope with product variation and dynamic environments. In +this paper, we present CoBT, a collaborative programming by demonstration +framework for generating reactive and modular behavior trees. CoBT relies on a +single demonstration and a combination of data-driven machine learning methods +with logic-based declarative learning to learn a task, thus eliminating the +need for programming expertise or long development times. The proposed +framework is experimentally validated on 7 manipulation tasks and we show that +CoBT achieves approx. 93% success rate overall with an average of 7.5s +programming time. We conduct a pilot study with non-expert users to provide +feedback regarding the usability of CoBT. + +
+
+ comment: Accepted for presentation at IEEE ICRA 2024 +
+
+
+
+
+ + ☆ A Neuromorphic Approach to Obstacle Avoidance in Robot Manipulation IJRR + + +
+ Neuromorphic computing mimics computational principles of the brain in +$\textit{silico}$ and motivates research into event-based vision and spiking +neural networks (SNNs). Event cameras (ECs) exclusively capture local intensity +changes and offer superior power consumption, response latencies, and dynamic +ranges. SNNs replicate biological neuronal dynamics and have demonstrated +potential as alternatives to conventional artificial neural networks (ANNs), +such as in reducing energy expenditure and inference time in visual +classification. Nevertheless, these novel paradigms remain scarcely explored +outside the domain of aerial robots. + To investigate the utility of brain-inspired sensing and data processing, we +developed a neuromorphic approach to obstacle avoidance on a camera-equipped +manipulator. Our approach adapts high-level trajectory plans with reactive +maneuvers by processing emulated event data in a convolutional SNN, decoding +neural activations into avoidance motions, and adjusting plans using a dynamic +motion primitive. We conducted experiments with a Kinova Gen3 arm performing +simple reaching tasks that involve obstacles in sets of distinct task scenarios +and in comparison to a non-adaptive baseline. + Our neuromorphic approach facilitated reliable avoidance of imminent +collisions in simulated and real-world experiments, where the baseline +consistently failed. Trajectory adaptations had low impacts on safety and +predictability criteria. Among the notable SNN properties were the correlation +of computations with the magnitude of perceived motions and a robustness to +different event emulation methods. Tests with a DAVIS346 EC showed similar +performance, validating our experimental event emulation. Our results motivate +incorporating SNN learning, utilizing neuromorphic processors, and further +exploring the potential of neuromorphic methods. + +
+
+ comment: 35 pages, accepted at IJRR, authors' version +
+
+
+
+
+ + ☆ ITA-ECBS: A Bounded-Suboptimal Algorithm for Combined Target-Assignment + and Path-Finding Problem + + +
+ Multi-Agent Path Finding (MAPF), i.e., finding collision-free paths for +multiple robots, plays a critical role in many applications. Sometimes, +assigning a specific target to each agent also presents a challenge. The +Combined Target-Assignment and Path-Finding (TAPF) problem, a variant of MAPF, +requires simultaneously assigning targets to agents and planning collision-free +paths. Several algorithms, including CBM, CBS-TA, and ITA-CBS, can optimally +solve the TAPF problem, with ITA-CBS being the leading method in terms of flowtime. +However, the only existing suboptimal method, ECBS-TA, is derived from CBS-TA +rather than ITA-CBS, and adapting the optimal ITA-CBS method to a +bounded-suboptimal variant is challenging due to the variability of target +assignment solutions in different search nodes. We introduce ITA-ECBS as the +first bounded-suboptimal variant of ITA-CBS. ITA-ECBS employs focal search to +enhance efficiency and determines target assignments based on a new lower bound +matrix. We show that ITA-ECBS outperforms the baseline method ECBS-TA in 87.42% +of 54,033 test cases. + +
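Focal search, the ingredient named above, is easy to sketch in isolation: keep the nodes whose lower bound is within a factor w of the best lower bound, then expand the one that minimizes a secondary heuristic such as the number of conflicts. This is a generic illustration, not the ITA-ECBS code; the node fields are assumed names.

```python
def pick_focal_node(open_nodes, w: float):
    """Generic focal-search selection (illustration, not the ITA-ECBS implementation).

    open_nodes: list of dicts with 'lower_bound' and 'num_conflicts' keys.
    w:          suboptimality factor (>= 1).
    """
    best_lb = min(n["lower_bound"] for n in open_nodes)
    focal = [n for n in open_nodes if n["lower_bound"] <= w * best_lb]
    # Secondary heuristic: prefer nodes with fewer collisions between agents.
    return min(focal, key=lambda n: n["num_conflicts"])

nodes = [
    {"lower_bound": 10, "num_conflicts": 4},
    {"lower_bound": 11, "num_conflicts": 1},
    {"lower_bound": 14, "num_conflicts": 0},
]
print(pick_focal_node(nodes, w=1.2))  # picks the cost-11 node with 1 conflict
```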
+
+
+
+
+ + ☆ GPS-free Autonomous Navigation in Cluttered Tree Rows with Deep Semantic + Segmentation + + +
+ Segmentation-based autonomous navigation has recently been presented as an +appealing approach to guiding robotic platforms through crop rows without +requiring perfect GPS localization. Nevertheless, current techniques are +restricted to situations where the distinct separation between the plants and +the sky allows for the identification of the row's center. However, tall, dense +vegetation, such as high tree rows and orchards, is the primary cause of GPS +signal blockage. In this study, we increase the overall robustness and +adaptability of the control algorithm by extending the segmentation-based +robotic guidance to those cases where canopies and branches occlude the sky and +prevent the utilization of GPS and earlier approaches. An efficient Deep Neural +Network architecture has been used to address semantic segmentation, performing +the training with synthetic data only. Numerous vineyards and tree fields have +undergone extensive testing in both simulation and the real world to show the +solution's competitive benefits. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2304.08988 +
+
+
+
+
+ + ♻ ☆ Shared Autonomy via Variable Impedance Control and Virtual Potential + Fields for Encoding Human Demonstration ICRA 2024 + + +
+ This article introduces a framework for complex human-robot collaboration +tasks, such as the co-manufacturing of furniture. For these tasks, it is +essential to encode tasks from human demonstration and reproduce these skills +in a compliant and safe manner. Therefore, two key components are addressed in +this work: motion generation and shared autonomy. We propose a motion generator +based on a time-invariant potential field, capable of encoding wrench profiles, +complex and closed-loop trajectories, and additionally incorporates obstacle +avoidance. Additionally, the paper addresses shared autonomy (SA) which enables +synergetic collaboration between human operators and robots by dynamically +allocating authority. Variable impedance control (VIC) and force control are +employed, where impedance and wrench are adapted based on the human-robot +autonomy factor derived from interaction forces. System passivity is ensured by +an energy-tank based task passivation strategy. The framework's efficacy is +validated through simulations and an experimental study employing a Franka +Emika Research 3 robot. More information can be found on the project website +https://shailjadav.github.io/SALADS/ + +
+
+ comment: Accepted to ICRA 2024. More information can be found on the project + website https://shailjadav.github.io/SALADS/ +
+
+
+
+
+ + ♻ ☆ A Unified Masked Autoencoder with Patchified Skeletons for Motion + Synthesis + + +
+ The synthesis of human motion has traditionally been addressed through +task-dependent models that focus on specific challenges, such as predicting +future motions or filling in intermediate poses conditioned on known key-poses. +In this paper, we present a novel task-independent model called UNIMASK-M, +which can effectively address these challenges using a unified architecture. +Our model obtains comparable or better performance than the state-of-the-art in +each field. Inspired by Vision Transformers (ViTs), our UNIMASK-M model +decomposes a human pose into body parts to leverage the spatio-temporal +relationships existing in human motion. Moreover, we reformulate various +pose-conditioned motion synthesis tasks as a reconstruction problem with +different masking patterns given as input. By explicitly informing our model +about the masked joints, our UNIMASK-M becomes more robust to occlusions. +Experimental results show that our model successfully forecasts human motion on +the Human3.6M dataset. Moreover, it achieves state-of-the-art results in motion +inbetweening on the LaFAN1 dataset, particularly in long transition periods. +More information can be found on the project website +https://evm7.github.io/UNIMASKM-page/ + +
+
+ comment: Accepted to AAAI2024. Webpage: https://evm7.github.io/UNIMASKM-page/ +
+
+
+
+
+ + ♻ ☆ HOI4ABOT: Human-Object Interaction Anticipation for Human Intention + Reading Collaborative roBOTs + + +
+ Robots are becoming increasingly integrated into our lives, assisting us in +various tasks. To ensure effective collaboration between humans and robots, it +is essential that they understand our intentions and anticipate our actions. In +this paper, we propose a Human-Object Interaction (HOI) anticipation framework +for collaborative robots. We propose an efficient and robust transformer-based +model to detect and anticipate HOIs from videos. This enhanced anticipation +empowers robots to proactively assist humans, resulting in more efficient and +intuitive collaborations. Our model outperforms state-of-the-art results in HOI +detection and anticipation in VidHOI dataset with an increase of 1.76% and +1.04% in mAP respectively while being 15.4 times faster. We showcase the +effectiveness of our approach through experimental results in a real robot, +demonstrating that the robot's ability to anticipate HOIs is key for better +Human-Robot Interaction. More information can be found on our project webpage: +https://evm7.github.io/HOI4ABOT_page/ + +
+
+ comment: Proceedings in Conference on Robot Learning 2023. Webpage: + https://evm7.github.io/HOI4ABOT_page/ +
+
+
+
+
+ + ♻ ☆ ImitationNet: Unsupervised Human-to-Robot Motion Retargeting via Shared + Latent Space + + +
+ This paper introduces a novel deep-learning approach for human-to-robot +motion retargeting, enabling robots to mimic human poses accurately. Contrary +to prior deep-learning-based works, our method does not require paired +human-to-robot data, which facilitates its translation to new robots. First, we +construct a shared latent space between humans and robots via adaptive +contrastive learning that takes advantage of a proposed cross-domain similarity +metric between the human and robot poses. Additionally, we propose a +consistency term to build a common latent space that captures the similarity of +the poses with precision while allowing direct robot motion control from the +latent space. For instance, we can generate in-between motion through simple +linear interpolation between two projected human poses. We conduct a +comprehensive evaluation of robot control from diverse modalities (i.e., texts, +RGB videos, and key poses), which facilitates robot control for non-expert +users. Our model outperforms existing works regarding human-to-robot +retargeting in terms of efficiency and precision. Finally, we implemented our +method in a real robot with self-collision avoidance through a whole-body +controller to showcase the effectiveness of our approach. More information on +our website https://evm7.github.io/UnsH2R/ + +
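The in-between motion generation mentioned above boils down to linear interpolation in the shared latent space; the toy sketch below uses random vectors as stand-ins for encoded poses, since the real encoder/decoder are not reproduced here:

import numpy as np

def interpolate_latents(z_start, z_end, steps=10):
    # Linear interpolation between two latent codes (e.g. two projected human
    # poses); decoding each row would yield the in-between robot poses.
    ts = np.linspace(0.0, 1.0, steps)[:, None]
    return (1.0 - ts) * z_start + ts * z_end        # shape: (steps, latent_dim)

z_a, z_b = np.random.randn(64), np.random.randn(64)  # stand-ins for encoded poses
path = interpolate_latents(z_a, z_b, steps=5)
print(path.shape)                                    # (5, 64)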
+
+ comment: Accepted to Humanoids 2023. Website: https://evm7.github.io/UnsH2R/ +
+
+
+
+
+ + ♻ ☆ Robot Interaction Behavior Generation based on Social Motion Forecasting + for Human-Robot Interaction ICRA 2024 + + +
+ Integrating robots into populated environments is a complex challenge that requires an understanding of human social dynamics. In this work, we propose to model social motion forecasting in a shared human-robot representation space, which allows us to synthesize robot motions that interact with humans in social scenarios even though no robot motion is observed during training. We develop a transformer-based architecture called ECHO, which operates in this shared space to predict the future motions of the agents encountered in social scenarios. In contrast to prior works, we reformulate the social motion problem as the refinement of the predicted individual motions based on the surrounding agents, which facilitates training while allowing for single-motion forecasting when only one human is in the scene. We evaluate our model on multi-person and human-robot motion forecasting tasks and obtain state-of-the-art performance by a large margin while being efficient and running in real time. Additionally, our qualitative results showcase the effectiveness of our approach in generating human-robot interaction behaviors that can be controlled via text commands. Webpage: https://evm7.github.io/ECHO/ + 
+
+ comment: Accepted at ICRA 2024. Webpage: https://evm7.github.io/ECHO/ +
+
+
+
+
+ + ♻ ☆ HiCRISP: An LLM-based Hierarchical Closed-Loop Robotic Intelligent + Self-Correction Planner + + +
+ The integration of Large Language Models (LLMs) into robotics has +revolutionized human-robot interactions and autonomous task planning. However, +these systems are often unable to self-correct during the task execution, which +hinders their adaptability in dynamic real-world environments. To address this +issue, we present a Hierarchical Closed-loop Robotic Intelligent +Self-correction Planner (HiCRISP), an innovative framework that enables robots +to correct errors within individual steps during the task execution. HiCRISP +actively monitors and adapts the task execution process, addressing both +high-level planning and low-level action errors. Extensive benchmark +experiments, encompassing virtual and real-world scenarios, showcase HiCRISP's +exceptional performance, positioning it as a promising solution for robotic +task planning with LLMs. + +
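A generic execute/verify/correct skeleton in the spirit of the closed-loop self-correction described above; all callables are stubs, not HiCRISP's planner, monitor, or LLM interface:

def run_with_self_correction(plan, execute, check, correct_step, max_retries=2):
    # Execute each high-level step, verify it, and if verification fails ask a
    # corrector (an LLM in HiCRISP, an injected callable here) for a fixed step.
    for step in plan:
        for attempt in range(max_retries + 1):
            execute(step)
            if check(step):
                break
            step = correct_step(step, attempt)
        else:
            raise RuntimeError(f"step still failing after retries: {step}")

# Tiny demo with stubbed callables standing in for the robot and the planner.
state = {"gripper_closed": False}
run_with_self_correction(
    plan=["close_gripper"],
    execute=lambda s: state.update(gripper_closed=(s == "close_gripper")),
    check=lambda s: state["gripper_closed"],
    correct_step=lambda s, k: s,
)
print(state)                        # {'gripper_closed': True}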
+
+
+
+
+ + ♻ ☆ Stretchable Pneumatic Sleeve for Adaptable, Low-Displacement Anchoring + in Exosuits + + +
+ Despite recent advances in wearable technology, interfacing movement +assistance devices with the human body remains challenging. We present a +stretchable pneumatic sleeve that can anchor an exosuit actuator to the human +arm with a low displacement of the actuator's mounting point relative to the +body during operation. Our sleeve has the potential to serve as an adaptable +attachment mechanism for exosuits, since it can adjust its pressure to only +compress the arm as much as needed to transmit the applied exosuit forces +without a large displacement. We discuss the design of our sleeve, which is +made of fabric pneumatic artificial muscle (fPAM) actuators formed into bands. +We quantify the performance of nine fPAM bands of various lengths and widths, +as well as three sleeves (an fPAM sleeve, a series pouch motor (SPM) sleeve as +in previous literature, and an off the shelf hook and loop sleeve), through the +measurement of the compressing force as a function of pressure and the +localized pulling force that can be resisted as a function of both pressure and +mounting point displacement. Our experimental results show that fPAM bands with +smaller resting length and/or larger resting width produce higher forces. Also, +when inflated, an fPAM sleeve that has equivalent dimensions to the SPM sleeve +while fully stretched has similar performance to the SPM sleeve. While +inflated, both pneumatic sleeves decrease the mounting point displacement +compared to the hook and loop sleeve. Compared to the SPM sleeve, the fPAM +sleeve is able to hold larger internal pressure before bursting, increasing its +possible force range. Also, when not inflated, the fPAM sleeve resists the +pulling force well, indicating its ability to provide anchoring when not +actuated. + +
+
+ comment: 7th IEEE-RAS International Conference on Soft Robotics (RoboSoft + 2024) Supplementary video: https://youtu.be/9orz3NzMXT4?si=ZCjG72tS_2rSeFhJ +
+
+
+
+
+ + ♻ ☆ Unifying Foundation Models with Quadrotor Control for Visual Tracking + Beyond Object Categories + + +
+ Visual control enables quadrotors to adaptively navigate using real-time +sensory data, bridging perception with action. Yet, challenges persist, +including generalization across scenarios, maintaining reliability, and +ensuring real-time responsiveness. This paper introduces a perception framework +grounded in foundation models for universal object detection and tracking, +moving beyond specific training categories. Integral to our approach is a +multi-layered tracker integrated with the foundation detector, ensuring +continuous target visibility, even when faced with motion blur, abrupt light +shifts, and occlusions. Complementing this, we introduce a model-free +controller tailored for resilient quadrotor visual tracking. Our system +operates efficiently on limited hardware, relying solely on an onboard camera +and an inertial measurement unit. Through extensive validation in diverse +challenging indoor and outdoor environments, we demonstrate our system's +effectiveness and adaptability. In conclusion, our research represents a step +forward in quadrotor visual tracking, moving from task-specific methods to more +versatile and adaptable operations. + +
+
+
+
+
+ + ♻ ☆ Scaling Population-Based Reinforcement Learning with GPU Accelerated + Simulation RA-L + + +
+ In recent years, deep reinforcement learning (RL) has shown its effectiveness +in solving complex continuous control tasks like locomotion and dexterous +manipulation. However, this comes at the cost of an enormous amount of +experience required for training, exacerbated by the sensitivity of learning +efficiency and the policy performance to hyperparameter selection, which often +requires numerous trials of time-consuming experiments. This work introduces a +Population-Based Reinforcement Learning (PBRL) approach that exploits a +GPU-accelerated physics simulator to enhance the exploration capabilities of RL +by concurrently training multiple policies in parallel. The PBRL framework is +applied to three state-of-the-art RL algorithms -- PPO, SAC, and DDPG -- +dynamically adjusting hyperparameters based on the performance of learning +agents. The experiments are performed on four challenging tasks in Isaac Gym -- +Anymal Terrain, Shadow Hand, Humanoid, Franka Nut Pick -- by analyzing the +effect of population size and mutation mechanisms for hyperparameters. The +results show that PBRL agents achieve superior performance, in terms of +cumulative reward, compared to non-evolutionary baseline agents. The trained +agents are finally deployed in the real world for a Franka Nut Pick task, +demonstrating successful sim-to-real transfer. Code and videos of the learned +policies are available on our project website. + +
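A toy exploit-and-explore step of a population-based scheme like the one described above; the cut-off fraction, perturbation range, and hyperparameter names are assumptions, and copying of network weights is only hinted at in the comments:

import random

def pbrl_update(population, cut_frac=0.25, perturb=(0.8, 1.2)):
    # One exploit-and-explore step: agents in the bottom fraction copy a
    # top-fraction agent's hyperparameters (and, in practice, its network
    # weights) and then perturb them multiplicatively.
    ranked = sorted(population, key=lambda a: a["score"], reverse=True)
    n_cut = max(1, int(len(ranked) * cut_frac))
    top, bottom = ranked[:n_cut], ranked[-n_cut:]
    for agent in bottom:
        parent = random.choice(top)
        agent["hparams"] = {k: v * random.uniform(*perturb)
                            for k, v in parent["hparams"].items()}
    return population

population = [{"score": random.random(),
               "hparams": {"lr": 3e-4, "entropy_coef": 0.01}} for _ in range(8)]
pbrl_update(population)
print(population[0]["hparams"])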
+
+ comment: Submitted for publication to IEEE Robotics and Automation Letters + (RA-L) +
+
+
+
+
+ + ♻ ☆ GMMCalib: Extrinsic Calibration of LiDAR Sensors using GMM-based Joint + Registration + + +
+ State-of-the-art LiDAR calibration frameworks mainly use non-probabilistic +registration methods such as Iterative Closest Point (ICP) and its variants. +These methods suffer from biased results due to their pair-wise registration +procedure as well as their sensitivity to initialization and parameterization. +This often leads to misalignments in the calibration process. Probabilistic +registration methods compensate for these drawbacks by specifically modeling +the probabilistic nature of the observations. This paper presents GMMCalib, an +automatic target-based extrinsic calibration approach for multi-LiDAR systems. +Using an implementation of a Gaussian Mixture Model (GMM)-based registration +method that allows joint registration of multiple point clouds, this +data-driven approach is compared to ICP algorithms. We perform simulation +experiments using the digital twin of the EDGAR research vehicle and validate +the results in a real-world environment. We also address the local minima +problem of local registration methods for extrinsic sensor calibration and use +a distance-based metric to evaluate the calibration results. Our results show +that an increase in robustness against sensor miscalibrations can be achieved +by using GMM-based registration algorithms. The code is open source and +available on GitHub. + +
+
+
+
+
+ + ♻ ☆ Online Elasticity Estimation and Material Sorting Using Standard Robot + Grippers + + +
+ Standard robot grippers are not designed for material recognition. We +experimentally evaluated the accuracy with which material properties can be +estimated through object compression by two standard parallel jaw grippers and +a force/torque sensor mounted at the robot wrist, with a professional biaxial +compression device used as reference. Gripper effort versus position curves +were obtained and transformed into stress/strain curves. The modulus of +elasticity was estimated at different strain points and the effect of multiple +compression cycles (precycling), compression speed, and the gripper surface +area on estimation was studied. Viscoelasticity was estimated using the energy +absorbed in a compression/decompression cycle, the Kelvin-Voigt, and +Hunt-Crossley models. We found that: (1) slower compression speeds improved +elasticity estimation, while precycling or surface area did not; (2) the robot +grippers, even after calibration, were found to have a limited capability of +delivering accurate estimates of absolute values of Young's modulus and +viscoelasticity; (3) relative ordering of material characteristics was largely +consistent across different grippers; (4) despite the nonlinear characteristics +of deformable objects, fitting linear stress/strain approximations led to more +stable results than local estimates of Young's modulus; (5) the Hunt-Crossley +model worked best to estimate viscoelasticity, from a single object +compression. A two-dimensional space formed by elasticity and viscoelasticity +estimates obtained from a single grasp is advantageous for the discrimination +of the object material properties. We demonstrated the applicability of our +findings in a mock single stream recycling scenario, where plastic, paper, and +metal objects were correctly separated from a single grasp, even when +compressed at different locations on the object. The data and code are publicly +available. + +
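The linear stress/strain fit that the study found more stable than local estimates can be sketched in a few lines; uniform uniaxial compression, the contact area, and the synthetic readings below are simplifying assumptions:

import numpy as np

def youngs_modulus_linear_fit(position_m, force_n, contact_area_m2, rest_height_m):
    # Turn gripper position/effort readings into stress/strain and fit a line;
    # the slope is an effective Young's modulus under a uniform-compression assumption.
    strain = (rest_height_m - position_m) / rest_height_m
    stress = force_n / contact_area_m2                  # Pa
    slope, _ = np.polyfit(strain, stress, deg=1)
    return slope

pos = np.linspace(0.040, 0.030, 20)                     # jaw opening in metres
force = 1.2e4 * (0.040 - pos) + np.random.normal(0.0, 0.05, 20)   # synthetic effort
print(youngs_modulus_linear_fit(pos, force, contact_area_m2=4e-4, rest_height_m=0.040))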
+
+ comment: 22 pages, 17 figures +
+
+
+
+
+ + ♻ ☆ High-Frequency Capacitive Sensing for Electrohydraulic Soft Actuators + + +
+ The need for compliant and proprioceptive actuators has grown more evident in +pursuing more adaptable and versatile robotic systems. Hydraulically Amplified +Self-Healing Electrostatic (HASEL) actuators offer distinctive advantages with +their inherent softness and flexibility, making them promising candidates for +various robotic tasks, including delicate interactions with humans and animals, +biomimetic locomotion, prosthetics, and exoskeletons. This has resulted in a +growing interest in the capacitive self-sensing capabilities of HASEL actuators +to create miniature displacement estimation circuitry that does not require +external sensors. However, achieving HASEL self-sensing for actuation +frequencies above 1 Hz and with miniature high-voltage power supplies has +remained limited. In this paper, we introduce the F-HASEL actuator, which adds +an additional electrode pair used exclusively for capacitive sensing to a +Peano-HASEL actuator. We demonstrate displacement estimation of the F-HASEL +during high-frequency actuation up to 20 Hz and during external loading using +miniaturized circuitry comprised of low-cost off-the-shelf components and a +miniature high-voltage power supply. Finally, we propose a circuitry to +estimate the displacement of multiple F-HASELs and demonstrate it in a wearable +application to track joint rotations of a virtual reality user in real-time. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ Multi-AGV Path Planning Method via Reinforcement Learning and Particle + Filters + + +
+ Reinforcement Learning (RL), renowned for its robust learning capability and search stability, has garnered significant attention and found extensive application in Automated Guided Vehicle (AGV) path planning. However, RL planning algorithms face challenges stemming from the substantial variance of neural networks caused by environmental instability and significant fluctuations in system structure, which manifest as slow convergence and low learning efficiency. To tackle this issue, this paper presents the Particle Filter-Double Deep Q-Network (PF-DDQN) approach, which incorporates the Particle Filter (PF) into multi-AGV reinforcement learning path planning. PF-DDQN treats the imprecise weight values of the network as state values to formulate the state space equation. Through the iterative fusion of neural networks and particle filters, the DDQN model is optimized to acquire the optimal true weight values, enhancing the algorithm's efficiency. The method's effectiveness and superiority are validated through numerical simulations: the proposed algorithm surpasses the traditional DDQN algorithm on path planning quality and training time indicators by 92.62% and 76.88%, respectively. + 
+
+ comment: The literature cited in the third article is not marked +
+
+
+
+
+ + ♻ ☆ Fast Biconnectivity Restoration in Multi-Robot Systems for Robust + Communication Maintenance + + +
+ Maintaining a robust communication network plays an important role in the +success of a multi-robot team jointly performing an optimization task. A key +characteristic of a robust multi-robot system is the ability to repair the +communication topology itself in the case of robot failure. In this paper, we +focus on the Fast Biconnectivity Restoration (FBR) problem, which aims to +repair a connected network to make it biconnected as fast as possible, where a +biconnected network is a communication topology that cannot be disconnected by +removing one node. We develop a Quadratically Constrained Program (QCP) +formulation of the FBR problem, which provides a way to optimally solve the +problem. We also propose an approximation algorithm for the FBR problem based +on graph theory. By conducting empirical studies, we demonstrate that our +proposed approximation algorithm performs close to the optimal while +significantly outperforming the existing solutions. + +
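For intuition, the toy loop below (using networkx) repairs a connected graph until it is biconnected by greedily adding bypass edges around cut vertices; this is not the paper's QCP formulation or approximation algorithm and ignores movement cost entirely:

import networkx as nx

def greedy_biconnect(g):
    # While a cut vertex (articulation point) exists, connect two robots that
    # fall into different components once that vertex is removed, creating a
    # bypass. Returns the list of added communication links.
    g = g.copy()
    added = []
    while g.number_of_nodes() > 2 and not nx.is_biconnected(g):
        cut = next(nx.articulation_points(g))
        rest = g.copy()
        rest.remove_node(cut)
        comps = list(nx.connected_components(rest))
        u, v = next(iter(comps[0])), next(iter(comps[1]))
        g.add_edge(u, v)
        added.append((u, v))
    return added

# The path 0-1-2-3 has cut vertices 1 and 2; a couple of bypass links fix it.
print(greedy_biconnect(nx.path_graph(4)))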
+
+ comment: updated author affiliation, fixed typos, added references +
+
+
+
+
+ + ♻ ☆ D2M2N: Decentralized Differentiable Memory-Enabled Mapping and + Navigation for Multiple Robots + + +
+ Recently, a number of learning-based models have been proposed for multi-robot navigation. However, these models lack memory and rely only on the robot's current observations to plan their actions. They are unable to leverage past observations to plan better paths, especially in complex environments. In this work, we propose a fully differentiable and decentralized memory-enabled architecture for multi-robot navigation and mapping called D2M2N. D2M2N maintains a compact representation of the environment to remember past observations and uses a Value Iteration Network for complex navigation. We conduct extensive experiments to show that D2M2N significantly outperforms the state-of-the-art model in complex mapping and navigation tasks. + 
+
+ comment: 7 pages, 5 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ MVSA-Net: Multi-View State-Action Recognition for Robust and Deployable + Trajectory Generation + + +
+ The learn-from-observation (LfO) paradigm is a human-inspired mode for a +robot to learn to perform a task simply by watching it being performed. LfO can +facilitate robot integration on factory floors by minimizing disruption and +reducing tedious programming. A key component of the LfO pipeline is a +transformation of the depth camera frames to the corresponding task state and +action pairs, which are then relayed to learning techniques such as imitation +or inverse reinforcement learning for understanding the task parameters. While +several existing computer vision models analyze videos for activity +recognition, SA-Net specifically targets robotic LfO from RGB-D data. However, +SA-Net and many other models analyze frame data captured from a single +viewpoint. Their analysis is therefore highly sensitive to occlusions of the +observed task, which are frequent in deployments. An obvious way of reducing +occlusions is to simultaneously observe the task from multiple viewpoints and +synchronously fuse the multiple streams in the model. Toward this, we present +multi-view SA-Net, which generalizes the SA-Net model to allow the perception +of multiple viewpoints of the task activity, integrate them, and better +recognize the state and action in each frame. Performance evaluations on two +distinct domains establish that MVSA-Net recognizes the state-action pairs +under occlusion more accurately compared to single-view MVSA-Net and other +baselines. Our ablation studies further evaluate its performance under +different ambient conditions and establish the contribution of the architecture +components. As such, MVSA-Net offers a significantly more robust and deployable +state-action trajectory generation compared to previous methods. + +
+
+ comment: Presented at Deployable AI Workshop at AAAI-2024 and 'Towards + Reliable and Deployable Learning-Based Robotic Systems' Workshop at CoRL2023 +
+
+
+
+
+ + ♻ ☆ 3D Diffusion Policy: Generalizable Visuomotor Policy Learning via Simple + 3D Representations + + +
+ Imitation learning provides an efficient way to teach robots dexterous skills; however, learning complex skills robustly and generalizably usually requires large amounts of human demonstrations. To tackle this challenging problem, we present 3D Diffusion Policy (DP3), a novel visual imitation learning approach that incorporates the power of 3D visual representations into diffusion policies, a class of conditional action generative models. The core design of DP3 is the utilization of a compact 3D visual representation, extracted from sparse point clouds with an efficient point encoder. In our experiments involving 72 simulation tasks, DP3 successfully handles most tasks with just 10 demonstrations and surpasses baselines with a 24.2% relative improvement. In 4 real robot tasks, DP3 demonstrates precise control with a high success rate of 85%, given only 40 demonstrations of each task, and shows excellent generalization abilities in diverse aspects, including space, viewpoint, appearance, and instance. Interestingly, in real robot experiments, DP3 rarely violates safety requirements, in contrast to baseline methods which frequently do, necessitating human intervention. Our extensive evaluation highlights the critical importance of 3D representations in real-world robot learning. Videos, code, and data are available on https://3d-diffusion-policy.github.io . + 
+
+ comment: Videos, code, and data: https://3d-diffusion-policy.github.io +
+
+
+
+
+ + ♻ ☆ Breaking Symmetries Leads to Diverse Quadrupedal Gaits + + +
+ Symmetry manifests itself in legged locomotion in a variety of ways. No matter where a legged system begins to move periodically, the torso and limbs coordinate with each other's movements in a similar manner. Also, in many gaits observed in nature, the legs on both sides of the torso move in exactly the same way, or are just half a period out of phase. Furthermore, when some animals move forward and backward, their movements are strikingly similar, as if time had been reversed. This work aims to generalize these phenomena and propose formal definitions of symmetries in legged locomotion using group theory terminology. Symmetries in some common quadrupedal gaits such as pronking, bounding, half-bounding, and galloping are discussed. Moreover, a spring-mass model is used to demonstrate how breaking symmetries can alter gaits in a legged system. Studying these symmetries may provide insight into which gaits are suitable for a particular robotic design, or may enable roboticists to design more agile and efficient robot controllers by exploiting certain gaits. + 
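In its simplest one-dimensional form, the spring-mass model mentioned above reduces to a vertical hopper that alternates flight and stance phases; the parameters below are arbitrary and the sketch only conveys the flavour of such reduced models:

def simulate_hopper(k=2000.0, m=10.0, g=9.81, l0=0.5, y0=0.7, dt=1e-4, t_end=2.0):
    # 1D spring-mass ("pogo stick") hopper: free flight above leg length l0,
    # linear leg spring below it. Integrated with semi-implicit Euler.
    y, v, t, touchdowns = y0, 0.0, 0.0, []
    while t < t_end:
        leg_force = k * (l0 - y) if y < l0 else 0.0     # only acts in stance
        v += (leg_force / m - g) * dt
        y_new = y + v * dt
        if y >= l0 > y_new:                             # flight -> stance event
            touchdowns.append(round(t, 3))
        y, t = y_new, t + dt
    return touchdowns

print(simulate_hopper()[:3])        # times of the first few touchdowns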
+
+ comment: Please refer to the published version to cite this paper +
+
+
+
+
+ + ♻ ☆ Learning to Fly in Seconds RA-L + + +
+ Learning-based methods, particularly Reinforcement Learning (RL), hold great +promise for streamlining deployment, enhancing performance, and achieving +generalization in the control of autonomous multirotor aerial vehicles. Deep RL +has been able to control complex systems with impressive fidelity and agility +in simulation but the simulation-to-reality transfer often brings a +hard-to-bridge reality gap. Moreover, RL is commonly plagued by prohibitively +long training times. In this work, we propose a novel asymmetric +actor-critic-based architecture coupled with a highly reliable RL-based +training paradigm for end-to-end quadrotor control. We show how curriculum +learning and a highly optimized simulator enhance sample complexity and lead to +fast training times. To precisely discuss the challenges related to +low-level/end-to-end multirotor control, we also introduce a taxonomy that +classifies the existing levels of control abstractions as well as +non-linearities and domain parameters. Our framework enables +Simulation-to-Reality (Sim2Real) transfer for direct RPM control after only 18 +seconds of training on a consumer-grade laptop as well as its deployment on +microcontrollers to control a multirotor under real-time guarantees. Finally, +our solution exhibits competitive performance in trajectory tracking, as +demonstrated through various experimental comparisons with existing +state-of-the-art control solutions using a real Crazyflie nano quadrotor. We +open source the code including a very fast multirotor dynamics simulator that +can simulate about 5 months of flight per second on a laptop GPU. The fast +training times and deployment to a cheap, off-the-shelf quadrotor lower the +barriers to entry and help democratize the research and development of these +systems. + +
+
+ comment: Accepted for publication in IEEE Robotics and Automation Letters + (RA-L) +
+
+
+
+
+ + ♻ ☆ Embedded light-weight approach for safe landing in populated areas + + +
+ Landing safety is a challenge that has recently drawn considerable attention from the research community, owing to the growing range of applications enabled by aerial vehicles. In this paper, we propose a landing safety pipeline based on state-of-the-art object detectors and OctoMap. First, a point cloud of surface obstacles is generated and inserted into an OctoMap. The unoccupied areas are then identified, resulting in a list of safe landing points. Thanks to the low inference time of state-of-the-art object detectors and the efficient point cloud manipulation offered by OctoMap, our approach can be deployed on lightweight embedded systems. The proposed pipeline has been evaluated in many simulation scenarios that vary in the density, number, and movement of people. Simulations were executed with an Nvidia Jetson Nano in the loop to confirm the pipeline's performance and robustness on low-computing-power hardware. The experiments yielded promising results with a 95% success rate. + 
+
+ comment: outdated research item +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 147 + +
+
+
+ + ☆ Finding Visual Task Vectors + + +
+ Visual Prompting is a technique for teaching models to perform a visual task +via in-context examples, without any additional training. In this work, we +analyze the activations of MAE-VQGAN, a recent Visual Prompting model, and find +task vectors, activations that encode task-specific information. Equipped with +this insight, we demonstrate that it is possible to identify the task vectors +and use them to guide the network towards performing different tasks without +providing any input-output examples. To find task vectors, we compute the +average intermediate activations per task and use the REINFORCE algorithm to +search for the subset of task vectors. The resulting task vectors guide the +model towards performing a task better than the original model without the need +for input-output examples. + +
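A schematic of the two steps named above, averaging intermediate activations per task and patching them into a new forward pass; the array shapes and the fixed layer selection are assumptions (the paper selects layers with REINFORCE, which is omitted here):

import numpy as np

def mean_task_activations(activations_by_task):
    # activations_by_task: {task: array of shape (n_examples, n_layers, d)}.
    # The per-task mean over examples gives one candidate task vector per layer.
    return {task: acts.mean(axis=0) for task, acts in activations_by_task.items()}

def patch_activations(layer_acts, task_vector, selected_layers):
    # Overwrite the activations of the selected layers with the task vector,
    # steering the model toward the task without in-context examples.
    patched = layer_acts.copy()
    patched[selected_layers] = task_vector[selected_layers]
    return patched

acts = {"colorize": np.random.randn(32, 12, 768)}       # 32 prompts, 12 layers
tv = mean_task_activations(acts)["colorize"]
print(patch_activations(np.random.randn(12, 768), tv, [3, 7]).shape)   # (12, 768)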
+
+ comment: https://github.com/alhojel/visual_task_vectors +
+
+
+
+
+ + ☆ MA-LMM: Memory-Augmented Large Multimodal Model for Long-Term Video + Understanding CVPR 2024 + + +
+ With the success of large language models (LLMs), integrating the vision +model into LLMs to build vision-language foundation models has gained much more +interest recently. However, existing LLM-based large multimodal models (e.g., +Video-LLaMA, VideoChat) can only take in a limited number of frames for short +video understanding. In this study, we mainly focus on designing an efficient +and effective model for long-term video understanding. Instead of trying to +process more frames simultaneously like most existing work, we propose to +process videos in an online manner and store past video information in a memory +bank. This allows our model to reference historical video content for long-term +analysis without exceeding LLMs' context length constraints or GPU memory +limits. Our memory bank can be seamlessly integrated into current multimodal +LLMs in an off-the-shelf manner. We conduct extensive experiments on various +video understanding tasks, such as long-video understanding, video question +answering, and video captioning, and our model can achieve state-of-the-art +performances across multiple datasets. Code available at +https://boheumd.github.io/MA-LMM/. + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ Ferret-UI: Grounded Mobile UI Understanding with Multimodal LLMs + + +
+ Recent advancements in multimodal large language models (MLLMs) have been +noteworthy, yet, these general-domain MLLMs often fall short in their ability +to comprehend and interact effectively with user interface (UI) screens. In +this paper, we present Ferret-UI, a new MLLM tailored for enhanced +understanding of mobile UI screens, equipped with referring, grounding, and +reasoning capabilities. Given that UI screens typically exhibit a more +elongated aspect ratio and contain smaller objects of interest (e.g., icons, +texts) than natural images, we incorporate "any resolution" on top of Ferret to +magnify details and leverage enhanced visual features. Specifically, each +screen is divided into 2 sub-images based on the original aspect ratio (i.e., +horizontal division for portrait screens and vertical division for landscape +screens). Both sub-images are encoded separately before being sent to LLMs. We +meticulously gather training samples from an extensive range of elementary UI +tasks, such as icon recognition, find text, and widget listing. These samples +are formatted for instruction-following with region annotations to facilitate +precise referring and grounding. To augment the model's reasoning ability, we +further compile a dataset for advanced tasks, including detailed description, +perception/interaction conversations, and function inference. After training on +the curated datasets, Ferret-UI exhibits outstanding comprehension of UI +screens and the capability to execute open-ended instructions. For model +evaluation, we establish a comprehensive benchmark encompassing all the +aforementioned tasks. Ferret-UI excels not only beyond most open-source UI +MLLMs, but also surpasses GPT-4V on all the elementary UI tasks. + +
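The "any resolution" pre-processing described above amounts to cutting a screenshot into two sub-images along its longer side; the Pillow-based sketch below shows only that step, with an arbitrary portrait resolution:

from PIL import Image

def split_for_any_resolution(screen):
    # Cut the screenshot along its longer side: horizontal cut for portrait
    # screens, vertical cut for landscape ones; each half is encoded separately.
    w, h = screen.size
    if h >= w:
        return [screen.crop((0, 0, w, h // 2)), screen.crop((0, h // 2, w, h))]
    return [screen.crop((0, 0, w // 2, h)), screen.crop((w // 2, 0, w, h))]

parts = split_for_any_resolution(Image.new("RGB", (1170, 2532)))   # portrait screen
print([p.size for p in parts])      # [(1170, 1266), (1170, 1266)]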
+
+
+
+
+ + ☆ SwapAnything: Enabling Arbitrary Object Swapping in Personalized Visual + Editing + + +
+ Effective editing of personal content plays a pivotal role in enabling individuals to express their creativity, weave captivating narratives within their visual stories, and elevate the overall quality and impact of their visual content. Therefore, in this work, we introduce SwapAnything, a novel framework that can swap any objects in an image with personalized concepts given by a reference, while keeping the context unchanged. Compared with existing methods for personalized subject swapping, SwapAnything has three unique advantages: (1) precise control of arbitrary objects and parts rather than the main subject, (2) more faithful preservation of context pixels, and (3) better adaptation of the personalized concept to the image. First, we propose targeted variable swapping to apply region control over latent feature maps and swap masked variables for faithful context preservation and initial semantic concept swapping. Then, we introduce appearance adaptation to seamlessly adapt the semantic concept into the original image in terms of target location, shape, style, and content during the image generation process. Extensive results on both human and automatic evaluation demonstrate significant improvements of our approach over baseline methods on personalized swapping. Furthermore, SwapAnything shows precise and faithful swapping abilities across single-object, multiple-object, partial-object, and cross-domain swapping tasks. SwapAnything also achieves strong performance on text-based swapping and on tasks beyond swapping, such as object insertion. + 
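The gist of targeted variable swapping, keeping the source latents outside a region mask and inserting the concept latents inside it, fits in one line; the tensor shapes are assumptions and the appearance-adaptation stage is omitted:

import torch

def targeted_variable_swap(z_source, z_concept, mask):
    # Keep source latents outside the mask (context preservation) and insert
    # the personalized-concept latents inside it.
    return mask * z_concept + (1.0 - mask) * z_source

z_src, z_cpt = torch.randn(4, 64, 64), torch.randn(4, 64, 64)   # latent maps
mask = torch.zeros(1, 64, 64)
mask[:, 16:48, 16:48] = 1.0                                     # region to swap
print(targeted_variable_swap(z_src, z_cpt, mask).shape)         # torch.Size([4, 64, 64])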
+
+ comment: 18 pages, 16 figures, 3 tables +
+
+
+
+
+ + ☆ Learning 3D-Aware GANs from Unposed Images with Template Feature Field + + +
+ Collecting accurate camera poses of training images has been shown to well +serve the learning of 3D-aware generative adversarial networks (GANs) yet can +be quite expensive in practice. This work targets learning 3D-aware GANs from +unposed images, for which we propose to perform on-the-fly pose estimation of +training images with a learned template feature field (TeFF). Concretely, in +addition to a generative radiance field as in previous approaches, we ask the +generator to also learn a field from 2D semantic features while sharing the +density from the radiance field. Such a framework allows us to acquire a +canonical 3D feature template leveraging the dataset mean discovered by the +generative model, and further efficiently estimate the pose parameters on real +data. Experimental results on various challenging datasets demonstrate the +superiority of our approach over state-of-the-art alternatives from both the +qualitative and the quantitative perspectives. + +
+
+ comment: https://XDimlab.github.io/TeFF +
+
+
+
+
+ + ☆ Evaluating the Efficacy of Cut-and-Paste Data Augmentation in Semantic + Segmentation for Satellite Imagery RSS 2024 + + +
+ Satellite imagery is crucial for tasks like environmental monitoring and +urban planning. Typically, it relies on semantic segmentation or Land Use Land +Cover (LULC) classification to categorize each pixel. Despite the advancements +brought about by Deep Neural Networks (DNNs), their performance in segmentation +tasks is hindered by challenges such as limited availability of labeled data, +class imbalance and the inherent variability and complexity of satellite +images. In order to mitigate those issues, our study explores the effectiveness +of a Cut-and-Paste augmentation technique for semantic segmentation in +satellite images. We adapt this augmentation, which usually requires labeled +instances, to the case of semantic segmentation. By leveraging the connected +components in the semantic segmentation labels, we extract instances that are +then randomly pasted during training. Using the DynamicEarthNet dataset and a +U-Net model for evaluation, we found that this augmentation significantly +enhances the mIoU score on the test set from 37.9 to 44.1. This finding +highlights the potential of the Cut-and-Paste augmentation to improve the +generalization capabilities of semantic segmentation models in satellite +imagery. + +
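A rough sketch of the augmentation described above: extract one connected component of a class from a donor label map and paste its pixels and labels into the target pair; blending, boundary handling, and sampling strategy are omitted, and all sizes and classes are invented:

import numpy as np
from scipy import ndimage

def cut_and_paste(img, lab, donor_img, donor_lab, class_id, rng=np.random):
    # Pick one connected component of class_id in the donor label map, shift it
    # randomly, and paste its pixels and labels into the target pair.
    comps, n = ndimage.label(donor_lab == class_id)
    if n == 0:
        return img, lab
    blob = comps == rng.randint(1, n + 1)
    dy, dx = rng.randint(-30, 31, size=2)
    blob = np.roll(blob, (dy, dx), axis=(0, 1))
    shifted_img = np.roll(donor_img, (dy, dx), axis=(0, 1))
    img = np.where(blob[..., None], shifted_img, img)
    lab = np.where(blob, class_id, lab)
    return img, lab

H, W = 128, 128
img, lab = np.zeros((H, W, 3), np.uint8), np.zeros((H, W), np.int64)
donor_img, donor_lab = np.full((H, W, 3), 200, np.uint8), np.zeros((H, W), np.int64)
donor_lab[40:60, 40:60] = 3                    # a single instance of class 3
aug_img, aug_lab = cut_and_paste(img, lab, donor_img, donor_lab, class_id=3)
print(int((aug_lab == 3).sum()))               # 400 pasted pixels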
+
+ comment: Accepted for publication in IEEE 2024 International Geoscience & + Remote Sensing Symposium (IGARSS 2024) +
+
+
+
+
+ + ☆ Retrieval-Augmented Open-Vocabulary Object Detection CVPR 2024 + + +
+ Open-vocabulary object detection (OVD) has been studied with Vision-Language +Models (VLMs) to detect novel objects beyond the pre-trained categories. +Previous approaches improve the generalization ability to expand the knowledge +of the detector, using 'positive' pseudo-labels with additional 'class' names, +e.g., sock, iPod, and alligator. To extend the previous methods in two aspects, +we propose Retrieval-Augmented Losses and visual Features (RALF). Our method +retrieves related 'negative' classes and augments loss functions. Also, visual +features are augmented with 'verbalized concepts' of classes, e.g., worn on the +feet, handheld music player, and sharp teeth. Specifically, RALF consists of +two modules: Retrieval Augmented Losses (RAL) and Retrieval-Augmented visual +Features (RAF). RAL constitutes two losses reflecting the semantic similarity +with negative vocabularies. In addition, RAF augments visual features with the +verbalized concepts from a large language model (LLM). Our experiments +demonstrate the effectiveness of RALF on COCO and LVIS benchmark datasets. We +achieve improvement up to 3.4 box AP$_{50}^{\text{N}}$ on novel categories of +the COCO dataset and 3.6 mask AP$_{\text{r}}$ gains on the LVIS dataset. Code +is available at https://github.com/mlvlab/RALF . + +
+
+ comment: Accepted paper at CVPR 2024 +
+
+
+
+
+ + ☆ SphereHead: Stable 3D Full-head Synthesis with Spherical Tri-plane + Representation + + +
+ While recent advances in 3D-aware Generative Adversarial Networks (GANs) have aided the development of near-frontal-view human face synthesis, the challenge of comprehensively synthesizing a full 3D head viewable from all angles still persists. Although PanoHead demonstrates the possibility of using a large-scale dataset with images of both frontal and back views for full-head synthesis, it often produces artifacts in back views. Based on our in-depth analysis, we found the reasons are mainly twofold. First, from a network architecture perspective, each plane in the commonly used tri-plane/tri-grid representation tends to confuse features from the two sides of the head, causing "mirroring" artifacts (e.g., glasses appearing on the back of the head). Second, from a data supervision perspective, existing discriminator training in 3D GANs mainly focuses on the quality of the rendered image itself and pays little attention to whether the image is plausible for the viewpoint from which it was rendered. This makes it possible to generate a "face" in non-frontal views, since such images easily fool the discriminator. In response, we propose SphereHead, a novel tri-plane representation in the spherical coordinate system that fits the human head's geometric characteristics and efficiently mitigates many of the generated artifacts. We further introduce a view-image consistency loss for the discriminator to emphasize the correspondence between the camera parameters and the images. The combination of these efforts results in visually superior outcomes with significantly fewer artifacts. Our code and dataset are publicly available at https://lhyfst.github.io/spherehead. + 
+
+ comment: project page: https://lhyfst.github.io/spherehead +
+
+
+
+
+ + ☆ Normalizing Flows on the Product Space of SO(3) Manifolds for + Probabilistic Human Pose Modeling CVPR 2024 + + +
+ Normalizing flows have proven their efficacy for density estimation in +Euclidean space, but their application to rotational representations, crucial +in various domains such as robotics or human pose modeling, remains +underexplored. Probabilistic models of the human pose can benefit from +approaches that rigorously consider the rotational nature of human joints. For +this purpose, we introduce HuProSO3, a normalizing flow model that operates on +a high-dimensional product space of SO(3) manifolds, modeling the joint +distribution for human joints with three degrees of freedom. HuProSO3's +advantage over state-of-the-art approaches is demonstrated through its superior +modeling accuracy in three different applications and its capability to +evaluate the exact likelihood. This work not only addresses the technical +challenge of learning densities on SO(3) manifolds, but it also has broader +implications for domains where the probabilistic regression of correlated 3D +rotations is of importance. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ MoMA: Multimodal LLM Adapter for Fast Personalized Image Generation + + +
+ In this paper, we present MoMA: an open-vocabulary, training-free +personalized image model that boasts flexible zero-shot capabilities. As +foundational text-to-image models rapidly evolve, the demand for robust +image-to-image translation grows. Addressing this need, MoMA specializes in +subject-driven personalized image generation. Utilizing an open-source, +Multimodal Large Language Model (MLLM), we train MoMA to serve a dual role as +both a feature extractor and a generator. This approach effectively synergizes +reference image and text prompt information to produce valuable image features, +facilitating an image diffusion model. To better leverage the generated +features, we further introduce a novel self-attention shortcut method that +efficiently transfers image features to an image diffusion model, improving the +resemblance of the target object in generated images. Remarkably, as a +tuning-free plug-and-play module, our model requires only a single reference +image and outperforms existing methods in generating images with high detail +fidelity, enhanced identity-preservation and prompt faithfulness. Our work is +open-source, thereby providing universal access to these advancements. + +
+
+
+
+
+ + ☆ CoReS: Orchestrating the Dance of Reasoning and Segmentation + + +
+ The reasoning segmentation task, which demands a nuanced comprehension of +intricate queries to accurately pinpoint object regions, is attracting +increasing attention. However, Multi-modal Large Language Models (MLLM) often +find it difficult to accurately localize the objects described in complex +reasoning contexts. We believe that the act of reasoning segmentation should +mirror the cognitive stages of human visual search, where each step is a +progressive refinement of thought toward the final object. Thus we introduce +the Chains of Reasoning and Segmenting (CoReS) and find this top-down visual +hierarchy indeed enhances the visual search process. Specifically, we propose a +dual-chain structure that generates multi-modal, chain-like outputs to aid the +segmentation process. Furthermore, to steer the MLLM's outputs into this +intended hierarchy, we incorporate in-context inputs as guidance. Extensive +experiments demonstrate the superior performance of our CoReS, which surpasses +the state-of-the-art method by 7.1\% on the ReasonSeg dataset. The code will be +released at https://github.com/baoxiaoyi/CoReS. + +
+
+
+
+
+ + ☆ NAF-DPM: A Nonlinear Activation-Free Diffusion Probabilistic Model for + Document Enhancement + + +
+ Real-world documents may suffer various forms of degradation, often resulting +in lower accuracy in optical character recognition (OCR) systems. Therefore, a +crucial preprocessing step is essential to eliminate noise while preserving +text and key features of documents. In this paper, we propose NAF-DPM, a novel +generative framework based on a diffusion probabilistic model (DPM) designed to +restore the original quality of degraded documents. While DPMs are recognized +for their high-quality generated images, they are also known for their large +inference time. To mitigate this problem we provide the DPM with an efficient +nonlinear activation-free (NAF) network and we employ as a sampler a fast +solver of ordinary differential equations, which can converge in a few +iterations. To better preserve text characters, we introduce an additional +differentiable module based on convolutional recurrent neural networks, +simulating the behavior of an OCR system during training. Experiments conducted +on various datasets showcase the superiority of our approach, achieving +state-of-the-art performance in terms of pixel-level and perceptual similarity +metrics. Furthermore, the results demonstrate a notable character error +reduction made by OCR systems when transcribing real-world document images +enhanced by our framework. Code and pre-trained models are available at +https://github.com/ispamm/NAF-DPM. + +
+
+ comment: Under review at IEEE Transactions on Pattern Analysis and Machine + Intelligence +
+
+
+
+
+ + ☆ AlignZeg: Mitigating Objective Misalignment for Zero-shot Semantic + Segmentation + + +
+ A serious issue that harms the performance of zero-shot visual recognition is +named objective misalignment, i.e., the learning objective prioritizes +improving the recognition accuracy of seen classes rather than unseen classes, +while the latter is the true target to pursue. This issue becomes more +significant in zero-shot image segmentation because the stronger (i.e., +pixel-level) supervision brings a larger gap between seen and unseen classes. +To mitigate it, we propose a novel architecture named AlignZeg, which embodies +a comprehensive improvement of the segmentation pipeline, including proposal +extraction, classification, and correction, to better fit the goal of zero-shot +segmentation. (1) Mutually-Refined Proposal Extraction. AlignZeg harnesses a +mutual interaction between mask queries and visual features, facilitating +detailed class-agnostic mask proposal extraction. (2) Generalization-Enhanced +Proposal Classification. AlignZeg introduces synthetic data and incorporates +multiple background prototypes to allocate a more generalizable feature space. +(3) Predictive Bias Correction. During the inference stage, AlignZeg uses a +class indicator to find potential unseen class proposals followed by a +prediction postprocess to correct the prediction bias. Experiments demonstrate +that AlignZeg markedly enhances zero-shot semantic segmentation, as shown by an +average 3.8% increase in hIoU, primarily attributed to a 7.1% improvement in +identifying unseen classes, and we further validate that the improvement comes +from alleviating the objective misalignment issue. + +
+
+
+
+
+ + ☆ YaART: Yet Another ART Rendering Technology + + +
+ In the rapidly progressing field of generative models, the development of +efficient and high-fidelity text-to-image diffusion systems represents a +significant frontier. This study introduces YaART, a novel production-grade +text-to-image cascaded diffusion model aligned to human preferences using +Reinforcement Learning from Human Feedback (RLHF). During the development of +YaART, we especially focus on the choices of the model and training dataset +sizes, the aspects that were not systematically investigated for text-to-image +cascaded diffusion models before. In particular, we comprehensively analyze how +these choices affect both the efficiency of the training process and the +quality of the generated images, which are highly important in practice. +Furthermore, we demonstrate that models trained on smaller datasets of +higher-quality images can successfully compete with those trained on larger +datasets, establishing a more efficient scenario of diffusion models training. +From the quality perspective, YaART is consistently preferred by users over +many existing state-of-the-art models. + +
+
+ comment: Prompts and additional information are available on the project page, + see https://ya.ru/ai/art/paper-yaart-v1 +
+
+
+
+
+ + ☆ BinaryDM: Towards Accurate Binarization of Diffusion Model + + +
+ With the advancement of diffusion models (DMs) and the substantially +increased computational requirements, quantization emerges as a practical +solution to obtain compact and efficient low-bit DMs. However, the highly +discrete representation leads to severe accuracy degradation, hindering the +quantization of diffusion models to ultra-low bit-widths. In this paper, we +propose BinaryDM, a novel accurate quantization-aware training approach to push +the weights of diffusion models towards the limit of 1-bit. Firstly, we present +a Learnable Multi-basis Binarizer (LMB) to recover the representations +generated by the binarized DM, which improves the information in details of +representations crucial to the DM. Secondly, a Low-rank Representation +Mimicking (LRM) is applied to enhance the binarization-aware optimization of +the DM, alleviating the optimization direction ambiguity caused by fine-grained +alignment. Moreover, a progressive initialization strategy is applied to +training DMs to avoid convergence difficulties. Comprehensive experiments +demonstrate that BinaryDM achieves significant accuracy and efficiency gains +compared to SOTA quantization methods of DMs under ultra-low bit-widths. As the +first binarization method for diffusion models, BinaryDM achieves impressive +16.0 times FLOPs and 27.1 times storage savings with 1-bit weight and 4-bit +activation, showcasing its substantial advantages and potential for deploying +DMs on resource-limited scenarios. + +
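To make the multi-basis idea concrete, here is a two-basis weight binarization with analytically chosen scales; in BinaryDM the scales are learnable, and the exact parameterization of the LMB is not reproduced here:

import torch

def two_basis_binarize(w):
    # First basis: sign(w) scaled by the mean absolute weight; second basis:
    # the sign of the residual with its own scale. Both scales would be
    # learnable in an LMB; here they are set analytically for illustration.
    b1 = torch.sign(w)
    a1 = w.abs().mean()
    residual = w - a1 * b1
    b2 = torch.sign(residual)
    a2 = residual.abs().mean()
    return a1 * b1 + a2 * b2

w = torch.randn(256, 256)
err_one_basis = torch.mean((w - w.abs().mean() * torch.sign(w)) ** 2)
err_two_basis = torch.mean((w - two_basis_binarize(w)) ** 2)
print(err_one_basis.item(), err_two_basis.item())   # the second error is smaller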
+
+ comment: The code will soon be available at + https://github.com/Xingyu-Zheng/BinaryDM +
+
+
+
+
+ + ☆ Automatic Controllable Colorization via Imagination CVPR 2024 + + +
+ We propose a framework for automatic colorization that allows for iterative +editing and modifications. The core of our framework lies in an imagination +module: by understanding the content within a grayscale image, we utilize a +pre-trained image generation model to generate multiple images that contain the +same content. These images serve as references for coloring, mimicking the +process of human experts. As the synthesized images can be imperfect or +different from the original grayscale image, we propose a Reference Refinement +Module to select the optimal reference composition. Unlike most previous +end-to-end automatic colorization algorithms, our framework allows for +iterative and localized modifications of the colorization results because we +explicitly model the coloring samples. Extensive experiments demonstrate the +superiority of our framework over existing automatic colorization algorithms in +editability and flexibility. Project page: +https://xy-cong.github.io/imagine-colorization. + +
+
+ comment: CVPR 2024. Project page: + https://xy-cong.github.io/imagine-colorization +
+
+
+
+
+ + ☆ MLP Can Be A Good Transformer Learner + + +
+ The self-attention mechanism is the key component of the Transformer but is often criticized for its computational demands. Previous token pruning works motivate their methods from the viewpoint of computational redundancy but still need to load the full network and incur the same memory cost. This paper introduces a novel strategy that simplifies vision transformers and reduces computational load through the selective removal of non-essential attention layers, guided by entropy considerations. We identify that, for the attention layers in the bottom blocks, the subsequent MLP layers (i.e., the two feed-forward layers) can elicit the same entropy quantity. Meanwhile, these accompanying MLPs are under-exploited, since they exhibit smaller feature entropy than the MLPs in the top blocks. Therefore, we propose to integrate the uninformative attention layers into their subsequent counterparts by degenerating them into identity mappings, yielding MLP-only transformer blocks in certain positions. Experimental results on ImageNet-1k show that the proposed method can remove 40% of the attention layers of DeiT-B, improving throughput and memory usage without compromising performance. Code is available at https://github.com/sihaoevery/lambda_vit. + 
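A toy version of degenerating attention into an identity mapping: the simplified ViT-style blocks below simply skip the attention branch in selected bottom blocks, leaving MLP-only blocks; the block definition, dimensions, and layer selection are assumptions, not the released code:

import torch
import torch.nn as nn

class Block(nn.Module):
    # Simplified ViT-style block: x -> x + attn(norm(x)) -> x + mlp(norm(x)).
    def __init__(self, dim=192, heads=3):
        super().__init__()
        self.norm1, self.norm2 = nn.LayerNorm(dim), nn.LayerNorm(dim)
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.mlp = nn.Sequential(nn.Linear(dim, 4 * dim), nn.GELU(), nn.Linear(4 * dim, dim))

    def forward(self, x):
        if self.attn is not None:                     # attention may be dropped
            h = self.norm1(x)
            x = x + self.attn(h, h, h)[0]
        return x + self.mlp(self.norm2(x))

def drop_attention(blocks, indices):
    # Degenerate the attention of the selected (typically bottom) blocks into
    # an identity mapping, leaving MLP-only blocks.
    for i in indices:
        blocks[i].attn = None
    return blocks

blocks = drop_attention(nn.ModuleList([Block() for _ in range(4)]), [0, 1])
x = torch.randn(2, 16, 192)
for blk in blocks:
    x = blk(x)
print(x.shape)                                        # torch.Size([2, 16, 192])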
+
+ comment: efficient transformer +
+
+
+
+
+ + ☆ 3D-COCO: extension of MS-COCO dataset for image detection and 3D + reconstruction modules + + +
+ We introduce 3D-COCO, an extension of the original MS-COCO dataset providing 3D models and 2D-3D alignment annotations. 3D-COCO was designed to support computer vision tasks such as 3D reconstruction or image detection configurable with textual, 2D image, and 3D CAD model queries. We complete the existing MS-COCO dataset with 28K 3D models collected from ShapeNet and Objaverse. Using an IoU-based method, we match each MS-COCO annotation with the best 3D models to provide a 2D-3D alignment. The open-source nature of 3D-COCO is a first that should pave the way for new research on 3D-related topics. The dataset and its source code are available at https://kalisteo.cea.fr/index.php/coco3d-object-detection-and-reconstruction/ + 
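The IoU-based matching mentioned above boils down to picking, for each MS-COCO annotation, the candidate with the highest box overlap; the sketch below uses plain (x1, y1, x2, y2) boxes and invented numbers:

def iou(a, b):
    # Intersection-over-union of two boxes given as (x1, y1, x2, y2).
    ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])
    ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
    area = lambda r: (r[2] - r[0]) * (r[3] - r[1])
    union = area(a) + area(b) - inter
    return inter / union if union > 0 else 0.0

def best_match(coco_box, candidate_boxes):
    # Index of the candidate (e.g. a rendered 3D model's 2D box) that best
    # overlaps the MS-COCO annotation.
    return max(range(len(candidate_boxes)), key=lambda i: iou(coco_box, candidate_boxes[i]))

print(best_match((10, 10, 50, 50), [(0, 0, 20, 20), (12, 8, 48, 52)]))   # -> 1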
+
+
+
+
+ + ☆ Learning a Category-level Object Pose Estimator without Pose Annotations + + +
+ 3D object pose estimation is a challenging task. Previous works always +require thousands of object images with annotated poses for learning the 3D +pose correspondence, which is laborious and time-consuming for labeling. In +this paper, we propose to learn a category-level 3D object pose estimator +without pose annotations. Instead of using manually annotated images, we +leverage diffusion models (e.g., Zero-1-to-3) to generate a set of images under +controlled pose differences and propose to learn our object pose estimator with +those images. Directly using the original diffusion model leads to images with +noisy poses and artifacts. To tackle this issue, firstly, we exploit an image +encoder, which is learned from a specially designed contrastive pose learning, +to filter the unreasonable details and extract image feature maps. +Additionally, we propose a novel learning strategy that allows the model to +learn object poses from those generated image sets without knowing the +alignment of their canonical poses. Experimental results show that our method +has the capability of category-level object pose estimation from a single shot +setting (as pose definition), while significantly outperforming other +state-of-the-art methods on the few-shot category-level object pose estimation +benchmarks. + +
+
+
+
+
+ + ☆ MULTIFLOW: Shifting Towards Task-Agnostic Vision-Language Pruning CVPR 2024 + + +
+ While excellent in transfer learning, Vision-Language models (VLMs) come with +high computational costs due to their large number of parameters. To address +this issue, removing parameters via model pruning is a viable solution. +However, existing techniques for VLMs are task-specific, and thus require +pruning the network from scratch for each new task of interest. In this work, +we explore a new direction: Task-Agnostic Vision-Language Pruning (TA-VLP). +Given a pretrained VLM, the goal is to find a unique pruned counterpart +transferable to multiple unknown downstream tasks. In this challenging setting, +the transferable representations already encoded in the pretrained model are a +key aspect to preserve. Thus, we propose Multimodal Flow Pruning (MULTIFLOW), a +first, gradient-free, pruning framework for TA-VLP where: (i) the importance of +a parameter is expressed in terms of its magnitude and its information flow, by +incorporating the saliency of the neurons it connects; and (ii) pruning is +driven by the emergent (multimodal) distribution of the VLM parameters after +pretraining. We benchmark eight state-of-the-art pruning algorithms in the +context of TA-VLP, experimenting with two VLMs, three vision-language tasks, +and three pruning ratios. Our experimental results show that MULTIFLOW +outperforms recent sophisticated, combinatorial competitors in the vast +majority of the cases, paving the way towards addressing TA-VLP. The code is +publicly available at https://github.com/FarinaMatteo/multiflow. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ A Training-Free Plug-and-Play Watermark Framework for Stable Diffusion + + +
+ Nowadays, the family of Stable Diffusion (SD) models has gained prominence +for its high quality outputs and scalability. This has also raised security +concerns on social media, as malicious users can create and disseminate harmful +content. Existing approaches involve training components or entire SDs to embed +a watermark in generated images for traceability and responsibility +attribution. However, in the era of AI-generated content (AIGC), the rapid +iteration of SDs renders retraining with watermark models costly. To address +this, we propose a training-free plug-and-play watermark framework for SDs. +Without modifying any components of SDs, we embed diverse watermarks in the +latent space, adapting to the denoising process. Our experimental findings +reveal that our method effectively harmonizes image quality and watermark +invisibility. Furthermore, it performs robustly under various attacks. We also +have validated that our method is generalized to multiple versions of SDs, even +without retraining the watermark model. + +
+
+
+
+
+ + ☆ Learning Topology Uniformed Face Mesh by Volume Rendering for Multi-view + Reconstruction + + +
+ Face meshes in consistent topology serve as the foundation for many
+face-related applications, such as 3DMM constrained face reconstruction and
+expression retargeting. Traditional methods commonly acquire topology uniformed
+face meshes by two separate steps: multi-view stereo (MVS) to reconstruct
+shapes followed by non-rigid registration to align topology, but they struggle
+with noise and non-Lambertian surfaces. Recently, neural volume rendering
+techniques have evolved rapidly and shown great advantages in 3D
+reconstruction and novel view synthesis. Our goal is to leverage the superiority
+of neural volume rendering into multi-view reconstruction of face mesh with
+consistent topology. We propose a mesh volume rendering method that enables
+directly optimizing mesh geometry while preserving topology, and learning
+implicit features to model complex facial appearance from multi-view images.
+The key innovation lies in spreading sparse mesh features into the surrounding
+space to simulate the radiance field required for volume rendering, which
+facilitates backpropagation of gradients from images to mesh geometry and
+implicit appearance features. Our proposed feature spreading module exhibits
+deformation invariance, enabling photorealistic rendering seamlessly after mesh
+editing. We conduct experiments on a multi-view face image dataset to evaluate
+the reconstruction and implement an application for photorealistic rendering of
+animated face meshes.
+
+
+
+
+
+ + ☆ Self-Explainable Affordance Learning with Embodied Caption + + +
+ In the field of visual affordance learning, previous methods mainly used
+abundant images or videos that delineate human behavior patterns to identify
+action possibility regions for object manipulation, with a variety of
+applications in robotic tasks. However, they face a main challenge of action
+ambiguity, illustrated by uncertainties such as whether to beat or carry a
+drum, and the complexities involved in processing intricate scenes. Moreover,
+it is important that humans can intervene to rectify robot errors in time. To
+address these issues, we introduce Self-Explainable Affordance learning (SEA)
+with embodied caption. This innovation enables robots to articulate their
+intentions and bridge the gap between explainable vision-language caption and
+visual affordance learning. Due to the lack of an appropriate dataset, we
+unveil a pioneering dataset and metrics tailored for this task, which
+integrates images, heatmaps, and embodied captions. Furthermore, we propose a
+novel model to effectively combine affordance grounding with self-explanation
+in a simple but efficient manner. Extensive quantitative and qualitative
+experiments demonstrate our method's effectiveness.
+
+
+
+
+
+ + ☆ UniFL: Improve Stable Diffusion via Unified Feedback Learning + + +
+ Diffusion models have revolutionized the field of image generation, leading +to the proliferation of high-quality models and diverse downstream +applications. However, despite these significant advancements, the current +competitive solutions still suffer from several limitations, including inferior +visual quality, a lack of aesthetic appeal, and inefficient inference, without +a comprehensive solution in sight. To address these challenges, we present +UniFL, a unified framework that leverages feedback learning to enhance +diffusion models comprehensively. UniFL stands out as a universal, effective, +and generalizable solution applicable to various diffusion models, such as +SD1.5 and SDXL. Notably, UniFL incorporates three key components: perceptual +feedback learning, which enhances visual quality; decoupled feedback learning, +which improves aesthetic appeal; and adversarial feedback learning, which +optimizes inference speed. In-depth experiments and extensive user studies +validate the superior performance of our proposed method in enhancing both the +quality of generated models and their acceleration. For instance, UniFL +surpasses ImageReward by 17% user preference in terms of generation quality and +outperforms LCM and SDXL Turbo by 57% and 20% in 4-step inference. Moreover, we +have verified the efficacy of our approach in downstream tasks, including Lora, +ControlNet, and AnimateDiff. + +
+
+
+
+
+ + ☆ Neural Cellular Automata for Lightweight, Robust and Explainable + Classification of White Blood Cell Images + + +
+ Diagnosis of hematological malignancies depends on accurate identification of +white blood cells in peripheral blood smears. Deep learning techniques are +emerging as a viable solution to scale and optimize this process by automatic +identification of cells in laboratories. However, these techniques face several +challenges such as limited generalizability, sensitivity to domain shifts and +lack of explainability. Here, we are introducing a novel approach based on +neural cellular automata (NCA) for white blood cell classification. We test our +approach on three datasets of white blood cell images and show that we achieve +competitive performance compared to conventional methods. Our NCA-based method +is significantly smaller in terms of parameters and exhibits robustness to +domain shifts. Furthermore, the architecture is inherently explainable, +providing insights into the decision process for each classification, helping +experts understand and validate model predictions. Results demonstrate that NCA +not only can be used for image classification, but also address key challenges +of conventional methods, indicating a high potential for applicability in +clinical practice. + +
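+ For readers unfamiliar with neural cellular automata, the sketch below shows a generic NCA classifier in PyTorch: each cell repeatedly perceives its 3x3 neighbourhood and applies a small learned residual update before a pooled classification head. The channel sizes, step count, and the TinyNCA module are illustrative assumptions, not the paper's architecture.

```python
import torch
import torch.nn as nn

class TinyNCA(nn.Module):
    """Generic neural cellular automaton: each cell perceives its 3x3
    neighbourhood and updates its hidden state; a classifier head pools the
    final states. Illustrative only, not the paper's exact model."""
    def __init__(self, channels=16, n_classes=5):
        super().__init__()
        self.perceive = nn.Conv2d(channels, 3 * channels, 3, padding=1)
        self.update = nn.Sequential(
            nn.Conv2d(3 * channels, 64, 1), nn.ReLU(),
            nn.Conv2d(64, channels, 1))
        self.head = nn.Linear(channels, n_classes)

    def forward(self, state, steps=8):
        for _ in range(steps):
            state = state + self.update(self.perceive(state))  # residual update
        pooled = state.mean(dim=(2, 3))                        # global average pool
        return self.head(pooled)

if __name__ == "__main__":
    x = torch.randn(2, 16, 32, 32)   # cell states seeded from an image encoder
    logits = TinyNCA()(x)
    print(logits.shape)              # torch.Size([2, 5])
```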
+
+
+
+
+ + ☆ Towards More General Video-based Deepfake Detection through Facial + Feature Guided Adaptation for Foundation Model + + +
+ With the rise of deep learning, generative models have enabled the creation
+of highly realistic synthetic images, presenting challenges due to their
+potential misuse. While research in Deepfake detection has grown rapidly in
+response, many detection methods struggle with unseen Deepfakes generated by
+new synthesis techniques. To address this generalisation challenge, we propose
+a novel Deepfake detection approach by adapting the rich information encoded
+inside Foundation Models, specifically using the image encoder from CLIP,
+which has demonstrated strong zero-shot capability for downstream tasks.
+Inspired by the recent advances of parameter efficient fine-tuning, we propose
+a novel side-network-based decoder to extract spatial and temporal cues from
+the given video clip, with Facial Component Guidance (FCG) to encourage the
+spatial features to include features of key facial parts for more robust and
+general Deepfake detection. Through extensive cross-dataset evaluations, our
+approach exhibits superior effectiveness in identifying unseen Deepfake
+samples, achieving notable performance improvements even with limited training
+samples and manipulation types. Our model secures an average performance
+enhancement of 0.9% AUROC in cross-dataset assessments compared with
+state-of-the-art methods, establishing a significant lead with a 4.4%
+improvement on the challenging DFDC dataset.
+
+
+
+
+
+ + ☆ Responsible Visual Editing + + +
+ With recent advancements in visual synthesis, there is a growing risk of +encountering images with detrimental effects, such as hate, discrimination, or +privacy violations. The research on transforming harmful images into +responsible ones remains unexplored. In this paper, we formulate a new task, +responsible visual editing, which entails modifying specific concepts within an +image to render it more responsible while minimizing changes. However, the +concept that needs to be edited is often abstract, making it challenging to +locate what needs to be modified and plan how to modify it. To tackle these +challenges, we propose a Cognitive Editor (CoEditor) that harnesses the large +multimodal model through a two-stage cognitive process: (1) a perceptual +cognitive process to focus on what needs to be modified and (2) a behavioral +cognitive process to strategize how to modify. To mitigate the negative +implications of harmful images on research, we create a transparent and public +dataset, AltBear, which expresses harmful information using teddy bears instead +of humans. Experiments demonstrate that CoEditor can effectively comprehend +abstract concepts within complex scenes and significantly surpass the +performance of baseline models for responsible visual editing. We find that the +AltBear dataset corresponds well to the harmful content found in real images, +offering a consistent experimental evaluation, thereby providing a safer +benchmark for future research. Moreover, CoEditor also shows great results in +general editing. We release our code and dataset at +https://github.com/kodenii/Responsible-Visual-Editing. + +
+
+ comment: 24 pages, 12 figures +
+
+
+
+
+ + ☆ Robust Data Pruning: Uncovering and Overcoming Implicit Bias + + +
+ In the era of exceptionally data-hungry models, careful selection of the +training data is essential to mitigate the extensive costs of deep learning. +Data pruning offers a solution by removing redundant or uninformative samples +from the dataset, which yields faster convergence and improved neural scaling +laws. However, little is known about its impact on classification bias of the +trained models. We conduct the first systematic study of this effect and reveal +that existing data pruning algorithms can produce highly biased classifiers. At +the same time, we argue that random data pruning with appropriate class ratios +has potential to improve the worst-class performance. We propose a +"fairness-aware" approach to pruning and empirically demonstrate its +performance on standard computer vision benchmarks. In sharp contrast to +existing algorithms, our proposed method continues improving robustness at a +tolerable drop of average performance as we prune more from the datasets. We +present theoretical analysis of the classification risk in a mixture of +Gaussians to further motivate our algorithm and support our findings. + +
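+ The abstract's observation that random pruning with appropriate class ratios can help the worst class can be illustrated with a simple class-balanced subsampling routine; the sketch below is our own toy example, not the proposed fairness-aware pruning criterion.

```python
import numpy as np

def class_balanced_prune(labels, keep_fraction=0.5, seed=0):
    """Randomly keep roughly the same number of samples per class, one simple
    way to control class ratios when pruning (illustrative only)."""
    rng = np.random.default_rng(seed)
    labels = np.asarray(labels)
    classes, counts = np.unique(labels, return_counts=True)
    per_class = int(keep_fraction * len(labels) / len(classes))
    keep = []
    for c, n in zip(classes, counts):
        idx = np.flatnonzero(labels == c)
        keep.append(rng.choice(idx, size=min(per_class, n), replace=False))
    return np.sort(np.concatenate(keep))

if __name__ == "__main__":
    y = np.array([0] * 800 + [1] * 150 + [2] * 50)   # long-tailed labels
    kept = class_balanced_prune(y, keep_fraction=0.3)
    print(np.bincount(y[kept]))                      # roughly balanced per class
```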
+
+
+
+
+ + ☆ Social-MAE: Social Masked Autoencoder for Multi-person Motion + Representation Learning + + +
+ For a complete comprehension of multi-person scenes, it is essential to go +beyond basic tasks like detection and tracking. Higher-level tasks, such as +understanding the interactions and social activities among individuals, are +also crucial. Progress towards models that can fully understand scenes +involving multiple people is hindered by a lack of sufficient annotated data +for such high-level tasks. To address this challenge, we introduce Social-MAE, +a simple yet effective transformer-based masked autoencoder framework for +multi-person human motion data. The framework uses masked modeling to pre-train +the encoder to reconstruct masked human joint trajectories, enabling it to +learn generalizable and data efficient representations of motion in human +crowded scenes. Social-MAE comprises a transformer as the MAE encoder and a +lighter-weight transformer as the MAE decoder which operates on multi-person +joints' trajectory in the frequency domain. After the reconstruction task, the +MAE decoder is replaced with a task-specific decoder and the model is +fine-tuned end-to-end for a variety of high-level social tasks. Our proposed +model combined with our pre-training approach achieves the state-of-the-art +results on various high-level social tasks, including multi-person pose +forecasting, social grouping, and social action understanding. These +improvements are demonstrated across four popular multi-person datasets +encompassing both human 2D and 3D body pose. + +
+
+
+
+
+ + ☆ TIM: A Time Interval Machine for Audio-Visual Action Recognition CVPR 2024 + + +
+ Diverse actions give rise to rich audio-visual signals in long videos. Recent +works showcase that the two modalities of audio and video exhibit different +temporal extents of events and distinct labels. We address the interplay +between the two modalities in long videos by explicitly modelling the temporal +extents of audio and visual events. We propose the Time Interval Machine (TIM) +where a modality-specific time interval poses as a query to a transformer +encoder that ingests a long video input. The encoder then attends to the +specified interval, as well as the surrounding context in both modalities, in +order to recognise the ongoing action. + We test TIM on three long audio-visual video datasets: EPIC-KITCHENS, +Perception Test, and AVE, reporting state-of-the-art (SOTA) for recognition. On +EPIC-KITCHENS, we beat previous SOTA that utilises LLMs and significantly +larger pre-training by 2.9% top-1 action recognition accuracy. Additionally, we +show that TIM can be adapted for action detection, using dense multi-scale +interval queries, outperforming SOTA on EPIC-KITCHENS-100 for most metrics, and +showing strong performance on the Perception Test. Our ablations show the +critical role of integrating the two modalities and modelling their time +intervals in achieving this performance. Code and models at: +https://github.com/JacobChalk/TIM + +
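+ The core idea of posing a time interval as a query to a transformer encoder can be sketched as follows; the dimensions, the two-layer encoder, and the IntervalQueryRecognizer module are illustrative assumptions rather than the released TIM implementation.

```python
import torch
import torch.nn as nn

class IntervalQueryRecognizer(nn.Module):
    """Sketch of the abstract's idea: a time interval is embedded as a query
    token, concatenated with the video's feature sequence, and fed to a
    transformer encoder; the query token's output is classified."""
    def __init__(self, dim=256, n_classes=97):
        super().__init__()
        self.interval_embed = nn.Linear(2, dim)          # (start, end) -> token
        layer = nn.TransformerEncoderLayer(d_model=dim, nhead=8, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers=2)
        self.head = nn.Linear(dim, n_classes)

    def forward(self, feats, interval):                  # feats: (B, T, dim), interval: (B, 2)
        q = self.interval_embed(interval).unsqueeze(1)   # (B, 1, dim)
        out = self.encoder(torch.cat([q, feats], dim=1))
        return self.head(out[:, 0])                      # read off the query token

if __name__ == "__main__":
    feats = torch.randn(2, 128, 256)                     # fused audio-visual features
    interval = torch.tensor([[0.10, 0.35], [0.40, 0.80]])
    print(IntervalQueryRecognizer()(feats, interval).shape)
```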
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Investigating the Effectiveness of Cross-Attention to Unlock Zero-Shot + Editing of Text-to-Video Diffusion Models CVPR 2024 + + +
+ With recent advances in image and video diffusion models for content +creation, a plethora of techniques have been proposed for customizing their +generated content. In particular, manipulating the cross-attention layers of +Text-to-Image (T2I) diffusion models has shown great promise in controlling the +shape and location of objects in the scene. Transferring image-editing +techniques to the video domain, however, is extremely challenging as object +motion and temporal consistency are difficult to capture accurately. In this +work, we take a first look at the role of cross-attention in Text-to-Video +(T2V) diffusion models for zero-shot video editing. While one-shot models have +shown potential in controlling motion and camera movement, we demonstrate +zero-shot control over object shape, position and movement in T2V models. We +show that despite the limitations of current T2V models, cross-attention +guidance can be a promising approach for editing videos. + +
+
+ comment: Generative Models for Computer Vision CVPR 2024 Workshop
+
+
+
+
+
+ + ☆ DepthMOT: Depth Cues Lead to a Strong Multi-Object Tracker + + +
+ Accurately distinguishing each object is a fundamental goal of Multi-object
+tracking (MOT) algorithms. However, achieving this goal still remains
+challenging, primarily due to: (i) For crowded scenes with occluded objects,
+the high overlap of object bounding boxes leads to confusion among closely
+located objects. Nevertheless, humans naturally perceive the depth of elements
+in a scene when observing 2D videos. Inspired by this, even though the bounding
+boxes of objects are close on the camera plane, we can differentiate them in
+the depth dimension, thereby establishing a 3D perception of the objects. (ii)
+For videos with rapid, irregular camera motion, abrupt changes in object
+positions can result in ID switches. However, if the camera pose is known, we
+can compensate for the errors in linear motion models. In this paper, we
+propose \textit{DepthMOT}, which achieves: (i) detecting and estimating the
+scene depth map \textit{end-to-end}, and (ii) compensating for irregular camera
+motion by camera pose estimation. Extensive experiments demonstrate the
+superior performance of DepthMOT on the VisDrone-MOT and UAVDT datasets. The
+code will be available at \url{https://github.com/JackWoo0831/DepthMOT}.
+
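+ To illustrate how a depth cue can disambiguate heavily overlapping boxes during association, the toy snippet below adds a depth-difference term to a centre-distance cost and solves the assignment with the Hungarian algorithm; the cost design and weights are hypothetical and are not the DepthMOT pipeline.

```python
import numpy as np
from scipy.optimize import linear_sum_assignment

def depth_aware_assignment(track_boxes, track_depths, det_boxes, det_depths, w_depth=1.0):
    """Toy association: cost = normalized centre distance + weighted depth gap,
    solved with the Hungarian algorithm. Only shows how a depth cue can
    separate overlapping boxes; not the DepthMOT implementation."""
    tc = track_boxes[:, :2] + track_boxes[:, 2:] / 2      # box centres (x, y)
    dc = det_boxes[:, :2] + det_boxes[:, 2:] / 2
    pos = np.linalg.norm(tc[:, None, :] - dc[None, :, :], axis=-1) / 100.0
    depth = np.abs(track_depths[:, None] - det_depths[None, :])
    rows, cols = linear_sum_assignment(pos + w_depth * depth)
    return list(zip(rows.tolist(), cols.tolist()))

if __name__ == "__main__":
    tracks = np.array([[10, 10, 40, 80], [14, 12, 40, 80]], dtype=float)  # x, y, w, h
    dets = np.array([[12, 11, 40, 80], [15, 13, 40, 80]], dtype=float)
    # Nearly identical boxes on the image plane, but depth disambiguates them.
    print(depth_aware_assignment(tracks, np.array([5.0, 9.0]), dets, np.array([8.9, 5.1])))
```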
+
+
+
+
+ ☆ Impact of LiDAR visualisations on semantic segmentation of
+ archaeological objects IGARSS 2024
+
+ Deep learning methods in LiDAR-based archaeological research often leverage +visualisation techniques derived from Digital Elevation Models to enhance +characteristics of archaeological objects present in the images. This paper +investigates the impact of visualisations on deep learning performance through +a comprehensive testing framework. The study involves the use of eight semantic +segmentation models to evaluate seven diverse visualisations across two study +areas, encompassing five archaeological classes. Experimental results reveal +that the choice of appropriate visualisations can influence performance by up +to 8%. Yet, pinpointing one visualisation that outperforms the others in +segmenting all archaeological classes proves challenging. The observed +performance variation, reaching up to 25% across different model +configurations, underscores the importance of thoughtfully selecting model +configurations and LiDAR visualisations for successfully segmenting +archaeological objects. + +
+
+ comment: Accepted to IEEE International Geoscience and Remote Sensing + Symposium 2024 (IGARSS 2024) @IEEE copyright +
+
+
+
+
+ + ☆ Taming Transformers for Realistic Lidar Point Cloud Generation + + +
+ Diffusion Models (DMs) have achieved State-Of-The-Art (SOTA) results in the
+Lidar point cloud generation task, benefiting from their stable training and
+iterative refinement during sampling. However, DMs often fail to realistically
+model Lidar raydrop noise due to their inherent denoising process. To retain
+the strength of iterative sampling while enhancing the generation of raydrop
+noise, we introduce LidarGRIT, a generative model that uses auto-regressive
+transformers to iteratively sample the range images in the latent space rather
+than image space. Furthermore, LidarGRIT utilises VQ-VAE to separately decode
+range images and raydrop masks. Our results show that LidarGRIT achieves
+superior performance compared to SOTA models on KITTI-360 and KITTI odometry
+datasets. Code available at: https://github.com/hamedhaghighi/LidarGRIT.
+
+
+
+
+
+ + ☆ Two-Person Interaction Augmentation with Skeleton Priors + + +
+ Close and continuous interaction with rich contacts is a crucial aspect of +human activities (e.g. hugging, dancing) and of interest in many domains like +activity recognition, motion prediction, character animation, etc. However, +acquiring such skeletal motion is challenging. While direct motion capture is +expensive and slow, motion editing/generation is also non-trivial, as complex +contact patterns with topological and geometric constraints have to be +retained. To this end, we propose a new deep learning method for two-body +skeletal interaction motion augmentation, which can generate variations of +contact-rich interactions with varying body sizes and proportions while +retaining the key geometric/topological relations between two bodies. Our +system can learn effectively from a relatively small amount of data and +generalize to drastically different skeleton sizes. Through exhaustive +evaluation and comparison, we show it can generate high-quality motions, has +strong generalizability and outperforms traditional optimization-based methods +and alternative deep learning solutions. + +
+
+
+
+
+ + ☆ Mind-to-Image: Projecting Visual Mental Imagination of the Brain from + fMRI + + +
+ The reconstruction of images observed by subjects from fMRI data collected
+during visual stimuli has made significant strides in the past decade, thanks
+to the availability of extensive fMRI datasets and advancements in generative
+models for image generation. However, the application of visual reconstruction
+has remained limited. Reconstructing visual imagination presents a greater
+challenge, with potentially revolutionary applications ranging from aiding
+individuals with disabilities to verifying witness accounts in court. The
+primary hurdles in this field are the absence of data collection protocols for
+visual imagery and the lack of datasets on the subject. Traditionally,
+fMRI-to-image relies on data collected from subjects exposed to visual stimuli,
+which poses issues for generating visual imagery, given the difference in brain
+activity between visual stimulation and visual imagery. For the first time, we
+have compiled a substantial dataset (around 6h of scans) on visual imagery
+along with a proposed data collection protocol. We then train a modified
+version of an fMRI-to-image model and demonstrate the feasibility of
+reconstructing images from two modes of imagination: from memory and from pure
+imagination. This marks an important step towards creating a technology that
+allows direct reconstruction of visual imagery.
+
+
+ comment: Pre-print to be updated +
+
+
+
+
+ + ☆ Enhancing Lip Reading with Multi-Scale Video and Multi-Encoder + + +
+ Automatic lip-reading (ALR) aims to automatically transcribe spoken content +from a speaker's silent lip motion captured in video. Current mainstream +lip-reading approaches only use a single visual encoder to model input videos +of a single scale. In this paper, we propose to enhance lipreading by +incorporating multi-scale video data and multi-encoder. Specifically, we first +propose a novel multi-scale lip extraction algorithm based on the size of the +speaker's face and an enhanced ResNet3D visual front-end (VFE) to extract lip +features at different scales. For the multi-encoder, in addition to the +mainstream Transformer and Conformer, we also incorporate the recently proposed +Branchformer and EBranchformer as visual encoders. In the experiments, we +explore the influence of different video data scales and encoders on ALR system +performance and fuse the texts transcribed by all ALR systems using recognizer +output voting error reduction (ROVER). Finally, our proposed approach placed +second in the ICME 2024 ChatCLR Challenge Task 2, with a 21.52% reduction in +character error rate (CER) compared to the official baseline on the evaluation +set. + +
+
+ comment: 6 pages, 3 figures, submitted to ICME2024 GC-ChatCLR +
+
+
+
+
+ + ☆ HAMMR: HierArchical MultiModal React agents for generic VQA + + +
+ Combining Large Language Models (LLMs) with external specialized tools +(LLMs+tools) is a recent paradigm to solve multimodal tasks such as Visual +Question Answering (VQA). While this approach was demonstrated to work well +when optimized and evaluated for each individual benchmark, in practice it is +crucial for the next generation of real-world AI systems to handle a broad +range of multimodal problems. Therefore we pose the VQA problem from a unified +perspective and evaluate a single system on a varied suite of VQA tasks +including counting, spatial reasoning, OCR-based reasoning, visual pointing, +external knowledge, and more. In this setting, we demonstrate that naively +applying the LLM+tools approach using the combined set of all tools leads to +poor results. This motivates us to introduce HAMMR: HierArchical MultiModal +React. We start from a multimodal ReAct-based system and make it hierarchical +by enabling our HAMMR agents to call upon other specialized agents. This +enhances the compositionality of the LLM+tools approach, which we show to be +critical for obtaining high accuracy on generic VQA. Concretely, on our generic +VQA suite, HAMMR outperforms the naive LLM+tools approach by 19.5%. +Additionally, HAMMR achieves state-of-the-art results on this task, +outperforming the generic standalone PaLI-X VQA model by 5.0%. + +
+
+
+
+
+ ☆ Pansharpening of PRISMA products for archaeological prospection
+ IGARSS 2024
+
+ Hyperspectral data recorded from satellite platforms are often ill-suited for
+geo-archaeological prospection due to low spatial resolution. The established
+potential of hyperspectral data from airborne sensors in identifying
+archaeological features has, on the other hand, generated increased interest in
+enhancing hyperspectral data to achieve higher spatial resolution. This
+improvement is crucial for detecting traces linked to sub-surface
+geo-archaeological features and can make satellite hyperspectral acquisitions
+more suitable for archaeological research. This research assesses the usability
+of pansharpened PRISMA satellite products in geo-archaeological prospections.
+Three pansharpening methods (GSA, MTF-GLP and HySure) are compared
+quantitatively and qualitatively and tested over the archaeological landscape
+of Aquileia (Italy). The results suggest that the application of pansharpening
+techniques makes hyperspectral satellite imagery highly suitable, under certain
+conditions, for the identification of sub-surface archaeological features of
+small and large size.
+
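+ As background, pansharpening fuses a high-resolution panchromatic band with lower-resolution spectral bands. The minimal ratio-based (Brovey-style) sketch below only conveys the idea and is far simpler than the GSA, MTF-GLP, and HySure methods compared in the paper.

```python
import numpy as np

def brovey_pansharpen(hs_lowres, pan, eps=1e-6):
    """Very simple ratio-based pansharpening: upsample each spectral band
    (nearest neighbour) and rescale it by pan / intensity. A toy stand-in for
    GSA / MTF-GLP / HySure, only to show the idea."""
    scale_h = pan.shape[0] // hs_lowres.shape[1]
    scale_w = pan.shape[1] // hs_lowres.shape[2]
    up = hs_lowres.repeat(scale_h, axis=1).repeat(scale_w, axis=2)
    intensity = up.mean(axis=0)
    return up * (pan / (intensity + eps))

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    hs = rng.random((50, 60, 60))    # 50 spectral bands at coarse resolution
    pan = rng.random((240, 240))     # panchromatic band at 4x finer resolution
    print(brovey_pansharpen(hs, pan).shape)   # (50, 240, 240)
```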
+
+ comment: Accepted to IEEE International Geoscience and Remote Sensing + Symposium 2024 (IGARSS 2024) @IEEE copyright +
+
+
+
+
+ + ☆ Action-conditioned video data improves predictability + + +
+ Long-term video generation and prediction remain challenging tasks in +computer vision, particularly in partially observable scenarios where cameras +are mounted on moving platforms. The interaction between observed image frames +and the motion of the recording agent introduces additional complexities. To +address these issues, we introduce the Action-Conditioned Video Generation +(ACVG) framework, a novel approach that investigates the relationship between +actions and generated image frames through a deep dual Generator-Actor +architecture. ACVG generates video sequences conditioned on the actions of +robots, enabling exploration and analysis of how vision and action mutually +influence one another in dynamic environments. We evaluate the framework's +effectiveness on an indoor robot motion dataset which consists of sequences of +image frames along with the sequences of actions taken by the robotic agent, +conducting a comprehensive empirical study comparing ACVG to other +state-of-the-art frameworks along with a detailed ablation study. + +
+
+
+
+
+ + ☆ Test-Time Zero-Shot Temporal Action Localization + + +
+ Zero-Shot Temporal Action Localization (ZS-TAL) seeks to identify and locate +actions in untrimmed videos unseen during training. Existing ZS-TAL methods +involve fine-tuning a model on a large amount of annotated training data. While +effective, training-based ZS-TAL approaches assume the availability of labeled +data for supervised learning, which can be impractical in some applications. +Furthermore, the training process naturally induces a domain bias into the +learned model, which may adversely affect the model's generalization ability to +arbitrary videos. These considerations prompt us to approach the ZS-TAL problem +from a radically novel perspective, relaxing the requirement for training data. +To this aim, we introduce a novel method that performs Test-Time adaptation for +Temporal Action Localization (T3AL). In a nutshell, T3AL adapts a pre-trained +Vision and Language Model (VLM). T3AL operates in three steps. First, a +video-level pseudo-label of the action category is computed by aggregating +information from the entire video. Then, action localization is performed +adopting a novel procedure inspired by self-supervised learning. Finally, +frame-level textual descriptions extracted with a state-of-the-art captioning +model are employed for refining the action region proposals. We validate the +effectiveness of T3AL by conducting experiments on the THUMOS14 and the +ActivityNet-v1.3 datasets. Our results demonstrate that T3AL significantly +outperforms zero-shot baselines based on state-of-the-art VLMs, confirming the +benefit of a test-time adaptation approach. + +
+
+
+
+
+ + ☆ Two Hands Are Better Than One: Resolving Hand to Hand Intersections via + Occupancy Networks + + +
+ 3D hand pose estimation from images has seen considerable interest from the +literature, with new methods improving overall 3D accuracy. One current +challenge is to address hand-to-hand interaction where self-occlusions and +finger articulation pose a significant problem to estimation. Little work has +applied physical constraints that minimize the hand intersections that occur as +a result of noisy estimation. This work addresses the intersection of hands by +exploiting an occupancy network that represents the hand's volume as a +continuous manifold. This allows us to model the probability distribution of +points being inside a hand. We designed an intersection loss function to +minimize the likelihood of hand-to-point intersections. Moreover, we propose a +new hand mesh parameterization that is superior to the commonly used MANO model +in many respects including lower mesh complexity, underlying 3D skeleton +extraction, watertightness, etc. On the benchmark InterHand2.6M dataset, the +models trained using our intersection loss achieve better results than the +state-of-the-art by significantly decreasing the number of hand intersections +while lowering the mean per-joint positional error. Additionally, we +demonstrate superior performance for 3D hand uplift on Re:InterHand and SMILE +datasets and show reduced hand-to-hand intersections for complex domains such +as sign-language pose estimation. + +
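+ The intersection loss described above can be sketched as penalizing surface points of one hand that an occupancy network judges to be inside the other hand; the placeholder ToyOccupancy MLP below stands in for the paper's hand-conditioned occupancy model and is only meant to show the shape of the loss.

```python
import torch
import torch.nn as nn

class ToyOccupancy(nn.Module):
    """Stand-in occupancy network: maps 3D points to the probability of being
    inside a hand volume (the real model is conditioned on the hand mesh/pose;
    this placeholder only illustrates the loss)."""
    def __init__(self):
        super().__init__()
        self.mlp = nn.Sequential(nn.Linear(3, 64), nn.ReLU(), nn.Linear(64, 1))

    def forward(self, pts):                  # pts: (N, 3)
        return torch.sigmoid(self.mlp(pts))  # (N, 1) occupancy probability

def intersection_loss(occupancy_of_hand_a, points_on_hand_b):
    """Penalize points sampled on hand B that are judged to lie inside hand A:
    lower is better (fewer inter-hand penetrations)."""
    probs = occupancy_of_hand_a(points_on_hand_b)
    return probs.mean()

if __name__ == "__main__":
    occ_a = ToyOccupancy()
    pts_b = torch.randn(512, 3)              # surface samples of the other hand
    print(intersection_loss(occ_a, pts_b).item())
```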
+
+
+
+
+ + ☆ Anatomical Conditioning for Contrastive Unpaired Image-to-Image + Translation of Optical Coherence Tomography Images + + +
+ For a unified analysis of medical images from different modalities, data +harmonization using image-to-image (I2I) translation is desired. We study this +problem employing an optical coherence tomography (OCT) data set of +Spectralis-OCT and Home-OCT images. I2I translation is challenging because the +images are unpaired, and a bijective mapping does not exist due to the +information discrepancy between both domains. This problem has been addressed +by the Contrastive Learning for Unpaired I2I Translation (CUT) approach, but it +reduces semantic consistency. To restore the semantic consistency, we support +the style decoder using an additional segmentation decoder. Our approach +increases the similarity between the style-translated images and the target +distribution. Importantly, we improve the segmentation of biomarkers in +Home-OCT images in an unsupervised domain adaptation scenario. Our data +harmonization approach provides potential for the monitoring of diseases, e.g., +age related macular disease, using different OCT devices. + +
+
+ comment: Accepted at ISBI 2024 +
+
+
+
+
+ + ☆ PAT: Pixel-wise Adaptive Training for Long-tailed Segmentation + + +
+ Beyond class frequency, we recognize the impact of class-wise relationships
+among various class-specific predictions and the imbalance in label masks on
+long-tailed segmentation learning. To address these challenges, we propose an
+innovative Pixel-wise Adaptive Training (PAT) technique tailored for
+long-tailed segmentation. PAT has two key features: 1) class-wise gradient
+magnitude homogenization, and 2) pixel-wise class-specific loss adaptation
+(PCLA). First, the class-wise gradient magnitude homogenization helps alleviate
+the imbalance among label masks by ensuring equal consideration of the
+class-wise impact on model updates. Second, PCLA tackles the detrimental impact
+of both rare classes within the long-tailed distribution and inaccurate
+predictions from previous training stages by encouraging learning classes with
+low prediction confidence and guarding against forgetting classes with high
+confidence. This combined approach fosters robust learning while preventing the
+model from forgetting previously learned knowledge. PAT exhibits significant
+performance improvements, surpassing the current state-of-the-art by 2.2% on
+the NYU dataset. Moreover, it enhances overall pixel-wise accuracy by 2.85% and
+intersection over union value by 2.07%, with a particularly notable decline of
+0.39% in detecting rare classes compared to Balance Logits Variation, as
+demonstrated on the three popular datasets, i.e., OxfordPetIII, CityScape, and
+NYU.
+
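+ A focal-style reweighting gives a feel for pixel-wise class-specific loss adaptation: pixels on which the model is already confident contribute less, while low-confidence (often rare-class) pixels are emphasized. The formula below is a stand-in of our own, not the exact PCLA objective.

```python
import torch
import torch.nn.functional as F

def confidence_weighted_ce(logits, target, gamma=1.0):
    """Per-pixel cross-entropy reweighted by how unconfident the model is on
    the true class: a focal-style stand-in for pixel-wise class-specific loss
    adaptation (not the paper's exact formula).
    logits: (B, C, H, W), target: (B, H, W) with class indices."""
    ce = F.cross_entropy(logits, target, reduction="none")     # (B, H, W)
    probs = logits.softmax(dim=1)
    p_true = probs.gather(1, target.unsqueeze(1)).squeeze(1)   # (B, H, W)
    weights = (1.0 - p_true) ** gamma
    return (weights * ce).mean()

if __name__ == "__main__":
    logits = torch.randn(2, 5, 32, 32, requires_grad=True)
    target = torch.randint(0, 5, (2, 32, 32))
    loss = confidence_weighted_ce(logits, target)
    loss.backward()
    print(loss.item())
```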
+
+
+
+
+ + ☆ T-DEED: Temporal-Discriminability Enhancer Encoder-Decoder for Precise + Event Spotting in Sports Videos + + +
+ In this paper, we introduce T-DEED, a Temporal-Discriminability Enhancer +Encoder-Decoder for Precise Event Spotting in sports videos. T-DEED addresses +multiple challenges in the task, including the need for discriminability among +frame representations, high output temporal resolution to maintain prediction +precision, and the necessity to capture information at different temporal +scales to handle events with varying dynamics. It tackles these challenges +through its specifically designed architecture, featuring an encoder-decoder +for leveraging multiple temporal scales and achieving high output temporal +resolution, along with temporal modules designed to increase token +discriminability. Leveraging these characteristics, T-DEED achieves SOTA +performance on the FigureSkating and FineDiving datasets. + +
+
+
+
+
+ + ☆ Rethinking the Spatial Inconsistency in Classifier-Free Diffusion + Guidance CVPR-2024 + + +
+ Classifier-Free Guidance (CFG) has been widely used in text-to-image
+diffusion models, where the CFG scale is introduced to control the strength of
+text guidance on the whole image space. However, we argue that a global CFG
+scale results in spatial inconsistency on varying semantic strengths and
+suboptimal image quality. To address this problem, we present a novel approach,
+Semantic-aware Classifier-Free Guidance (S-CFG), to customize the guidance
+degrees for different semantic units in text-to-image diffusion models.
+Specifically, we first design a training-free semantic segmentation method to
+partition the latent image into relatively independent semantic regions at each
+denoising step. In particular, the cross-attention map in the denoising U-net
+backbone is renormalized for assigning each patch to the corresponding token,
+while the self-attention map is used to complete the semantic regions. Then, to
+balance the amplification of diverse semantic units, we adaptively adjust the
+CFG scales across different semantic regions to rescale the text guidance
+degrees into a uniform level. Finally, extensive experiments demonstrate the
+superiority of S-CFG over the original CFG strategy on various text-to-image
+diffusion models, without requiring any extra training cost. Our code is
+available at https://github.com/SmilesDZgk/S-CFG.
+
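+ Standard classifier-free guidance uses one scalar scale; the snippet below shows the mechanical change the abstract implies, applying a spatial map of guidance scales instead. How that scale map is derived from cross- and self-attention is the paper's contribution and is not reproduced here.

```python
import torch

def spatial_cfg(eps_uncond, eps_cond, scale_map):
    """Classifier-free guidance with a per-pixel (per-region) scale map instead
    of one global scale: eps = eps_uncond + s(x) * (eps_cond - eps_uncond)."""
    return eps_uncond + scale_map * (eps_cond - eps_uncond)

if __name__ == "__main__":
    eps_u = torch.randn(1, 4, 64, 64)      # unconditional noise prediction (latent space)
    eps_c = torch.randn(1, 4, 64, 64)      # text-conditional noise prediction
    scales = torch.full((1, 1, 64, 64), 7.5)
    scales[..., :32] = 5.0                 # e.g. weaker guidance on one semantic region
    print(spatial_cfg(eps_u, eps_c, scales).shape)
```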
+
+ comment: accepted by CVPR-2024 +
+
+
+
+
+ + ☆ CDAD-Net: Bridging Domain Gaps in Generalized Category Discovery CVPR + + +
+ In Generalized Category Discovery (GCD), we cluster unlabeled samples of
+known and novel classes, leveraging a training dataset of known classes. A
+salient challenge arises due to domain shifts between these datasets. To
+address this, we present a novel setting: Across Domain Generalized Category
+Discovery (AD-GCD) and bring forth CDAD-NET (Class Discoverer Across Domains)
+as a remedy. CDAD-NET is architected to synchronize potential known class
+samples across both the labeled (source) and unlabeled (target) datasets, while
+emphasizing the distinct categorization of the target data. To facilitate this,
+we propose an entropy-driven adversarial learning strategy that accounts for
+the distance distributions of target samples relative to source-domain class
+prototypes. In parallel, the discriminative nature of the shared space is
+upheld through a fusion of three metric learning objectives. In the source
+domain, our focus is on refining the proximity between samples and their
+affiliated class prototypes, while in the target domain, we integrate a
+neighborhood-centric contrastive learning mechanism, enriched with an adept
+neighbor-mining approach. To further accentuate the nuanced feature
+interrelation among semantically aligned images, we champion the concept of
+conditional image inpainting, underscoring the premise that semantically
+analogous images prove more efficacious to the task than their disjointed
+counterparts. Experimentally, CDAD-NET eclipses existing literature with a
+performance increment of 8-15% on three AD-GCD benchmarks we present.
+
+
+ comment: Accepted in L3D-IVU, CVPR Workshop, 2024 +
+
+
+
+
+ + ☆ Multi-head Attention-based Deep Multiple Instance Learning + + +
+ This paper introduces MAD-MIL, a Multi-head Attention-based Deep Multiple +Instance Learning model, designed for weakly supervised Whole Slide Images +(WSIs) classification in digital pathology. Inspired by the multi-head +attention mechanism of the Transformer, MAD-MIL simplifies model complexity +while achieving competitive results against advanced models like CLAM and +DS-MIL. Evaluated on the MNIST-BAGS and public datasets, including TUPAC16, +TCGA BRCA, TCGA LUNG, and TCGA KIDNEY, MAD-MIL consistently outperforms ABMIL. +This demonstrates enhanced information diversity, interpretability, and +efficiency in slide representation. The model's effectiveness, coupled with +fewer trainable parameters and lower computational complexity makes it a +promising solution for automated pathology workflows. Our code is available at +https://github.com/tueimage/MAD-MIL. + +
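+ A minimal multi-head, ABMIL-style attention pooling over patch embeddings conveys the idea; the layer sizes and the MultiHeadAttnMIL module below are assumptions, and the released MAD-MIL code should be consulted for the actual model.

```python
import torch
import torch.nn as nn

class MultiHeadAttnMIL(nn.Module):
    """ABMIL-style attention pooling with several heads, followed by a linear
    classifier on the concatenated head outputs. A rough sketch of the idea in
    the abstract, not the released MAD-MIL code."""
    def __init__(self, dim=256, heads=4, n_classes=2):
        super().__init__()
        self.heads = nn.ModuleList([
            nn.Sequential(nn.Linear(dim, 128), nn.Tanh(), nn.Linear(128, 1))
            for _ in range(heads)])
        self.classifier = nn.Linear(dim * heads, n_classes)

    def forward(self, bag):                      # bag: (N_patches, dim)
        pooled = []
        for head in self.heads:
            a = torch.softmax(head(bag), dim=0)  # (N, 1) attention over patches
            pooled.append((a * bag).sum(dim=0))  # (dim,) weighted bag embedding
        return self.classifier(torch.cat(pooled))

if __name__ == "__main__":
    patches = torch.randn(1000, 256)             # patch embeddings of one WSI
    print(MultiHeadAttnMIL()(patches).shape)     # torch.Size([2])
```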
+
+
+
+
+ + ☆ CNN-based Game State Detection for a Foosball Table + + +
+ The automation of games using Deep Reinforcement Learning Strategies (DRL) is
+a well-known challenge in AI research. While for feature extraction in a video
+game typically the whole image is used, this is hardly practical for many
+real-world games. Instead, using a smaller game state reducing the dimension of
+the parameter space to include essential parameters only seems to be a
+promising approach. In the game of Foosball, a compact and comprehensive game
+state description consists of the positional shifts and rotations of the
+figures and the position of the ball over time. In particular, velocities and
+accelerations can be derived from consecutive time samples of the game state.
+In this paper, a figure detection system to determine the game state in
+Foosball is presented. We capture a dataset containing the rotations of the
+rods which were measured using accelerometers and the positional shifts were
+derived using traditional Computer Vision techniques (in a laboratory setting).
+This dataset is utilized to train Convolutional Neural Network (CNN) based
+end-to-end regression models to predict the rotations and shifts of each rod.
+We present an evaluation of our system using different state-of-the-art CNNs as
+base architectures for the regression model. We show that our system is able to
+predict the game state with high accuracy. By providing data for both black and
+white teams, the presented system is intended to provide the required data for
+future developments of Imitation Learning techniques with respect to observing
+human players.
+
+
+
+
+
+ + ☆ Iterative Refinement Strategy for Automated Data Labeling: Facial + Landmark Diagnosis in Medical Imaging + + +
+ Automated data labeling techniques are crucial for accelerating the +development of deep learning models, particularly in complex medical imaging +applications. However, ensuring accuracy and efficiency remains challenging. +This paper presents iterative refinement strategies for automated data labeling +in facial landmark diagnosis to enhance accuracy and efficiency for deep +learning models in medical applications, including dermatology, plastic +surgery, and ophthalmology. Leveraging feedback mechanisms and advanced +algorithms, our approach iteratively refines initial labels, reducing reliance +on manual intervention while improving label quality. Through empirical +evaluation and case studies, we demonstrate the effectiveness of our proposed +strategies in deep learning tasks across medical imaging domains. Our results +highlight the importance of iterative refinement in automated data labeling to +enhance the capabilities of deep learning systems in medical imaging +applications. + +
+
+
+
+
+ + ☆ Comparative Analysis of Image Enhancement Techniques for Brain Tumor + Segmentation: Contrast, Histogram, and Hybrid Approaches + + +
+ This study systematically investigates the impact of image enhancement +techniques on Convolutional Neural Network (CNN)-based Brain Tumor +Segmentation, focusing on Histogram Equalization (HE), Contrast Limited +Adaptive Histogram Equalization (CLAHE), and their hybrid variations. Employing +the U-Net architecture on a dataset of 3064 Brain MRI images, the research +delves into preprocessing steps, including resizing and enhancement, to +optimize segmentation accuracy. A detailed analysis of the CNN-based U-Net +architecture, training, and validation processes is provided. The comparative +analysis, utilizing metrics such as Accuracy, Loss, MSE, IoU, and DSC, reveals +that the hybrid approach CLAHE-HE consistently outperforms others. Results +highlight its superior accuracy (0.9982, 0.9939, 0.9936 for training, testing, +and validation, respectively) and robust segmentation overlap, with Jaccard +values of 0.9862, 0.9847, and 0.9864, and Dice values of 0.993, 0.9923, and +0.9932 for the same phases, emphasizing its potential in neuro-oncological +applications. The study concludes with a call for refinement in segmentation +methodologies to further enhance diagnostic precision and treatment planning in +neuro-oncology. + +
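+ One plausible reading of the CLAHE-HE hybrid is applying CLAHE and then global histogram equalization before feeding slices to the U-Net; the OpenCV sketch below follows that reading, and the ordering and parameters are assumptions rather than the paper's exact preprocessing.

```python
import cv2
import numpy as np

def clahe_he(gray, clip_limit=2.0, tile=(8, 8)):
    """Apply CLAHE and then global histogram equalization to an 8-bit
    grayscale MRI slice. One plausible reading of the 'CLAHE-HE' hybrid in the
    abstract; the paper's exact ordering/parameters may differ."""
    clahe = cv2.createCLAHE(clipLimit=clip_limit, tileGridSize=tile)
    return cv2.equalizeHist(clahe.apply(gray))

if __name__ == "__main__":
    # Synthetic low-contrast image in place of a real brain MRI slice.
    img = (np.random.rand(256, 256) * 60 + 80).astype(np.uint8)
    out = clahe_he(img)
    print(out.dtype, out.min(), out.max())
```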
+
+ comment: 9 Pages, & Figures, 2 Tables, International Conference on Computer + Science Electronics and Information (ICCSEI 2023) +
+
+
+
+
+ + ☆ Mask-ControlNet: Higher-Quality Image Generation with An Additional Mask + Prompt + + +
+ Text-to-image generation has witnessed great progress, especially with the +recent advancements in diffusion models. Since texts cannot provide detailed +conditions like object appearance, reference images are usually leveraged for +the control of objects in the generated images. However, existing methods still +suffer limited accuracy when the relationship between the foreground and +background is complicated. To address this issue, we develop a framework termed +Mask-ControlNet by introducing an additional mask prompt. Specifically, we +first employ large vision models to obtain masks to segment the objects of +interest in the reference image. Then, the object images are employed as +additional prompts to facilitate the diffusion model to better understand the +relationship between foreground and background regions during image generation. +Experiments show that the mask prompts enhance the controllability of the +diffusion model to maintain higher fidelity to the reference image while +achieving better image quality. Comparison with previous text-to-image +generation methods demonstrates our method's superior quantitative and +qualitative performance on the benchmark datasets. + +
+
+
+
+
+ + ☆ WebXR, A-Frame and Networked-Aframe as a Basis for an Open Metaverse: A + Conceptual Architecture + + +
+ This work proposes a WebXR-based cross-platform conceptual architecture, +leveraging the A-Frame and Networked-Aframe frameworks, in order to facilitate +the development of an open, accessible, and interoperable metaverse. By +introducing the concept of spatial web app, this research contributes to the +discourse on the metaverse, offering an architecture that democratizes access +to virtual environments and extended reality through the web, and aligns with +Tim Berners-Lee's original vision of the World Wide Web as an open platform in +the digital realm. + +
+
+ comment: 8 pages, 3 figures +
+
+
+
+
+ + ☆ CLIPping the Limits: Finding the Sweet Spot for Relevant Images in + Automated Driving Systems Perception Testing + + +
+ Perception systems, especially cameras, are the eyes of automated driving +systems. Ensuring that they function reliably and robustly is therefore an +important building block in the automation of vehicles. There are various +approaches to test the perception of automated driving systems. Ultimately, +however, it always comes down to the investigation of the behavior of +perception systems under specific input data. Camera images are a crucial part +of the input data. Image data sets are therefore collected for the testing of +automated driving systems, but it is non-trivial to find specific images in +these data sets. Thanks to recent developments in neural networks, there are +now methods for sorting the images in a data set according to their similarity +to a prompt in natural language. In order to further automate the provision of +search results, we make a contribution by automating the threshold definition +in these sorted results and returning only the images relevant to the prompt as +a result. Our focus is on preventing false positives and false negatives +equally. It is also important that our method is robust and in the case that +our assumptions are not fulfilled, we provide a fallback solution. + +
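+ On precomputed text and image embeddings (e.g. from CLIP), automated thresholding can be approximated by cutting the similarity ranking at its largest gap; the heuristic below is our own illustration, not the method proposed in the paper, which explicitly balances false positives and false negatives.

```python
import numpy as np

def select_relevant(image_embs, prompt_emb):
    """Rank images by cosine similarity to the prompt and cut at the largest
    gap in the sorted similarities. A simple heuristic stand-in for automated
    thresholding, using precomputed (e.g. CLIP) embeddings."""
    img = image_embs / np.linalg.norm(image_embs, axis=1, keepdims=True)
    txt = prompt_emb / np.linalg.norm(prompt_emb)
    sims = img @ txt
    order = np.argsort(-sims)
    gaps = -np.diff(sims[order])             # drop between consecutive ranks
    cut = int(np.argmax(gaps)) + 1           # keep everything before the biggest drop
    return order[:cut], sims[order[:cut]]

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    embs = rng.normal(size=(100, 512))
    prompt = rng.normal(size=512)
    idx, sims = select_relevant(embs, prompt)
    print(len(idx), "images kept; top similarity:", round(float(sims[0]), 3))
```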
+
+
+
+
+ + ☆ Human Detection from 4D Radar Data in Low-Visibility Field Conditions ICRA 2024 + + +
+ Autonomous driving technology is increasingly being used on public roads and
+in industrial settings such as mines. While it is essential to detect
+pedestrians, vehicles, or other obstacles, adverse field conditions negatively
+affect the performance of classical sensors such as cameras or lidars. Radar,
+on the other hand, is a promising modality that is less affected by, e.g.,
+dust, smoke, water mist or fog. In particular, modern 4D imaging radars provide
+target responses across the range, vertical angle, horizontal angle and Doppler
+velocity dimensions. We propose TMVA4D, a CNN architecture that leverages this
+4D radar modality for semantic segmentation. The CNN is trained to distinguish
+between the background and person classes based on a series of 2D projections
+of the 4D radar data that include the elevation, azimuth, range, and Doppler
+velocity dimensions. We also outline the process of compiling a novel dataset
+consisting of data collected in industrial settings with a car-mounted 4D radar
+and describe how the ground-truth labels were generated from reference thermal
+images. Using TMVA4D on this dataset, we achieve an mIoU score of 78.2% and an
+mDice score of 86.1%, evaluated on the two classes, background and person.
+
+
+ comment: Submitted to Radar in Robotics workshop at ICRA 2024 +
+
+
+
+
+ + ☆ Texture Classification Network Integrating Adaptive Wavelet Transform + + +
+ Graves' disease is a common condition that is diagnosed clinically by
+determining the smoothness of the thyroid texture and its morphology in
+ultrasound images. Currently, the most widely used approach for the automated
+diagnosis of Graves' disease utilizes Convolutional Neural Networks (CNNs) for
+both feature extraction and classification. However, these methods demonstrate
+limited efficacy in capturing texture features. Given the high capacity of
+wavelets in describing texture features, this research integrates learnable
+wavelet modules utilizing the Lifting Scheme into CNNs and incorporates a
+parallel wavelet branch into the ResNet18 model to enhance texture feature
+extraction. Our model can analyze texture features in spatial and frequency
+domains simultaneously, leading to optimized classification accuracy. We
+conducted experiments on collected ultrasound datasets and publicly available
+natural image texture datasets; our proposed network achieved 97.27% accuracy
+and 95.60% recall on the ultrasound datasets and 60.765% accuracy on the
+natural image texture datasets, surpassing the accuracy of ResNet and
+confirming the effectiveness of our approach.
+
+
+
+
+
+ + ☆ MindSet: Vision. A toolbox for testing DNNs on key psychological + experiments + + +
+ Multiple benchmarks have been developed to assess the alignment between deep +neural networks (DNNs) and human vision. In almost all cases these benchmarks +are observational in the sense they are composed of behavioural and brain +responses to naturalistic images that have not been manipulated to test +hypotheses regarding how DNNs or humans perceive and identify objects. Here we +introduce the toolbox MindSet: Vision, consisting of a collection of image +datasets and related scripts designed to test DNNs on 30 psychological +findings. In all experimental conditions, the stimuli are systematically +manipulated to test specific hypotheses regarding human visual perception and +object recognition. In addition to providing pre-generated datasets of images, +we provide code to regenerate these datasets, offering many configurable +parameters which greatly extend the dataset versatility for different research +contexts, and code to facilitate the testing of DNNs on these image datasets +using three different methods (similarity judgments, out-of-distribution +classification, and decoder method), accessible at +https://github.com/MindSetVision/mindset-vision. We test ResNet-152 on each of +these methods as an example of how the toolbox can be used. + +
+
+
+
+
+ + ☆ Detecting Every Object from Events + + +
+ Object detection is critical in autonomous driving, and it is more practical +yet challenging to localize objects of unknown categories: an endeavour known +as Class-Agnostic Object Detection (CAOD). Existing studies on CAOD +predominantly rely on ordinary cameras, but these frame-based sensors usually +have high latency and limited dynamic range, leading to safety risks in +real-world scenarios. In this study, we turn to a new modality enabled by the +so-called event camera, featured by its sub-millisecond latency and high +dynamic range, for robust CAOD. We propose Detecting Every Object in Events +(DEOE), an approach tailored for achieving high-speed, class-agnostic +open-world object detection in event-based vision. Built upon the fast +event-based backbone: recurrent vision transformer, we jointly consider the +spatial and temporal consistencies to identify potential objects. The +discovered potential objects are assimilated as soft positive samples to avoid +being suppressed as background. Moreover, we introduce a disentangled +objectness head to separate the foreground-background classification and novel +object discovery tasks, enhancing the model's generalization in localizing +novel objects while maintaining a strong ability to filter out the background. +Extensive experiments confirm the superiority of our proposed DEOE in +comparison with three strong baseline methods that integrate the +state-of-the-art event-based object detector with advancements in RGB-based +CAOD. Our code is available at https://github.com/Hatins/DEOE. + +
+
+
+
+
+ + ☆ MOSE: Boosting Vision-based Roadside 3D Object Detection with Scene Cues + + +
+ 3D object detection based on roadside cameras is an additional way for
+autonomous driving to alleviate the challenges of occlusion and short
+perception range from vehicle cameras. Previous methods for roadside 3D object
+detection mainly focus on modeling the depth or height of objects, neglecting
+the stationary nature of cameras and the characteristic of inter-frame
+consistency. In this work, we propose a novel framework, namely MOSE, for
+MOnocular 3D object detection with Scene cuEs. The scene cues are the
+frame-invariant scene-specific features, which are crucial for object
+localization and can be intuitively regarded as the height between the surface
+of the real road and the virtual ground plane. In the proposed framework, a
+scene cue bank is designed to aggregate scene cues from multiple frames of the
+same scene with a carefully designed extrinsic augmentation strategy. Then, a
+transformer-based decoder lifts the aggregated scene cues as well as the 3D
+position embeddings for 3D object location, which boosts generalization ability
+in heterologous scenes. The extensive experiment results on two public
+benchmarks demonstrate the state-of-the-art performance of the proposed method,
+which surpasses the existing methods by a large margin.
+
+
+
+
+
+ + ☆ Deep Optics for Video Snapshot Compressive Imaging ICCV 2023 + + +
+ Video snapshot compressive imaging (SCI) aims to capture a sequence of video +frames with only a single shot of a 2D detector, whose backbones rest in +optical modulation patterns (also known as masks) and a computational +reconstruction algorithm. Advanced deep learning algorithms and mature hardware +are putting video SCI into practical applications. Yet, there are two clouds in +the sunshine of SCI: i) low dynamic range as a victim of high temporal +multiplexing, and ii) existing deep learning algorithms' degradation on real +system. To address these challenges, this paper presents a deep optics +framework to jointly optimize masks and a reconstruction network. Specifically, +we first propose a new type of structural mask to realize motion-aware and +full-dynamic-range measurement. Considering the motion awareness property in +measurement domain, we develop an efficient network for video SCI +reconstruction using Transformer to capture long-term temporal dependencies, +dubbed Res2former. Moreover, sensor response is introduced into the forward +model of video SCI to guarantee end-to-end model training close to real system. +Finally, we implement the learned structural masks on a digital micro-mirror +device. Experimental results on synthetic and real data validate the +effectiveness of the proposed framework. We believe this is a milestone for +real-world video SCI. The source code and data are available at +https://github.com/pwangcs/DeepOpticsSCI. + +
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ☆ MC$^2$: Multi-concept Guidance for Customized Multi-concept Generation + + +
+ Customized text-to-image generation aims to synthesize instantiations of
+user-specified concepts and has achieved unprecedented progress in handling
+individual concepts. However, when extending to multiple customized concepts,
+existing methods exhibit limitations in terms of flexibility and fidelity, only
+accommodating the combination of limited types of models and potentially
+resulting in a mix of characteristics from different concepts. In this paper,
+we introduce the Multi-concept guidance for Multi-concept customization, termed
+MC$^2$, for improved flexibility and fidelity. MC$^2$ decouples the
+requirements for model architecture via inference time optimization, allowing
+the integration of various heterogeneous single-concept customized models. It
+adaptively refines the attention weights between visual and textual tokens,
+directing image regions to focus on their associated words while diminishing
+the impact of irrelevant ones. Extensive experiments demonstrate that MC$^2$
+even surpasses previous methods that require additional training in terms of
+consistency with input prompt and reference images. Moreover, MC$^2$ can be
+extended to elevate the compositional capabilities of text-to-image generation,
+yielding appealing results. Code will be publicly available at
+https://github.com/JIANGJiaXiu/MC-2.
+
+
+
+
+
+ + ☆ Unbridled Icarus: A Survey of the Potential Perils of Image Inputs in + Multimodal Large Language Model Security + + +
+ Multimodal Large Language Models (MLLMs) demonstrate remarkable capabilities
+that increasingly influence various aspects of our daily lives, constantly
+defining the new boundary of Artificial General Intelligence (AGI). Image
+modalities, enriched with profound semantic information and a more continuous
+mathematical nature compared to other modalities, greatly enhance the
+functionalities of MLLMs when integrated. However, this integration serves as a
+double-edged sword, providing attackers with expansive vulnerabilities to
+exploit for highly covert and harmful attacks. The pursuit of reliable AI
+systems like powerful MLLMs has emerged as a pivotal area of contemporary
+research. In this paper, we endeavor to demonstrate the multifaceted risks
+associated with the incorporation of image modalities into MLLMs. Initially, we
+delineate the foundational components and training processes of MLLMs.
+Subsequently, we construct a threat model, outlining the security
+vulnerabilities intrinsic to MLLMs. Moreover, we analyze and summarize existing
+scholarly discourses on MLLMs' attack and defense mechanisms, culminating in
+suggestions for future research on MLLM security. Through this comprehensive
+analysis, we aim to deepen the academic understanding of MLLM security
+challenges and propel forward the development of trustworthy MLLM systems.
+
+
+ comment: 8 pages, 1 figure +
+
+
+
+
+ + ☆ Unsupervised Band Selection Using Fused HSI and LiDAR Attention + Integrating With Autoencoder + + +
+ Band selection in hyperspectral imaging (HSI) is critical for optimising data
+processing and enhancing analytical accuracy. Traditional approaches have
+predominantly concentrated on analysing spectral and pixel characteristics
+within individual bands independently. These approaches overlook the potential
+benefits of integrating multiple data sources, such as Light Detection and
+Ranging (LiDAR), and are further challenged by the limited availability of
+labeled data in HSI processing, which represents a significant obstacle. To
+address these challenges, this paper introduces a novel unsupervised band
+selection framework that incorporates attention mechanisms and an Autoencoder
+for reconstruction-based band selection. Our methodology distinctively
+integrates HSI with LiDAR data through an attention score, using a
+convolutional Autoencoder to process the combined feature mask. This fusion
+effectively captures essential spatial and spectral features and reduces
+redundancy in hyperspectral datasets. A comprehensive comparative analysis of
+our innovative fused band selection approach is performed against existing
+unsupervised band selection and fusion models. We used datasets such as
+Houston 2013, Trento, and MUUFLE for our experiments. The results demonstrate
+that our method achieves superior classification accuracy and significantly
+outperforms existing models. This enhancement in HSI band selection,
+facilitated by the incorporation of LiDAR features, underscores the
+considerable advantages of integrating features from different sources.
+
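+ Reconstruction-driven band selection can be sketched with learnable per-band gates in front of a small autoencoder, keeping the bands with the largest gates after training; the BandWeightedAE toy below is our own simplification and omits the paper's HSI-LiDAR attention fusion entirely.

```python
import torch
import torch.nn as nn

class BandWeightedAE(nn.Module):
    """Toy reconstruction-driven band selection: learnable non-negative
    per-band gates in front of a small autoencoder; after training, the bands
    with the largest gates are 'selected'. The HSI-LiDAR attention fusion of
    the paper is omitted."""
    def __init__(self, n_bands=100, latent=16):
        super().__init__()
        self.gates = nn.Parameter(torch.ones(n_bands))
        self.enc = nn.Sequential(nn.Linear(n_bands, latent), nn.ReLU())
        self.dec = nn.Linear(latent, n_bands)

    def forward(self, x):                        # x: (batch, n_bands) pixel spectra
        gated = x * torch.relu(self.gates)
        return self.dec(self.enc(gated))

if __name__ == "__main__":
    model = BandWeightedAE()
    opt = torch.optim.Adam(model.parameters(), lr=1e-2)
    x = torch.rand(256, 100)                     # fake pixel spectra
    for _ in range(50):
        # Reconstruction loss plus a small sparsity penalty on the gates.
        loss = ((model(x) - x) ** 2).mean() + 1e-3 * torch.relu(model.gates).sum()
        opt.zero_grad()
        loss.backward()
        opt.step()
    top = torch.topk(torch.relu(model.gates), k=10).indices
    print("selected bands:", sorted(top.tolist()))
```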
+
+ comment: 13 pages, 13 figures, 6 tables +</p>
+
+
+
+
+ + ☆ Text-to-Image Synthesis for Any Artistic Styles: Advancements in + Personalized Artistic Image Generation via Subdivision and Dual Binding + + +
+ Recent advancements in text-to-image models, such as Stable Diffusion, have +demonstrated their ability to synthesize visual images through natural language +prompts. One approach to personalizing text-to-image models, exemplified by +DreamBooth, fine-tunes the pre-trained model by binding unique text identifiers +with a few images of a specific subject. Although existing fine-tuning methods +have demonstrated competence in rendering images according to the styles of +famous painters, it is still challenging to learn to produce images +encapsulating distinct art styles due to abstract and broad visual perceptions +of stylistic attributes such as lines, shapes, textures, and colors. In this +paper, we introduce a new method, Single-StyleForge, for personalization. It +fine-tunes pre-trained text-to-image diffusion models to generate diverse +images in specified styles from text prompts. By using around 15-20 images of +the target style, the approach establishes a foundational binding of a unique +token identifier with a broad range of the target style. It also utilizes +auxiliary images to strengthen this binding, offering specific +guidance on representing elements such as persons in a target style-consistent +manner. In addition, we present ways to improve the quality of style and +text-image alignment through a method called Multi-StyleForge, which inherits +the strategy used in StyleForge and learns multiple tokens. Experimental +evaluation conducted on six distinct artistic styles demonstrates substantial +improvements in both the quality of generated images and perceptual +fidelity metrics such as FID, KID, and CLIP scores. + +</p>
+
+ comment: 20 pages, 12 figures +</p>
+
+
+
+
+ + ☆ CodeEnhance: A Codebook-Driven Approach for Low-Light Image Enhancement + + +
+ Low-light image enhancement (LLIE) aims to improve low-illumination images. +However, existing methods face two challenges: (1) uncertainty in restoration +from diverse brightness degradations; (2) loss of texture and color information +caused by noise suppression and light enhancement. In this paper, we propose a +novel enhancement approach, CodeEnhance, by leveraging quantized priors and +image refinement to address these challenges. In particular, we reframe LLIE as +learning an image-to-code mapping from low-light images to a discrete codebook +that has been learned from high-quality images. To enhance this process, a +Semantic Embedding Module (SEM) is introduced to integrate semantic information +with low-level features, and a Codebook Shift (CS) mechanism is designed to adapt +the pre-learned codebook to better suit the distinct characteristics of our +low-light dataset. Additionally, we present an Interactive Feature +Transformation (IFT) module to refine texture and color information during +image reconstruction, allowing for interactive enhancement based on user +preferences. Extensive experiments on both real-world and synthetic benchmarks +demonstrate that the incorporation of prior knowledge and controllable +information transfer significantly enhances LLIE performance in terms of +quality and fidelity. The proposed CodeEnhance exhibits superior robustness to +various degradations, including uneven illumination, noise, and color +distortion. + +</p>
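As a rough picture of the image-to-code idea, the sketch below maps encoder features of a low-light image to their nearest entries in a codebook learned from high-quality images; the shapes, codebook size, and random placeholders are assumptions for illustration only, not the CodeEnhance implementation.

import torch

def quantize_to_codebook(features, codebook):
    # features: [B, N, C] encoder outputs; codebook: [K, C] entries learned from high-quality images.
    # Each feature is replaced by its nearest codebook entry (the "image-to-code" lookup);
    # a learnable shift of the codebook entries would play the role of the Codebook Shift idea.
    B, N, C = features.shape
    d = torch.cdist(features.reshape(B * N, C), codebook)   # [B*N, K] pairwise distances
    codes = d.argmin(dim=-1).reshape(B, N)                   # discrete codes
    return codebook[codes], codes

codebook = torch.randn(1024, 256)              # placeholder high-quality codebook
feats = torch.randn(2, 16 * 16, 256)           # placeholder low-light encoder features
quantized, codes = quantize_to_codebook(feats, codebook)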
+
+ comment: 10 pages, 13 figures +
+
+
+
+
+ + ☆ Allowing humans to interactively guide machines where to look does not + always improve a human-AI team's classification accuracy CVPR + 2024 + + +
+ Via thousands of papers in Explainable AI (XAI), attention maps +\cite{vaswani2017attention} and feature attribution maps \cite{bansal2020sam} +have been established as a common means for explaining the input features that +are important to AI's decisions. It is an interesting but unexplored question +whether allowing users to edit the importance scores of input features at test +time would improve the human-AI team's accuracy on downstream tasks. In this +paper, we address this question by taking CHM-Corr, a state-of-the-art, +ante-hoc explanation method \cite{taesiri2022visual} that first predicts +patch-wise correspondences between the input and the training-set images, and +then uses them to make classification decisions. We build an interactive +interface on top of CHM-Corr, enabling users to directly edit the initial +feature attribution map provided by CHM-Corr. Via our CHM-Corr++ interface, +users gain insights into whether, when, and how the model changes its outputs, +enhancing understanding beyond static explanations. Our user study with 18 +machine learning researchers who performed $\sim$1,400 decisions shows that our +interactive approach does not improve user accuracy on CUB-200 bird image +classification over static explanations. This challenges the belief that +interactivity inherently boosts XAI +effectiveness~\cite{sokol2020one,sun2022exploring,shen2024towards,singh2024rethinking,mindlin2024beyond,lakkaraju2022rethinking,cheng2019explaining,liu2021understanding} +and raises the need for future research. Our work contributes to the field by +open-sourcing an interactive tool for manipulating model attention, and it lays +the groundwork for future research to enable effective human-AI interaction in +computer vision. We release code and data on +\href{https://anonymous.4open.science/r/CHMCorrPlusPlus/}{github}. Our +interface is available \href{http://137.184.82.109:7080/}{here}. + +</p>
+
+ comment: Accepted for presentation at the XAI4CV Workshop, part of the CVPR + 2024 proceedings +
+
+
+
+
+ + ☆ Stylizing Sparse-View 3D Scenes with Hierarchical Neural Representation + + +
+ Recently, a surge of 3D style transfer methods has been proposed that +leverage the scene reconstruction power of a pre-trained neural radiance field +(NeRF). To successfully stylize a scene this way, one must first reconstruct a +photo-realistic radiance field from collected images of the scene. However, +when only sparse input views are available, pre-trained few-shot NeRFs often +suffer from high-frequency artifacts, which are generated as a by-product of +high-frequency details for improving reconstruction quality. Is it possible to +generate more faithful stylized scenes from sparse inputs by directly +optimizing encoding-based scene representation with target style? In this +paper, we consider the stylization of sparse-view scenes in terms of +disentangling content semantics and style textures. We propose a coarse-to-fine +sparse-view scene stylization framework, where a novel hierarchical +encoding-based neural representation is designed to generate high-quality +stylized scenes directly from implicit scene representations. We also propose a +new optimization strategy with content strength annealing to achieve realistic +stylization and better content preservation. Extensive experiments demonstrate +that our method can achieve high-quality stylization of sparse-view scenes and +outperforms fine-tuning-based baselines in terms of stylization quality and +efficiency. + +
+
+
+
+
+ + ☆ PromptAD: Learning Prompts with only Normal Samples for Few-Shot Anomaly + Detection CVPR2024 + + +
+ Vision-language models have brought great improvements to few-shot +industrial anomaly detection, which usually requires designing hundreds of +prompts through prompt engineering. For automated scenarios, we first use +conventional prompt learning with the many-class paradigm as the baseline to +automatically learn prompts, but find that it does not work well in one-class +anomaly detection. To address the above problem, this paper proposes a +one-class prompt learning method for few-shot anomaly detection, termed +PromptAD. First, we propose semantic concatenation, which can transpose normal +prompts into anomaly prompts by concatenating normal prompts with anomaly +suffixes, thus constructing a large number of negative samples used to guide +prompt learning in the one-class setting. Furthermore, to mitigate the training +challenge caused by the absence of anomaly images, we introduce the concept of +an explicit anomaly margin, which is used to explicitly control the margin between +normal prompt features and anomaly prompt features through a hyper-parameter. +For image-level/pixel-level anomaly detection, PromptAD achieves first place in +11/12 few-shot settings on MVTec and VisA. + +</p>
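To make the two ideas above concrete, here is a small, hypothetical sketch of semantic concatenation (building anomaly prompts from normal prompts plus anomaly suffixes) and a margin term that keeps normal image features closer to normal prompts than to anomaly prompts. The prompt texts, random placeholder features, and margin value are illustrative assumptions; the real method uses CLIP-style encoders and learnable prompt tokens.

import torch
import torch.nn.functional as F

normal_prompts = ["a photo of a flawless {}", "a close-up photo of a {}"]
anomaly_suffixes = ["with a scratch", "with a crack", "with a missing part"]
# semantic concatenation: transpose normal prompts into anomaly prompts
anomaly_prompts = [f"{p} {s}" for p in normal_prompts for s in anomaly_suffixes]

# placeholder features standing in for encoded images and prompts (all L2-normalised)
img_feat = F.normalize(torch.randn(8, 512), dim=-1)                       # normal training images
normal_feat = F.normalize(torch.randn(len(normal_prompts), 512), dim=-1)
anomaly_feat = F.normalize(torch.randn(len(anomaly_prompts), 512), dim=-1)

d_normal = 1 - (img_feat @ normal_feat.T).max(dim=1).values    # distance to closest normal prompt
d_anomaly = 1 - (img_feat @ anomaly_feat.T).max(dim=1).values  # distance to closest anomaly prompt
margin = 0.1                                                   # explicit-anomaly-margin hyper-parameter
margin_loss = F.relu(d_normal - d_anomaly + margin).mean()     # keep normal images on the normal side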
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ LayoutLLM: Layout Instruction Tuning with Large Language Models for + Document Understanding CVPR 2024 + + +
+ Recently, leveraging large language models (LLMs) or multimodal large +language models (MLLMs) for document understanding has been proven very +promising. However, previous works that employ LLMs/MLLMs for document +understanding have not fully explored and utilized the document layout +information, which is vital for precise document understanding. In this paper, +we propose LayoutLLM, an LLM/MLLM based method for document understanding. The +core of LayoutLLM is a layout instruction tuning strategy, which is specially +designed to enhance the comprehension and utilization of document layouts. The +proposed layout instruction tuning strategy consists of two components: +Layout-aware Pre-training and Layout-aware Supervised Fine-tuning. To capture +the characteristics of document layout in Layout-aware Pre-training, three +groups of pre-training tasks, corresponding to document-level, region-level and +segment-level information, are introduced. Furthermore, a novel module called +layout chain-of-thought (LayoutCoT) is devised to enable LayoutLLM to focus on +regions relevant to the question and generate accurate answers. LayoutCoT is +effective for boosting the performance of document understanding. Meanwhile, it +brings a certain degree of interpretability, which could facilitate manual +inspection and correction. Experiments on standard benchmarks show that the +proposed LayoutLLM significantly outperforms existing methods that adopt +open-source 7B LLMs/MLLMs for document understanding. The training data of the +LayoutLLM is publicly available at +https://github.com/AlibabaResearch/AdvancedLiterateMachinery/tree/main/DocumentUnderstanding/LayoutLLM + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ StylizedGS: Controllable Stylization for 3D Gaussian Splatting + + +
+ With the rapid development of XR, 3D generation and editing are becoming more +and more important, among which stylization is an important tool for 3D +appearance editing. It can achieve consistent 3D artistic stylization given a +single reference style image and is thus a user-friendly editing approach. However, +recent NeRF-based 3D stylization methods face efficiency issues that affect the +actual user experience, and their implicit nature limits their ability to transfer +geometric pattern styles. Additionally, the ability for artists to exert +flexible control over stylized scenes is considered highly desirable, fostering +an environment conducive to creative exploration. In this paper, we introduce +StylizedGS, a 3D neural style transfer framework with adaptable control over +perceptual factors based on the 3D Gaussian Splatting (3DGS) representation, which +brings the benefit of high efficiency. We propose a GS filter to eliminate, before +stylization, floaters in the reconstruction that would otherwise degrade the +stylization results. A nearest neighbor-based style loss is then introduced to +achieve stylization by fine-tuning the geometry and color parameters of 3DGS, +while a depth preservation loss with other regularizations is proposed to +prevent tampering with the geometric content. Moreover, facilitated by specially +designed losses, StylizedGS enables users to control the color, scale and +regions of stylization, providing customized capabilities. Our method +can attain high-quality stylization results characterized by faithful +brushstrokes and geometric consistency with flexible controls. Extensive +experiments across various scenes and styles demonstrate the effectiveness and +efficiency of our method concerning both stylization quality and inference FPS. + +</p>
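A minimal sketch of the two loss ideas mentioned above, under the assumption that per-pixel features of the rendered and style images are already extracted (e.g. by a VGG-like encoder): a nearest-neighbour feature matching term for style, and an L1 depth preservation term. Tensor shapes and the weighting are placeholders, not the StylizedGS settings.

import torch
import torch.nn.functional as F

def nn_style_loss(feat_render, feat_style):
    # feat_render: [N, C], feat_style: [M, C]; match each rendered feature to its
    # nearest style feature and minimise the cosine distance to that match.
    fr = F.normalize(feat_render, dim=-1)
    fs = F.normalize(feat_style, dim=-1)
    return (1 - (fr @ fs.T).max(dim=1).values).mean()

def depth_preservation_loss(depth_stylized, depth_original):
    # keep the stylized scene's rendered depth close to the original reconstruction
    return (depth_stylized - depth_original).abs().mean()

loss = nn_style_loss(torch.randn(256, 64), torch.randn(300, 64)) \
       + 0.5 * depth_preservation_loss(torch.rand(1, 64, 64), torch.rand(1, 64, 64))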
+
+
+
+
+ + ☆ Multi-agent Long-term 3D Human Pose Forecasting via Interaction-aware + Trajectory Conditioning CVPR + + +
+ Human pose forecasting garners attention for its diverse applications. +However, challenges in modeling the multi-modal nature of human motion and +intricate interactions among agents persist, particularly with longer +timescales and more agents. In this paper, we propose an interaction-aware +trajectory-conditioned long-term multi-agent human pose forecasting model, +utilizing a coarse-to-fine prediction approach: multi-modal global trajectories +are initially forecasted, followed by respective local pose forecasts +conditioned on each mode. In doing so, our Trajectory2Pose model introduces a +graph-based agent-wise interaction module for a reciprocal forecast of local +motion-conditioned global trajectory and trajectory-conditioned local pose. Our +model effectively handles the multi-modality of human motion and the complexity +of long-term multi-agent interactions, improving performance in complex +environments. Furthermore, we address the lack of long-term (6s+) multi-agent +(5+) datasets by constructing a new dataset from real-world images and 2D +annotations, enabling a comprehensive evaluation of our proposed model. +State-of-the-art prediction performance on both complex and simpler datasets +confirms the generalized effectiveness of our method. The code is available at +https://github.com/Jaewoo97/T2P. + +
+
+ comment: 2024 CVPR Highlight +
+
+
+
+
+ + ☆ Spatio-Temporal Attention and Gaussian Processes for Personalized Video + Gaze Estimation CVPR 2024 + + +
+ Gaze is an essential prompt for analyzing human behavior and attention. +Recently, there has been an increasing interest in determining gaze direction +from facial videos. However, video gaze estimation faces significant +challenges, such as understanding the dynamic evolution of gaze in video +sequences, dealing with static backgrounds, and adapting to variations in +illumination. To address these challenges, we propose a simple and novel deep +learning model designed to estimate gaze from videos, incorporating a +specialized attention module. Our method employs a spatial attention mechanism +that tracks spatial dynamics within videos. This technique enables accurate +gaze direction prediction through a temporal sequence model, adeptly +transforming spatial observations into temporal insights, thereby significantly +improving gaze estimation accuracy. Additionally, our approach integrates +Gaussian processes to include individual-specific traits, facilitating the +personalization of our model with just a few labeled samples. Experimental +results confirm the efficacy of the proposed approach, demonstrating its +success in both within-dataset and cross-dataset settings. Specifically, our +proposed approach achieves state-of-the-art performance on the Gaze360 dataset, +improving by $2.5^\circ$ without personalization. Further, by personalizing the +model with just three samples, we achieved an additional improvement of +$0.8^\circ$. The code and pre-trained models are available at +\url{https://github.com/jswati31/stage}. + +
+
+ comment: Accepted at CVPR 2024 Gaze workshop +
+
+
+
+
+ + ☆ DiffCJK: Conditional Diffusion Model for High-Quality and Wide-coverage + CJK Character Generation + + +
+ Chinese, Japanese, and Korean (CJK), with a vast number of native speakers, +have a profound influence on society and culture. The typesetting of CJK languages +carries a wide range of requirements due to the complexity of their scripts and +unique literary traditions. A critical aspect of this typesetting process is +that CJK fonts need to provide a set of consistent-looking glyphs for +approximately one hundred thousand characters. However, creating such a font is +inherently labor-intensive and expensive, which significantly hampers the +development of new CJK fonts for typesetting, historical, aesthetic, or +artistic purposes. + To bridge this gap, we are motivated by recent advancements in +diffusion-based generative models and propose a novel diffusion method for +generating glyphs in a targeted style from a \emph{single} conditioned, +standard glyph form. Our experiments show that our method is capable of +generating fonts of both printed and hand-written styles, the latter of which +presents a greater challenge. Moreover, our approach shows remarkable zero-shot +generalization capabilities for non-CJK but Chinese-inspired scripts. We also +show our method facilitates smooth style interpolation and generates bitmap +images suitable for vectorization, which is crucial in the font creation +process. In summary, our proposed method opens the door to high-quality, +generative model-assisted font creation for CJK characters, for both +typesetting and artistic endeavors. + +</p>
+
+
+
+
+ + ☆ Multi-level Graph Subspace Contrastive Learning for Hyperspectral Image + Clustering + + +
+ Hyperspectral image (HSI) clustering is a challenging task due to its high +complexity. Although subspace clustering shows impressive performance for HSI, +traditional methods tend to ignore the global-local interaction in HSI data. In +this study, we propose a multi-level graph subspace contrastive learning +(MLGSC) method for HSI clustering. The model is divided into the following main parts. +Graph convolution subspace construction: utilizing spectral and texture +features to construct two graph convolution views. Local-global graph +representation: local graph representations are obtained by step-by-step +convolutions and a more representative global graph representation is obtained +using an attention-based pooling strategy. Multi-level graph subspace +contrastive learning: multi-level contrastive learning is conducted to obtain +local-global joint graph representations, to improve the consistency of the +positive samples between views, and to obtain more robust graph embeddings. +Specifically, graph-level contrastive learning is used to better learn global +representations of HSI data. Node-level intra-view and inter-view contrastive +learning is designed to learn joint representations of local regions of HSI. +The proposed model is evaluated on four popular HSI datasets: Indian Pines, +Pavia University, Houston, and Xu Zhou. The overall accuracies are 97.75%, +99.96%, 92.28%, and 95.73%, significantly outperforming the current +state-of-the-art clustering methods. + +</p>
+
+ comment: IJCNN 2024 +
+
+
+
+
+ + ☆ Bidirectional Long-Range Parser for Sequential Data Understanding + + +
+ The transformer is a powerful data modelling framework responsible for +remarkable performance on a wide range of tasks. However, transformers are limited in +terms of scalability, as processing long-sequence data is suboptimal and +inefficient. To this end, we introduce BLRP (Bidirectional Long-Range +Parser), a novel and versatile attention mechanism designed to increase +performance and efficiency on long-sequence tasks. It leverages short- and long-range +heuristics in the form of a local sliding window approach combined with a +global bidirectional latent space synthesis technique. We show the benefits and +versatility of our approach on vision and language domains by demonstrating +competitive results against state-of-the-art methods on the Long-Range-Arena +and CIFAR benchmarks, together with ablations demonstrating its computational +efficiency. + +</p>
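The combination of a local sliding window with a global bidirectional latent summary can be sketched roughly as below; this is a simplified stand-in, not the actual BLRP block, and the window size, latent count, and residual wiring are assumptions.

import torch
import torch.nn as nn

class LocalGlobalAttention(nn.Module):
    # Local sliding-window self-attention plus a small set of global latent tokens
    # that read from the whole sequence and write global context back to every token.
    def __init__(self, dim=64, heads=4, window=16, n_latents=8):
        super().__init__()
        self.window = window
        self.latents = nn.Parameter(torch.randn(n_latents, dim))
        self.local_attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.read = nn.MultiheadAttention(dim, heads, batch_first=True)    # latents <- sequence
        self.write = nn.MultiheadAttention(dim, heads, batch_first=True)   # tokens  <- latents

    def forward(self, x):                                  # x: [B, L, D], L divisible by window
        B, L, D = x.shape
        w = x.reshape(B * (L // self.window), self.window, D)
        local, _ = self.local_attn(w, w, w)                # attention inside each window
        local = local.reshape(B, L, D)
        lat = self.latents.expand(B, -1, -1)
        lat, _ = self.read(lat, x, x)                      # global bidirectional synthesis
        glob, _ = self.write(x, lat, lat)                  # broadcast global context back
        return x + local + glob

y = LocalGlobalAttention()(torch.randn(2, 64, 64))         # -> [2, 64, 64]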
+
+
+
+
+ + ☆ iVPT: Improving Task-relevant Information Sharing in Visual Prompt + Tuning by Cross-layer Dynamic Connection + + +
+ Recent progress has shown great potential of visual prompt tuning (VPT) when +adapting pre-trained vision transformers to various downstream tasks. However, +most existing solutions independently optimize prompts at each layer, thereby +neglecting the usage of task-relevant information encoded in prompt tokens +across layers. Additionally, existing prompt structures are prone to +interference from task-irrelevant noise in input images, which can do harm to +the sharing of task-relevant information. In this paper, we propose a novel VPT +approach, \textbf{iVPT}. It innovatively incorporates a cross-layer dynamic +connection (CDC) for input prompt tokens from adjacent layers, enabling +effective sharing of task-relevant information. Furthermore, we design a +dynamic aggregation (DA) module that facilitates selective sharing of +information between layers. The combination of CDC and DA enhances the +flexibility of the attention process within the VPT framework. Building upon +these foundations, iVPT introduces an attentive reinforcement (AR) mechanism, +by automatically identifying salient image tokens, which are further enhanced +by prompt tokens in an additive manner. Extensive experiments on 24 image +classification and semantic segmentation benchmarks clearly demonstrate the +advantage of the proposed iVPT, compared to the state-of-the-art counterparts. + +
+
+
+
+
+ + ☆ SoundingActions: Learning How Actions Sound from Narrated Egocentric + Videos CVPR 2024 + + +
+ We propose a novel self-supervised embedding to learn how actions sound from +narrated in-the-wild egocentric videos. Whereas existing methods rely on +curated data with known audio-visual correspondence, our multimodal +contrastive-consensus coding (MC3) embedding reinforces the associations +between audio, language, and vision when all modality pairs agree, while +diminishing those associations when any one pair does not. We show our approach +can successfully discover how the long tail of human actions sound from +egocentric video, outperforming an array of recent multimodal embedding +techniques on two datasets (Ego4D and EPIC-Sounds) and multiple cross-modal +tasks. + +
+
+ comment: Accepted at CVPR 2024. Project page: + https://vision.cs.utexas.edu/projects/soundingactions +
+
+
+
+
+ + ☆ A secure and private ensemble matcher using multi-vault obfuscated + templates + + +
+ Given the irrevocability of biometric samples and mounting privacy concerns, +biometric template security and secure matching are among the essential +features of any well-designed modern biometric system. In this paper, we +propose an obfuscation method that hides the biometric template information +with just enough chaff. The main idea is to reduce the number of chaff points +to a practical level by creating n sub-templates from the original template and +hiding each sub-template with m chaff points. During verification, s closest +vectors to the biometric query are retrieved from each vault and then combined +to generate hash values that are compared with the stored hash value. We +demonstrate the effectiveness of synthetic facial images, generated by a +Generative Adversarial Network (GAN), as ``random chaff points'' within a +secure-vault authorization system. This approach safeguards user identities +during training and deployment. We tested our protocol using the AT&T, GT, and +LFW face datasets, with the ROC areas under the curve being 0.99, 0.99, and +0.90, respectively. These numbers were close to those of the unprotected +templates, showing that our method does not adversely affect accuracy. + +
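A toy sketch of the vault idea described above may help: the template is split into n sub-templates, each hidden among m chaff vectors, and verification hashes the s closest vault entries to the query. The parameter values, the hashing scheme, and the use of random Gaussian chaff (rather than GAN-generated faces) are simplifying assumptions.

import hashlib
import numpy as np

rng = np.random.default_rng(0)
n, m, s, dim = 4, 50, 1, 32   # sub-templates, chaff per vault, retrieved per vault, template size

def enroll(template):
    subs = np.array_split(template, n)                        # n sub-templates
    vaults = [rng.permutation(np.vstack([sub[None, :], rng.normal(size=(m, sub.shape[0]))]))
              for sub in subs]                                 # hide each sub-template among m chaff points
    stored_hash = hashlib.sha256(np.concatenate(subs).tobytes()).hexdigest()
    return vaults, stored_hash

def verify(query, vaults, stored_hash):
    subs = np.array_split(query, n)
    picked = [vault[np.argsort(np.linalg.norm(vault - sub[None, :], axis=1))[:s]].reshape(-1)
              for sub, vault in zip(subs, vaults)]             # s closest vectors per vault
    return hashlib.sha256(np.concatenate(picked).tobytes()).hexdigest() == stored_hash

template = rng.normal(size=dim)
vaults, h = enroll(template)
print(verify(template, vaults, h))                 # genuine query -> True
print(verify(rng.normal(size=dim), vaults, h))     # impostor query -> almost surely False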
+
+
+
+
+ + ☆ HSViT: Horizontally Scalable Vision Transformer + + +
+ While the Vision Transformer (ViT) architecture gains prominence in computer +vision and attracts significant attention from multimedia communities, its +deficiency in prior knowledge (inductive bias) regarding shift, scale, and +rotational invariance necessitates pre-training on large-scale datasets. +Furthermore, the growing layers and parameters in both ViT and convolutional +neural networks (CNNs) impede their applicability to mobile multimedia +services, primarily owing to the constrained computational resources on edge +devices. To mitigate the aforementioned challenges, this paper introduces a +novel horizontally scalable vision transformer (HSViT). Specifically, a novel +image-level feature embedding allows ViT to better leverage the inductive bias +inherent in the convolutional layers. Based on this, an innovative horizontally +scalable architecture is designed, which reduces the number of layers and +parameters of the models while facilitating collaborative training and +inference of ViT models across multiple nodes. The experimental results show +that, without pre-training on large-scale datasets, HSViT achieves up to 10% +higher top-1 accuracy than state-of-the-art schemes, confirming its superior +preservation of inductive bias. The code is available at +https://github.com/xuchenhao001/HSViT. + +</p>
+
+
+
+
+ + ☆ LGSDF: Continual Global Learning of Signed Distance Fields Aided by + Local Updating + + +
+ Implicit reconstruction of ESDF (Euclidean Signed Distance Field) involves +training a neural network to regress the signed distance from any point to the +nearest obstacle, which has the advantages of lightweight storage and +continuous querying. However, existing algorithms usually rely on conflicting +raw observations as training data, resulting in poor map performance. In this +paper, we propose LGSDF, an ESDF continual Global learning algorithm aided by +Local updating. At the front end, axis-aligned grids are dynamically updated by +pre-processed sensor observations, where incremental fusion alleviates +estimation error caused by limited viewing directions. At the back end, a +randomly initialized implicit ESDF neural network performs continual +self-supervised learning guided by these grids to generate smooth and +continuous maps. The results on multiple scenes show that LGSDF can construct +more accurate ESDF maps and meshes compared with SOTA (State Of The Art) +explicit and implicit mapping algorithms. The source code of LGSDF is publicly +available at https://github.com/BIT-DYN/LGSDF. + +
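The back-end idea of regressing a continuous signed distance field from locally maintained grid values can be pictured with the toy sketch below; the grid here is supervised by an analytic sphere rather than fused sensor observations, and the network size and optimiser settings are arbitrary assumptions, not the LGSDF configuration.

import torch
import torch.nn as nn

# toy "grid": sampled cell centres with signed distances to a sphere of radius 0.5
grid_pts = torch.rand(2048, 3) * 2 - 1
grid_sdf = grid_pts.norm(dim=-1, keepdim=True) - 0.5

sdf_net = nn.Sequential(nn.Linear(3, 128), nn.ReLU(),
                        nn.Linear(128, 128), nn.ReLU(),
                        nn.Linear(128, 1))
opt = torch.optim.Adam(sdf_net.parameters(), lr=1e-3)

for step in range(200):                        # continual self-supervised regression on grid values
    idx = torch.randint(0, grid_pts.shape[0], (256,))
    loss = (sdf_net(grid_pts[idx]) - grid_sdf[idx]).abs().mean()
    opt.zero_grad()
    loss.backward()
    opt.step()

print(sdf_net(torch.zeros(1, 3)).item())       # should approach -0.5 at the sphere centre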
+
+
+
+
+ + ☆ Progressive Alignment with VLM-LLM Feature to Augment Defect + Classification for the ASE Dataset + + +
+ Traditional defect classification approaches face two barriers. +(1) Insufficient training data and unstable data quality: collecting sufficient +defective samples is expensive and time-consuming, leading to dataset variance +that makes recognition and learning difficult. (2) Over-dependence on the visual +modality: when the image pattern and texture are monotonic across all defect +classes in a given dataset, the performance of a conventional AOI system cannot +be guaranteed; likewise, when image quality is compromised by mechanical failures +or defect information is inherently difficult to discern, the performance of deep +models cannot be guaranteed. The main question is how to solve these two problems +when they occur at the same time. A feasible strategy is to exploit additional +features within the dataset and to combine an eminent vision-language model (VLM) +and large language model (LLM) with their remarkable zero-shot capability. In this +work, we first propose the ASE dataset, which includes rich textual descriptions +recorded with each image, for defect classification in which the defect features +are difficult to learn directly. Second, we present VLM-LLM prompting for defect +classification on the proposed ASE dataset to activate extra-modality features +from images and enhance performance. We then design a novel progressive feature +alignment (PFA) block that refines image-text features to alleviate the difficulty +of alignment under the few-shot scenario. Finally, the proposed cross-modality +attention fusion (CMAF) module effectively fuses features from different +modalities. Experimental results demonstrate our method's effectiveness over +several defect classification methods on the ASE dataset. + +</p>
+
+ comment: MULA 2024 +
+
+
+
+
+ + ☆ Adaptive Learning for Multi-view Stereo Reconstruction + + +
+ Deep learning has recently demonstrated its excellent performance on the task +of multi-view stereo (MVS). However, loss functions applied for deep MVS are +rarely studied. In this paper, we first analyze existing loss functions' +properties for deep depth-based MVS approaches. Regression-based loss leads to +inaccurate continuous results by computing mathematical expectation, while +classification-based loss outputs discretized depth values. To this end, we +then propose a novel loss function, named adaptive Wasserstein loss, which is +able to narrow down the difference between the true and predicted probability +distributions of depth. Besides, a simple but effective offset module is +introduced to better achieve sub-pixel prediction accuracy. Extensive +experiments on different benchmarks, including DTU, Tanks and Temples and +BlendedMVS, show that the proposed method with the adaptive Wasserstein loss +and the offset module achieves state-of-the-art performance. + +</p>
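For intuition, a plain (non-adaptive) Wasserstein-1 distance between a predicted depth distribution and a one-hot ground-truth distribution over ordered depth hypotheses can be written as the L1 difference of their CDFs; the sketch below is this generic form under that assumption, not the paper's adaptive variant, and the shapes are placeholders.

import torch

def wasserstein1d_loss(pred_prob, target_prob):
    # Both inputs: [..., D] probabilities over D ordered depth hypotheses.
    # For 1D distributions, Wasserstein-1 equals the L1 distance between the CDFs.
    cdf_pred = torch.cumsum(pred_prob, dim=-1)
    cdf_target = torch.cumsum(target_prob, dim=-1)
    return (cdf_pred - cdf_target).abs().sum(dim=-1).mean()

B, D = 8, 48                                          # pixels, depth hypotheses
logits = torch.randn(B, D, requires_grad=True)
pred = torch.softmax(logits, dim=-1)
target = torch.zeros(B, D).scatter_(1, torch.randint(0, D, (B, 1)), 1.0)  # one-hot ground-truth bins
loss = wasserstein1d_loss(pred, target)
loss.backward()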
+
+
+
+
+ + ☆ GloSoFarID: Global multispectral dataset for Solar Farm IDentification + in satellite imagery + + +
+ Solar Photovoltaic (PV) technology is increasingly recognized as a pivotal +solution in the global pursuit of clean and renewable energy. This technology +addresses the urgent need for sustainable energy alternatives by converting +solar power into electricity without greenhouse gas emissions. It not only +curtails global carbon emissions but also reduces reliance on finite, +non-renewable energy sources. In this context, monitoring solar panel farms +becomes essential for understanding and facilitating the worldwide shift toward +clean energy. This study contributes to this effort by developing the first +comprehensive global dataset of multispectral satellite imagery of solar panel +farms. This dataset is intended to form the basis for training robust machine +learning models, which can accurately map and analyze the expansion and +distribution of solar panel farms globally. The insights gained from this +endeavor will be instrumental in guiding informed decision-making for a +sustainable energy future. https://github.com/yzyly1992/GloSoFarID + +
+
+
+
+
+ + ☆ QMix: Quality-aware Learning with Mixed Noise for Robust Retinal Disease + Diagnosis + + +
+ Due to the complexity of medical image acquisition and the difficulty of +annotation, medical image datasets inevitably contain noise. Noisy data with +wrong labels affects the robustness and generalization ability of deep neural +networks. Previous noise learning methods mainly considered noise arising from +images being mislabeled, i.e. label noise, assuming that all mislabeled images +are of high image quality. However, medical images are prone to suffering +extreme quality issues, i.e. data noise, where discriminative visual features +are missing for disease diagnosis. In this paper, we propose a noise learning +framework, termed QMix, that learns a robust disease diagnosis model under +mixed noise. QMix alternates between sample separation and quality-aware +semi-supervised training in each training epoch. In the sample separation phase, +we design a joint uncertainty-loss criterion to effectively separate (1) +correctly labeled images; (2) mislabeled images with high quality and (3) +mislabeled images with low quality. In the semi-supervised training phase, we +train a disease diagnosis model to learn robust feature representation from the +separated samples. Specifically, we devise a sample-reweighing loss to mitigate +the effect of mislabeled images with low quality during training. Meanwhile, a +contrastive enhancement loss is proposed to further distinguish mislabeled +images with low quality from correctly labeled images. QMix achieved +state-of-the-art disease diagnosis performance on five public retinal image +datasets and exhibited substantial improvement in robustness against mixed +noise. + +</p>
+
+
+
+
+ + ☆ Semantic Flow: Learning Semantic Field of Dynamic Scenes from Monocular + Videos ICLR 2024 + + +
+ In this work, we pioneer Semantic Flow, a neural semantic representation of +dynamic scenes from monocular videos. In contrast to previous NeRF methods that +reconstruct dynamic scenes from the colors and volume densities of individual +points, Semantic Flow learns semantics from continuous flows that contain rich +3D motion information. As there is a 2D-to-3D ambiguity problem in the viewing +direction when extracting 3D flow features from 2D video frames, we consider +the volume densities as opacity priors that describe the contributions of flow +features to the semantics on the frames. More specifically, we first learn a +flow network to predict flows in the dynamic scene, and propose a flow feature +aggregation module to extract flow features from video frames. Then, we propose +a flow attention module to extract motion information from flow features, which +is followed by a semantic network to output semantic logits of flows. We +integrate the logits with volume densities in the viewing direction to +supervise the flow features with semantic labels on video frames. Experimental +results show that our model is able to learn from multiple dynamic scenes and +supports a series of new tasks such as instance-level scene editing, semantic +completions, dynamic scene tracking and semantic adaptation on novel scenes. +Codes are available at https://github.com/tianfr/Semantic-Flow/. + +</p>
+
+ comment: Accepted by ICLR 2024, Codes are available at + https://github.com/tianfr/Semantic-Flow/ +
+
+
+
+
+ + ☆ UniMix: Towards Domain Adaptive and Generalizable LiDAR Semantic + Segmentation in Adverse Weather CVPR 2024 + + +
+ LiDAR semantic segmentation (LSS) is a critical task in autonomous driving +and has achieved promising progress. However, prior LSS methods are +conventionally investigated and evaluated on datasets within the same domain in +clear weather. The robustness of LSS models in unseen scenes and all weather +conditions is crucial for ensuring safety and reliability in real applications. +To this end, we propose UniMix, a universal method that enhances the +adaptability and generalizability of LSS models. UniMix first leverages +physically valid adverse weather simulation to construct a Bridge Domain, which +serves to bridge the domain gap between the clear weather scenes and the +adverse weather scenes. Then, a Universal Mixing operator is defined regarding +spatial, intensity, and semantic distributions to create the intermediate +domain with mixed samples from given domains. Integrating the proposed two +techniques into a teacher-student framework, UniMix efficiently mitigates the +domain gap and enables LSS models to learn weather-robust and domain-invariant +representations. We devote UniMix to two main setups: 1) unsupervised domain +adaptation, adapting the model from the clear weather source domain to the +adverse weather target domain; 2) domain generalization, learning a model that +generalizes well to unseen scenes in adverse weather. Extensive experiments +validate the effectiveness of UniMix across different tasks and datasets, all +achieving superior performance over state-of-the-art methods. The code will be +released. + +</p>
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Enhancing Clinical Efficiency through LLM: Discharge Note Generation for + Cardiac Patients + + +
+ Medical documentation, including discharge notes, is crucial for ensuring +patient care quality, continuity, and effective medical communication. However, +the manual creation of these documents is not only time-consuming but also +prone to inconsistencies and potential errors. The automation of this +documentation process using artificial intelligence (AI) represents a promising +area of innovation in healthcare. This study directly addresses the +inefficiencies and inaccuracies in creating discharge notes manually, +particularly for cardiac patients, by employing AI techniques, specifically +large language models (LLMs). Utilizing a substantial dataset from a cardiology +center, encompassing wide-ranging medical records and physician assessments, +our research evaluates the capability of LLMs to enhance the documentation +process. Among the various models assessed, Mistral-7B distinguished itself by +accurately generating discharge notes that significantly improve both +documentation efficiency and the continuity of care for patients. These notes +underwent rigorous qualitative evaluation by medical experts, receiving high +marks for their clinical relevance, completeness, readability, and contribution +to informed decision-making and care planning. Coupled with quantitative +analyses, these results confirm Mistral-7B's efficacy in distilling complex +medical information into concise, coherent summaries. Overall, our findings +illuminate the considerable promise of specialized LLMs, such as Mistral-7B, in +refining healthcare documentation workflows and advancing patient care. This +study lays the groundwork for further integrating advanced AI technologies in +healthcare, demonstrating their potential to revolutionize patient +documentation and support better care outcomes. + +</p>
+
+ comment: 10 pages, 1 figure, 3 tables, conference +
+
+
+
+
+ + ☆ Better Monocular 3D Detectors with LiDAR from the Past ICRA 2022 + + +
+ Accurate 3D object detection is crucial to autonomous driving. Though +LiDAR-based detectors have achieved impressive performance, the high cost of +LiDAR sensors precludes their widespread adoption in affordable vehicles. +Camera-based detectors are cheaper alternatives but often suffer inferior +performance compared to their LiDAR-based counterparts due to inherent depth +ambiguities in images. In this work, we seek to improve monocular 3D detectors +by leveraging unlabeled historical LiDAR data. Specifically, at inference time, +we assume that the camera-based detectors have access to multiple unlabeled +LiDAR scans from past traversals at locations of interest (potentially from +other high-end vehicles equipped with LiDAR sensors). Under this setup, we +propose a novel, simple, and end-to-end trainable framework, termed +AsyncDepth, to effectively extract relevant features from asynchronous LiDAR +traversals of the same location for monocular 3D detectors. We show consistent +and significant performance gain (up to 9 AP) across multiple state-of-the-art +models and datasets with a negligible additional latency of 9.66 ms and a small +storage cost. + +</p>
+
+ comment: Accepted by ICRA 2022. The code can be found at + https://github.com/YurongYou/AsyncDepth +
+
+
+
+
+ + ☆ Self-Supervised Multi-Object Tracking with Path Consistency CVPR 2024 + + +
+ In this paper, we propose a novel concept of path consistency to learn robust +object matching without using manual object identity supervision. Our key idea +is that, to track an object through frames, we can obtain multiple different +association results from a model by varying the frames it can observe, i.e., +skipping frames in observation. As the differences in observations do not alter +the identities of objects, the obtained association results should be +consistent. Based on this rationale, we generate multiple observation paths, +each specifying a different set of frames to be skipped, and formulate the Path +Consistency Loss that enforces that the association results are consistent across +different observation paths. We use the proposed loss to train our object +matching model with only self-supervision. By extensive experiments on three +tracking datasets (MOT17, PersonPath22, KITTI), we demonstrate that our method +outperforms existing unsupervised methods with consistent margins on various +evaluation metrics, and even achieves performance close to supervised methods. + +</p>
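One simple way to realise a consistency objective over several observation paths is to penalise the divergence of each path's association distribution from their consensus, as in the hypothetical sketch below; the actual Path Consistency Loss in the paper may be formulated differently, and the tensor shapes are placeholders.

import torch
import torch.nn.functional as F

def path_consistency_loss(assoc_logits_per_path):
    # Each entry: [N, M] association logits between N objects in the start frame and
    # M objects in the end frame, computed along one observation path (one frame-skipping pattern).
    probs = [F.softmax(logits, dim=-1) for logits in assoc_logits_per_path]
    consensus = torch.stack(probs).mean(dim=0).detach()
    # pull every path's association distribution toward the shared consensus
    return sum(F.kl_div(p.log(), consensus, reduction="batchmean") for p in probs) / len(probs)

paths = [torch.randn(5, 6, requires_grad=True) for _ in range(3)]   # three observation paths
loss = path_consistency_loss(paths)
loss.backward()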
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ Image-based Agarwood Resinous Area Segmentation using Deep Learning + + +
+ The manual extraction method of Agarwood resinous compound is laborious work, +requires skilled workers, and is subject to human errors. Commercial Agarwood +industries have been actively exploring using Computer Numerical Control (CNC) +machines to replace human effort for this particular task. The CNC machine +accepts a G-code script produced from a binary image in which the wood region +that needs to be chiselled off is marked with (0, 0, 0) as its RGB value. +Rather than requiring a human expert to perform the region marking, we propose +using a Deep learning image segmentation method instead. Our setup involves a +camera that captures the cross-section image and then passes the image file to +a computer. The computer performs the automated image segmentation and feeds +the CNC machine with a G-code script. In this article, we report the initial +segmentation results achieved using a state-of-the-art Deep learning +segmentation method and discuss potential improvements to refine the +segmentation accuracy. + +
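To illustrate the mask-to-machine step mentioned above, here is a naive sketch that rasterises a binary segmentation mask into simple G-code moves, row by row; the pixel pitch, depths, feed rate, and single-pass raster strategy are placeholder assumptions rather than the toolpath generation actually used with the CNC machine.

import numpy as np

def mask_to_gcode(mask, pixel_mm=0.5, cut_z=-1.0, safe_z=2.0, feed=300):
    # mask: 2D boolean array, True where wood should be chiselled off
    # (the region marked (0, 0, 0) in the segmentation output).
    lines = ["G21 ; units in millimetres", "G90 ; absolute positioning"]
    for r, row in enumerate(mask):
        cols = np.flatnonzero(row)
        if cols.size == 0:
            continue
        # split the marked pixels of this row into contiguous runs
        runs = np.split(cols, np.where(np.diff(cols) > 1)[0] + 1)
        for run in runs:
            x0, x1, y = run[0] * pixel_mm, run[-1] * pixel_mm, r * pixel_mm
            lines += [f"G0 Z{safe_z} ; lift tool",
                      f"G0 X{x0:.2f} Y{y:.2f}",
                      f"G1 Z{cut_z} F{feed} ; plunge",
                      f"G1 X{x1:.2f} Y{y:.2f} F{feed} ; cut along the run"]
    lines.append(f"G0 Z{safe_z}")
    return "\n".join(lines)

toy_mask = np.zeros((4, 8), dtype=bool)
toy_mask[1, 2:6] = True
print(mask_to_gcode(toy_mask))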
+
+ comment: 15 pages, 6 figures, 3 tables +
+
+
+
+
+ + ☆ Improving Deep Learning Predictions with Simulated Images, and Vice + Versa + + +
+ Artificial neural networks are often used to identify features of crop +plants. However, training their models requires many annotated images, which +can be expensive and time-consuming to acquire. Procedural models of plants, +such as those developed with Lindenmayer-systems (L-systems) can be created to +produce visually realistic simulations, and hence images of plant simulations, +where annotations are implicitly known. These synthetic images can either +augment or completely replace real images in training neural networks for +phenotyping tasks. In this paper, we systematically vary amounts of real and +synthetic images used for training in both maize and canola to better +understand situations where synthetic images generated from L-systems can help +prediction on real images. This work also explores the degree to which realism +in the synthetic images improves prediction. Furthermore, we see how neural +network predictions can be used to help calibrate L-systems themselves, +creating a feedback loop. + +
+
+
+
+
+ + ☆ Class Similarity Transition: Decoupling Class Similarities and Imbalance + from Generalized Few-shot Segmentation + + +
+ In Generalized Few-shot Segmentation (GFSS), a model is trained with a large +corpus of base class samples and then adapted on limited samples of novel +classes. This paper focuses on the relevance between base and novel classes, +and improves GFSS in two aspects: 1) mining the similarity between base and +novel classes to promote the learning of novel classes, and 2) mitigating the +class imbalance issue caused by the volume difference between the support set +and the training set. Specifically, we first propose a similarity transition +matrix to guide the learning of novel classes with base class knowledge. Then, +we apply the Label-Distribution-Aware Margin (LDAM) loss and Transductive +Inference to the GFSS task to address the problem of class imbalance as well as +overfitting the support set. In addition, by extending the probability +transition matrix, the proposed method can mitigate the catastrophic forgetting +of base classes when learning novel classes. With a simple training phase, our +proposed method can be applied to any segmentation network trained on base +classes. We validated our methods on the adapted version of OpenEarthMap. +Compared to existing GFSS baselines, our method outperforms them all by 3% to 7% +and ranks second in the OpenEarthMap Land Cover Mapping Few-Shot Challenge at +the completion of this paper. Code: +https://github.com/earth-insights/ClassTrans + +</p>
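The transition-matrix idea can be illustrated with a tiny, hypothetical example: a row-normalised base-to-novel similarity matrix turns base-class probabilities into a soft prior over novel classes that can guide their learning. The class counts, the random matrix, and the normalisation choice are assumptions for illustration, not the paper's construction.

import torch

n_base, n_novel, n_pix = 8, 3, 4
base_logits = torch.randn(n_pix, n_base)                   # per-pixel base-class predictions
T = torch.softmax(torch.randn(n_base, n_novel), dim=1)     # each base class spreads its mass over related novel classes
p_base = torch.softmax(base_logits, dim=-1)
novel_prior = p_base @ T                                   # [n_pix, n_novel], rows sum to 1: soft guidance for novel classes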
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Energy-Calibrated VAE with Test Time Free Lunch + + +
+ In this paper, we propose a novel generative model that utilizes a +conditional Energy-Based Model (EBM) for enhancing Variational Autoencoder +(VAE), termed Energy-Calibrated VAE (EC-VAE). Specifically, VAEs often suffer +from blurry generated samples due to the lack of a tailored training on the +samples generated in the generative direction. On the other hand, EBMs can +generate high-quality samples but require expensive Markov Chain Monte Carlo +(MCMC) sampling. To address these issues, we introduce a conditional EBM for +calibrating the generative direction of VAE during training, without requiring +it for the generation at test time. In particular, we train EC-VAE upon both +the input data and the calibrated samples with adaptive weight to enhance +efficacy while avoiding MCMC sampling at test time. Furthermore, we extend the +calibration idea of EC-VAE to variational learning and normalizing flows, and +apply EC-VAE to an additional application of zero-shot image restoration via +neural transport prior and range-null theory. We evaluate the proposed method +with two applications, including image generation and zero-shot image +restoration, and the experimental results show that our method achieves +competitive performance over single-step non-adversarial generation. Our code +is available at https://github.com/DJ-LYH/EC-VAE. + +
+
+ comment: Revision. Code is available at https://github.com/DJ-LYH/EC-VAE +
+
+
+
+
+ + ♻ ☆ Deep Internal Learning: Deep Learning from a Single Input + + +
+ Deep learning, in general, focuses on training a neural network from large +labeled datasets. Yet, in many cases there is value in training a network just +from the input at hand. This is particularly relevant in many signal and image +processing problems where training data is scarce and diversity is large on the +one hand, and on the other, there is a lot of structure in the data that can be +exploited. Using this information is the key to deep internal-learning +strategies, which may involve training a network from scratch using a single +input or adapting an already trained network to a provided input example at +inference time. This survey paper aims at covering deep internal-learning +techniques that have been proposed in the past few years for these two +important directions. While our main focus will be on image processing +problems, most of the approaches that we survey are derived for general signals +(vectors with recurring patterns that can be distinguished from noise) and are +therefore applicable to other modalities. + +
+
+ comment: Accepted to IEEE Signal Processing Magazine +
+
+
+
+
+ + ♻ ☆ FreGS: 3D Gaussian Splatting with Progressive Frequency Regularization CVPR 2024 + + +
+ 3D Gaussian splatting has achieved very impressive performance in real-time +novel view synthesis. However, it often suffers from over-reconstruction during +Gaussian densification where high-variance image regions are covered by a few +large Gaussians only, leading to blur and artifacts in the rendered images. We +design a progressive frequency regularization (FreGS) technique to tackle the +over-reconstruction issue within the frequency space. Specifically, FreGS +performs coarse-to-fine Gaussian densification by exploiting low-to-high +frequency components that can be easily extracted with low-pass and high-pass +filters in the Fourier space. By minimizing the discrepancy between the +frequency spectrum of the rendered image and the corresponding ground truth, it +achieves high-quality Gaussian densification and alleviates the +over-reconstruction of Gaussian splatting effectively. Experiments over +multiple widely adopted benchmarks (e.g., Mip-NeRF360, Tanks-and-Temples and +Deep Blending) show that FreGS achieves superior novel view synthesis and +outperforms the state-of-the-art consistently. + +
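A rough sketch of a frequency-space discrepancy term in the spirit described above: compare the amplitude spectra of the rendered and ground-truth images under a centred low-pass mask whose radius can be annealed from low to high frequencies. The mask shape, radius schedule, and use of the amplitude spectrum only are assumptions, not the exact FreGS objective.

import torch

def frequency_loss(rendered, target, radius=0.25):
    # Compare amplitude spectra inside a centred circular low-pass region;
    # growing `radius` during training moves supervision from low to high frequencies.
    fr = torch.fft.fftshift(torch.fft.fft2(rendered), dim=(-2, -1))
    ft = torch.fft.fftshift(torch.fft.fft2(target), dim=(-2, -1))
    H, W = rendered.shape[-2:]
    yy, xx = torch.meshgrid(torch.linspace(-0.5, 0.5, H),
                            torch.linspace(-0.5, 0.5, W), indexing="ij")
    mask = ((yy ** 2 + xx ** 2).sqrt() <= radius).to(rendered.dtype)
    return ((fr.abs() - ft.abs()) * mask).abs().mean()

rendered = torch.rand(1, 3, 64, 64, requires_grad=True)
target = torch.rand(1, 3, 64, 64)
loss = frequency_loss(rendered, target, radius=0.1)   # small radius early in training
loss.backward()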
+
+ comment: Accepted by CVPR 2024. Project website: + https://rogeraigc.github.io/FreGS-Page/ +
+
+
+
+
+ + ♻ ☆ WEEP: A method for spatial interpretation of weakly supervised CNN + models in computational pathology + + +
+ Deep learning enables the modelling of high-resolution histopathology +whole-slide images (WSI). Weakly supervised learning of tile-level data is +typically applied for tasks where labels only exist on the patient or WSI level +(e.g. patient outcomes or histological grading). In this context, there is a +need for improved spatial interpretability of predictions from such models. We +propose a novel method, Wsi rEgion sElection aPproach (WEEP), for model +interpretation. It provides a principled yet straightforward way to establish +the spatial area of WSI required for assigning a particular prediction label. +We demonstrate WEEP on a binary classification task in the area of breast +cancer computational pathology. WEEP is easy to implement, is directly +connected to the model-based decision process, and offers information relevant +to both research and diagnostic applications. + +
+
+
+
+
+ + ♻ ☆ Robust Human Motion Forecasting using Transformer-based Model IROS2022 + + +
+ Comprehending human motion is a fundamental challenge for developing +Human-Robot Collaborative applications. Computer vision researchers have +addressed this field by focusing only on reducing prediction error, without +taking into account the requirements for implementation on robots. In this +paper, we propose a new Transformer-based model that simultaneously deals with +real-time 3D human motion forecasting in the short and long term. Our 2-Channel +Transformer (2CH-TR) is able to efficiently +exploit the spatio-temporal information of a shortly observed sequence (400ms) +and achieves competitive accuracy against the current state-of-the-art. +2CH-TR stands out for the efficient performance of the Transformer, being +lighter and faster than its competitors. In addition, our model is tested in +conditions where the human motion is severely occluded, demonstrating its +robustness in reconstructing and predicting 3D human motion in a highly noisy +environment. Our experiment results show that the proposed 2CH-TR outperforms +the ST-Transformer, which is another state-of-the-art model based on the +Transformer, in terms of reconstruction and prediction under the same +conditions of input prefix. Our model reduces the mean squared error of the +ST-Transformer by 8.89% in short-term prediction and by 2.57% in long-term +prediction on the Human3.6M dataset with a 400ms input prefix. Webpage: +https://evm7.github.io/2CHTR-page/ + +</p>
+
+ comment: Accepted to IROS2022. Webpage: https://evm7.github.io/2CHTR-page/ +
+
+
+
+
+ + ♻ ☆ A Unified Masked Autoencoder with Patchified Skeletons for Motion + Synthesis + + +
+ The synthesis of human motion has traditionally been addressed through +task-dependent models that focus on specific challenges, such as predicting +future motions or filling in intermediate poses conditioned on known key-poses. +In this paper, we present a novel task-independent model called UNIMASK-M, +which can effectively address these challenges using a unified architecture. +Our model obtains comparable or better performance than the state-of-the-art in +each field. Inspired by Vision Transformers (ViTs), our UNIMASK-M model +decomposes a human pose into body parts to leverage the spatio-temporal +relationships existing in human motion. Moreover, we reformulate various +pose-conditioned motion synthesis tasks as a reconstruction problem with +different masking patterns given as input. By explicitly informing our model +about the masked joints, our UNIMASK-M becomes more robust to occlusions. +Experimental results show that our model successfully forecasts human motion on +the Human3.6M dataset. Moreover, it achieves state-of-the-art results in motion +inbetweening on the LaFAN1 dataset, particularly in long transition periods. +More information can be found on the project website +https://evm7.github.io/UNIMASKM-page/ + +
+
+ comment: Accepted to AAAI2024. Webpage: https://evm7.github.io/UNIMASKM-page/ +
+
+
+
+
+ + ♻ ☆ HOI4ABOT: Human-Object Interaction Anticipation for Human Intention + Reading Collaborative roBOTs + + +
+ Robots are becoming increasingly integrated into our lives, assisting us in +various tasks. To ensure effective collaboration between humans and robots, it +is essential that they understand our intentions and anticipate our actions. In +this paper, we propose a Human-Object Interaction (HOI) anticipation framework +for collaborative robots. We propose an efficient and robust transformer-based +model to detect and anticipate HOIs from videos. This enhanced anticipation +empowers robots to proactively assist humans, resulting in more efficient and +intuitive collaborations. Our model outperforms state-of-the-art results in HOI +detection and anticipation in VidHOI dataset with an increase of 1.76% and +1.04% in mAP respectively while being 15.4 times faster. We showcase the +effectiveness of our approach through experimental results in a real robot, +demonstrating that the robot's ability to anticipate HOIs is key for better +Human-Robot Interaction. More information can be found on our project webpage: +https://evm7.github.io/HOI4ABOT_page/ + +
+
+ comment: Proceedings in Conference on Robot Learning 2023. Webpage: + https://evm7.github.io/HOI4ABOT_page/ +
+
+
+
+
+ + ♻ ☆ Robot Interaction Behavior Generation based on Social Motion Forecasting + for Human-Robot Interaction ICRA 2024 + + +
+ Integrating robots into populated environments is a complex challenge that +requires an understanding of human social dynamics. In this work, we propose to +model social motion forecasting in a shared human-robot representation space, +which facilitates us to synthesize robot motions that interact with humans in +social scenarios despite not observing any robot in the motion training. We +develop a transformer-based architecture called ECHO, which operates in the +aforementioned shared space to predict the future motions of the agents +encountered in social scenarios. Contrary to prior works, we reformulate the +social motion problem as the refinement of the predicted individual motions +based on the surrounding agents, which facilitates the training while allowing +for single-motion forecasting when only one human is in the scene. We evaluate +our model in multi-person and human-robot motion forecasting tasks and obtain +state-of-the-art performance by a large margin while being efficient and +performing in real-time. Additionally, our qualitative results showcase the +effectiveness of our approach in generating human-robot interaction behaviors +that can be controlled via text commands. Webpage: https://evm7.github.io/ECHO/ + +
+
+ comment: Accepted at ICRA 2024. Webpage: https://evm7.github.io/ECHO/ +
+
+
+
+
+ + ♻ ☆ DRCT: Saving Image Super-resolution away from Information Bottleneck + + +
+ In recent years, Vision Transformer-based applications to low-level vision +tasks have achieved widespread success. Unlike CNN-based models, Transformers +are more adept at capturing long-range dependencies, enabling the +reconstruction of images utilizing information from non-local areas. In the +domain of super-resolution, Swin-transformer-based approaches have become +mainstream due to their capacity to capture global spatial information and +their shifting-window attention mechanism that facilitates the interchange of +information between different windows. Many researchers have enhanced image +quality and network efficiency by expanding the receptive field or designing +complex networks, yielding commendable results. However, we observed that +spatial information tends to diminish during the forward propagation process +as depth increases, consequently limiting the model's potential. To address this, +we propose the Dense-residual-connected Transformer (DRCT), aimed at mitigating +the loss of spatial information through dense-residual connections between layers, +thereby unleashing the model's potential and enhancing performance. Experiment results +indicate that our approach is not only straightforward but also achieves +remarkable efficiency, surpassing state-of-the-art methods and performing +commendably at NTIRE2024. + +</p>
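The dense-residual connection pattern can be sketched as below: every convolution sees the concatenation of all earlier feature maps, and the block output is added back to its input. Channel counts, growth rate, and the residual scaling are illustrative assumptions rather than the DRCT configuration.

import torch
import torch.nn as nn

class DenseResidualBlock(nn.Module):
    # Dense connections (each conv receives all earlier features) wrapped in a residual.
    def __init__(self, channels=64, growth=32, n_layers=4, res_scale=0.2):
        super().__init__()
        self.convs = nn.ModuleList(
            nn.Conv2d(channels + i * growth, growth, 3, padding=1) for i in range(n_layers))
        self.fuse = nn.Conv2d(channels + n_layers * growth, channels, 1)
        self.res_scale = res_scale

    def forward(self, x):
        feats = [x]
        for conv in self.convs:
            feats.append(torch.relu(conv(torch.cat(feats, dim=1))))
        return x + self.res_scale * self.fuse(torch.cat(feats, dim=1))

out = DenseResidualBlock()(torch.randn(1, 64, 32, 32))   # -> [1, 64, 32, 32]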
+
+ comment: NTIRE 2024 Image Super-resolution (x4) +
+
+
+
+
+ + ♻ ☆ MESA: Matching Everything by Segmenting Anything CVPR24 + + +
+ Feature matching is a crucial task in the field of computer vision, which +involves finding correspondences between images. Previous studies achieve +remarkable performance using learning-based feature comparison. However, the +pervasive presence of matching redundancy between images gives rise to +unnecessary and error-prone computations in these methods, imposing limitations +on their accuracy. To address this issue, we propose MESA, a novel approach to +establish precise area (or region) matches for efficient matching redundancy +reduction. MESA first leverages the advanced image understanding capability of +SAM, a state-of-the-art foundation model for image segmentation, to obtain +image areas with implicit semantic. Then, a multi-relational graph is proposed +to model the spatial structure of these areas and construct their scale +hierarchy. Based on graphical models derived from the graph, the area matching +is reformulated as an energy minimization task and effectively resolved. +Extensive experiments demonstrate that MESA yields substantial precision +improvement for multiple point matchers in indoor and outdoor downstream tasks, +e.g. +13.61% for DKM in indoor pose estimation. + +
+
+ comment: CVPR24 +
+
+
+
+
+ + ♻ ☆ DPHMs: Diffusion Parametric Head Models for Depth-based Tracking CVPR 2024 + + +
+ We introduce Diffusion Parametric Head Models (DPHMs), a generative model +that enables robust volumetric head reconstruction and tracking from monocular +depth sequences. While recent volumetric head models, such as NPHMs, can now +excel in representing high-fidelity head geometries, tracking and +reconstructing heads from real-world single-view depth sequences remains very +challenging, as the fitting to partial and noisy observations is +underconstrained. To tackle these challenges, we propose a latent +diffusion-based prior to regularize volumetric head reconstruction and +tracking. This prior-based regularizer effectively constrains the identity and +expression codes to lie on the underlying latent manifold which represents +plausible head shapes. To evaluate the effectiveness of the diffusion-based +prior, we collect a dataset of monocular Kinect sequences consisting of various +complex facial expression motions and rapid transitions. We compare our method +to state-of-the-art tracking methods and demonstrate improved head identity +reconstruction as well as robust expression tracking. + +
+
+ comment: CVPR 2024; homepage: https://tangjiapeng.github.io/projects/DPHMs/ +
+
+
+
+
+ + ♻ ☆ SepVAE: a contrastive VAE to separate pathological patterns from healthy + ones ICML + + +
+ Contrastive Analysis VAEs (CA-VAEs) are a family of variational autoencoders (VAEs) that aim to separate the common factors of variation between a background dataset (BG) (i.e., healthy subjects) and a target dataset (TG) (i.e., patients) from the factors that only exist in the target dataset. To do so, these methods separate the latent space into a set of salient features (i.e., specific to the target dataset) and a set of common features (i.e., present in both datasets). Currently, existing models fail to effectively prevent the sharing of information between the latent spaces and to capture all salient factors of variation. To this end, we introduce two crucial regularization losses: a disentangling term between common and salient representations and a classification term between background and target samples in the salient space. We show better performance than previous CA-VAE methods on three medical applications and a natural images dataset (CelebA). Code and datasets are available on GitHub: https://github.com/neurospin-projects/2023_rlouiset_sepvae. + +
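A rough sketch of how the two extra regularizers could be wired up is given below. It is not the released SepVAE code: the disentangling term is stood in for by a cross-covariance penalty between the two latent codes, the classifier is an arbitrary module supplied by the caller, and all shapes are assumed.

```python
# Minimal sketch (assumed notation) of the two extra SepVAE-style regularizers:
# an independence penalty between common and salient codes, and a
# background-vs-target classifier applied only to the salient codes.
import torch
import torch.nn.functional as F

def sepvae_regularizers(z_common, z_salient, is_target, classifier):
    # Disentangling term: discourage linear correlation between the two spaces.
    zc = z_common - z_common.mean(0)
    zs = z_salient - z_salient.mean(0)
    cross_cov = (zc.T @ zs) / (len(zc) - 1)
    disentangle_loss = (cross_cov ** 2).mean()

    # Classification term: salient codes should separate target from background.
    logits = classifier(z_salient).squeeze(-1)
    cls_loss = F.binary_cross_entropy_with_logits(logits, is_target.float())
    return disentangle_loss, cls_loss

if __name__ == "__main__":
    clf = torch.nn.Linear(8, 1)
    d, c = sepvae_regularizers(torch.randn(16, 4), torch.randn(16, 8),
                               torch.randint(0, 2, (16,)), clf)
    print(d.item(), c.item())
```

Both terms would be added, with suitable weights, to the usual VAE reconstruction and KL losses.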
+
+ comment: Workshop on Interpretable ML in Healthcare at International + Conference on Machine Learning (ICML), Honolulu, Hawaii, USA. 2023 +
+
+
+
+
+ + ♻ ☆ SiT-MLP: A Simple MLP with Point-wise Topology Feature Learning for + Skeleton-based Action Recognition + + +
+ Graph convolution networks (GCNs) have achieved remarkable performance in skeleton-based action recognition. However, previous GCN-based methods rely excessively on elaborate human priors and construct complex feature aggregation mechanisms, which limits the generalizability and effectiveness of the networks. To solve these problems, we propose a novel Spatial Topology Gating Unit (STGU), an MLP-based variant without extra priors, to capture the co-occurrence topology features that encode the spatial dependency across all joints. In STGU, to learn the point-wise topology features, a new gate-based feature interaction mechanism is introduced to activate features point-to-point using an attention map generated from the input sample. Based on the STGU, we propose the first MLP-based model, SiT-MLP, for skeleton-based action recognition in this work. Compared with previous methods on three large-scale datasets, SiT-MLP achieves competitive performance. In addition, SiT-MLP significantly reduces the number of parameters while maintaining favorable results. The code will be available at https://github.com/BUPTSJZhang/SiT?MLP. + +
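The gating idea can be illustrated with a short sketch: a data-dependent topology map is produced from the input itself and then used to gate joint features point-to-point. This is my own illustration under assumed shapes and layer choices, not the official SiT-MLP code.

```python
# Rough sketch of a gate-based spatial topology unit: an attention map over
# joints is generated from the input sample and used to gate features
# point-to-point, with no hand-crafted skeleton graph.
import torch
import torch.nn as nn

class SpatialGateUnit(nn.Module):
    def __init__(self, channels: int, num_joints: int):
        super().__init__()
        self.to_map = nn.Linear(channels, num_joints)  # sample-specific topology map
        self.proj = nn.Linear(channels, channels)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, joints, channels)
        topo = torch.softmax(self.to_map(x), dim=-1)      # (B, J, J), data-dependent
        gathered = torch.einsum("bjk,bkc->bjc", topo, x)  # aggregate across joints
        return x * torch.sigmoid(self.proj(gathered))     # point-wise gating

if __name__ == "__main__":
    out = SpatialGateUnit(channels=64, num_joints=25)(torch.randn(2, 25, 64))
    print(out.shape)  # torch.Size([2, 25, 64])
```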
+
+ comment: Accepted by IEEE TCSVT 2024 +
+
+
+
+
+ + ♻ ☆ RTMO: Towards High-Performance One-Stage Real-Time Multi-Person Pose + Estimation CVPR 2024 + + +
+ Real-time multi-person pose estimation presents significant challenges in +balancing speed and precision. While two-stage top-down methods slow down as +the number of people in the image increases, existing one-stage methods often +fail to simultaneously deliver high accuracy and real-time performance. This +paper introduces RTMO, a one-stage pose estimation framework that seamlessly +integrates coordinate classification by representing keypoints using dual 1-D +heatmaps within the YOLO architecture, achieving accuracy comparable to +top-down methods while maintaining high speed. We propose a dynamic coordinate +classifier and a tailored loss function for heatmap learning, specifically +designed to address the incompatibilities between coordinate classification and +dense prediction models. RTMO outperforms state-of-the-art one-stage pose +estimators, achieving 1.1% higher AP on COCO while operating about 9 times +faster with the same backbone. Our largest model, RTMO-l, attains 74.8% AP on +COCO val2017 and 141 FPS on a single V100 GPU, demonstrating its efficiency and +accuracy. The code and models are available at +https://github.com/open-mmlab/mmpose/tree/main/projects/rtmo. + +
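As a concrete illustration of coordinate classification with dual 1-D heatmaps, the toy sketch below encodes a keypoint as separate x- and y-axis heatmaps and decodes the coordinate back as the expectation over each axis. It assumes Gaussian targets and is my own illustration, not RTMO's implementation.

```python
# Toy sketch: represent a keypoint with two 1-D heatmaps over the x and y axes,
# then recover the coordinate via the expectation along each axis.
import torch

def encode_dual_1d(x, y, width, height, sigma=2.0):
    xs = torch.arange(width, dtype=torch.float32)
    ys = torch.arange(height, dtype=torch.float32)
    hx = torch.exp(-0.5 * ((xs - x) / sigma) ** 2)
    hy = torch.exp(-0.5 * ((ys - y) / sigma) ** 2)
    return hx / hx.sum(), hy / hy.sum()  # normalized 1-D heatmaps

def decode_dual_1d(hx, hy):
    xs = torch.arange(len(hx), dtype=torch.float32)
    ys = torch.arange(len(hy), dtype=torch.float32)
    return (hx * xs).sum().item(), (hy * ys).sum().item()

if __name__ == "__main__":
    hx, hy = encode_dual_1d(37.4, 12.8, width=64, height=48)
    print(decode_dual_1d(hx, hy))  # approximately (37.4, 12.8)
```

Compared with a full 2-D heatmap, the two 1-D distributions need only width + height bins per keypoint, which is what makes the representation cheap enough for a one-stage detector.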
+
+ comment: Accepted at CVPR 2024. Project page: + https://github.com/open-mmlab/mmpose/tree/main/projects/rtmo +
+
+
+
+
+ + ♻ ☆ Zero-Shot Segmentation of Eye Features Using the Segment Anything Model + (SAM) + + +
+ The advent of foundation models signals a new era in artificial intelligence. +The Segment Anything Model (SAM) is the first foundation model for image +segmentation. In this study, we evaluate SAM's ability to segment features from +eye images recorded in virtual reality setups. The increasing requirement for +annotated eye-image datasets presents a significant opportunity for SAM to +redefine the landscape of data annotation in gaze estimation. Our investigation +centers on SAM's zero-shot learning abilities and the effectiveness of prompts +like bounding boxes or point clicks. Our results are consistent with studies in +other domains, demonstrating that SAM's segmentation effectiveness can be +on-par with specialized models depending on the feature, with prompts improving +its performance, evidenced by an IoU of 93.34% for pupil segmentation in one +dataset. Foundation models like SAM could revolutionize gaze estimation by +enabling quick and easy image segmentation, reducing reliance on specialized +models and extensive manual annotation. + +
+
+ comment: 14 pages, 8 figures, 1 table, Accepted to ETRA 2024: ACM Symposium on + Eye Tracking Research & Applications +
+
+
+
+
+ + ♻ ☆ Photo-SLAM: Real-time Simultaneous Localization and Photorealistic + Mapping for Monocular, Stereo, and RGB-D Cameras CVPR 2024 + + +
+ The integration of neural rendering and the SLAM system recently showed +promising results in joint localization and photorealistic view reconstruction. +However, existing methods, fully relying on implicit representations, are so +resource-hungry that they cannot run on portable devices, which deviates from +the original intention of SLAM. In this paper, we present Photo-SLAM, a novel +SLAM framework with a hyper primitives map. Specifically, we simultaneously +exploit explicit geometric features for localization and learn implicit +photometric features to represent the texture information of the observed +environment. In addition to actively densifying hyper primitives based on +geometric features, we further introduce a Gaussian-Pyramid-based training +method to progressively learn multi-level features, enhancing photorealistic +mapping performance. The extensive experiments with monocular, stereo, and +RGB-D datasets prove that our proposed system Photo-SLAM significantly +outperforms current state-of-the-art SLAM systems for online photorealistic +mapping, e.g., PSNR is 30% higher and rendering speed is hundreds of times +faster in the Replica dataset. Moreover, the Photo-SLAM can run at real-time +speed using an embedded platform such as Jetson AGX Orin, showing the potential +of robotics applications. + +
+
+ comment: CVPR 2024. Code: https://github.com/HuajianUP/Photo-SLAM - Project + Page: https://huajianup.github.io/research/Photo-SLAM/ +
+
+
+
+
+ + ♻ ☆ 360Loc: A Dataset and Benchmark for Omnidirectional Visual Localization + with Cross-device Queries CVPR 2024 + + +
+ Portable 360$^\circ$ cameras are becoming a cheap and efficient tool to +establish large visual databases. By capturing omnidirectional views of a +scene, these cameras could expedite building environment models that are +essential for visual localization. However, such an advantage is often +overlooked due to the lack of valuable datasets. This paper introduces a new +benchmark dataset, 360Loc, composed of 360$^\circ$ images with ground truth +poses for visual localization. We present a practical implementation of +360$^\circ$ mapping combining 360$^\circ$ images with lidar data to generate +the ground truth 6DoF poses. 360Loc is the first dataset and benchmark that +explores the challenge of cross-device visual positioning, involving +360$^\circ$ reference frames, and query frames from pinhole, ultra-wide FoV +fisheye, and 360$^\circ$ cameras. We propose a virtual camera approach to +generate lower-FoV query frames from 360$^\circ$ images, which ensures a fair +comparison of performance among different query types in visual localization +tasks. We also extend this virtual camera approach to feature matching-based +and pose regression-based methods to alleviate the performance loss caused by +the cross-device domain gap, and evaluate its effectiveness against +state-of-the-art baselines. We demonstrate that omnidirectional visual +localization is more robust in challenging large-scale scenes with symmetries +and repetitive structures. These results provide new insights into 360-camera +mapping and omnidirectional visual localization with cross-device queries. + +
+
+ comment: CVPR 2024. Project Page: https://huajianup.github.io/research/360Loc/ +
+
+
+
+
+ + ♻ ☆ Design as Desired: Utilizing Visual Question Answering for Multimodal + Pre-training + + +
+ Multimodal pre-training demonstrates its potential in the medical domain, +which learns medical visual representations from paired medical reports. +However, many pre-training tasks require extra annotations from clinicians, and +most of them fail to explicitly guide the model to learn the desired features +of different pathologies. To the best of our knowledge, we are the first to +utilize Visual Question Answering (VQA) for multimodal pre-training to guide +the framework focusing on targeted pathological features. In this work, we +leverage descriptions in medical reports to design multi-granular +question-answer pairs associated with different diseases, which assist the +framework in pre-training without requiring extra annotations from experts. We +also propose a novel pre-training framework with a quasi-textual feature +transformer, a module designed to transform visual features into a +quasi-textual space closer to the textual domain via a contrastive learning +strategy. This narrows the vision-language gap and facilitates modality +alignment. Our framework is applied to four downstream tasks: report +generation, classification, segmentation, and detection across five datasets. +Extensive experiments demonstrate the superiority of our framework compared to +other state-of-the-art methods. Our code will be released upon acceptance. + +
+
+
+
+
+ + ♻ ☆ LPSNet: End-to-End Human Pose and Shape Estimation with Lensless Imaging CVPR 2024 + + +
+ Human pose and shape (HPS) estimation with lensless imaging is not only beneficial to privacy protection but can also be used in covert surveillance scenarios due to the small size and simple structure of this device. However, this task presents significant challenges due to the inherent ambiguity of the captured measurements, and effective methods for directly estimating human pose and shape from lensless data are lacking. In this paper, we propose, to our knowledge, the first end-to-end framework to recover 3D human poses and shapes from lensless measurements. We specifically design a multi-scale lensless feature decoder to decode the lensless measurements through the optically encoded mask for efficient feature extraction. We also propose a double-head auxiliary supervision mechanism to improve the estimation accuracy of human limb ends. In addition, we establish a lensless imaging system and verify the effectiveness of our method on various datasets acquired with it. + +
+
+ comment: Accepted to CVPR 2024. More results available at + https://cic.tju.edu.cn/faculty/likun/projects/LPSNet +
+
+
+
+
+ + ♻ ☆ A ground-based dataset and a diffusion model for on-orbit low-light + image enhancement + + +
+ On-orbit service is important for maintaining the sustainability of the space environment. Space-based visible cameras are economical and lightweight sensors for situational awareness during on-orbit service. However, they are easily affected by low-illumination environments. Recently, deep learning has achieved remarkable success in the enhancement of natural images, but it is seldom applied in space due to the data bottleneck. In this article, we first propose a dataset of the Beidou Navigation Satellite for on-orbit low-light image enhancement (LLIE). In the automatic data collection scheme, we focus on reducing the domain gap and improving the diversity of the dataset. We collect hardware-in-the-loop images on a robotic simulation testbed imitating space lighting conditions. To evenly sample poses at different orientations and distances without collision, a collision-free workspace and stratified pose sampling are proposed. Afterwards, a novel diffusion model is proposed. To enhance image contrast without over-exposure or blurred details, we design a fused attention mechanism to highlight structures and dark regions. Finally, we compare our method with previous methods on our dataset; the results indicate that our method has a better capacity for on-orbit LLIE. + +
+
+
+
+
+ + ♻ ☆ Representing Noisy Image Without Denoising + + +
+ A long-standing topic in artificial intelligence is the effective recognition of patterns from noisy images. In this regard, the recent data-driven paradigm considers 1) improving representation robustness by adding noisy samples in the training phase (i.e., data augmentation) or 2) pre-processing the noisy image by learning to solve the inverse problem (i.e., image denoising). However, such methods generally exhibit inefficient processing and unstable results, limiting their practical applications. In this paper, we explore a non-learning paradigm that aims to derive robust representations directly from noisy images, without denoising as pre-processing. Here, the noise-robust representation is designed as Fractional-order Moments in Radon space (FMR), which also has the beneficial properties of orthogonality and rotation invariance. Unlike earlier integer-order methods, our work is a more generic design that takes such classical methods as special cases, and the introduced fractional-order parameter offers a time-frequency analysis capability that is not available in classical methods. Formally, both implicit and explicit paths for constructing the FMR are discussed in detail. Extensive simulation experiments and an image security application are provided to demonstrate the uniqueness and usefulness of our FMR, especially for noise robustness, rotation invariance, and time-frequency discriminability. + +
+
+ comment: Accepted by IEEE Transactions on Pattern Analysis and Machine + Intelligence, 2024 +
+
+
+
+
+ + ♻ ☆ PEEB: Part-based Image Classifiers with an Explainable and Editable + Language Bottleneck + + +
+ CLIP-based classifiers rely on the prompt containing a {class name} that is +known to the text encoder. Therefore, they perform poorly on new classes or the +classes whose names rarely appear on the Internet (e.g., scientific names of +birds). For fine-grained classification, we propose PEEB - an explainable and +editable classifier to (1) express the class name into a set of text +descriptors that describe the visual parts of that class; and (2) match the +embeddings of the detected parts to their textual descriptors in each class to +compute a logit score for classification. In a zero-shot setting where the +class names are unknown, PEEB outperforms CLIP by a huge margin (~10x in top-1 +accuracy). Compared to part-based classifiers, PEEB is not only the +state-of-the-art (SOTA) on the supervised-learning setting (88.80% and 92.20% +accuracy on CUB-200 and Dogs-120, respectively) but also the first to enable +users to edit the text descriptors to form a new classifier without any +re-training. Compared to concept bottleneck models, PEEB is also the SOTA in +both zero-shot and supervised-learning settings. + +
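The part-to-descriptor scoring rule can be sketched in a few lines. The snippet below is a simplification with hypothetical tensor shapes (one visual embedding per detected part, one text embedding per part descriptor per class), not the released PEEB code.

```python
# Simplified sketch of part-based scoring: each detected part embedding is
# matched to the corresponding textual part descriptor of every class, and the
# per-part similarities are summed into a class logit.
import torch

def part_logits(part_embs: torch.Tensor, class_desc_embs: torch.Tensor) -> torch.Tensor:
    """
    part_embs:       (num_parts, dim)               visual embeddings of detected parts
    class_desc_embs: (num_classes, num_parts, dim)  text embeddings of part descriptors
    returns:         (num_classes,)                 one logit per class
    """
    part_embs = torch.nn.functional.normalize(part_embs, dim=-1)
    class_desc_embs = torch.nn.functional.normalize(class_desc_embs, dim=-1)
    # Cosine similarity between each part and its matching descriptor, per class.
    sims = torch.einsum("pd,cpd->cp", part_embs, class_desc_embs)
    return sims.sum(dim=-1)

if __name__ == "__main__":
    logits = part_logits(torch.randn(12, 512), torch.randn(200, 12, 512))
    print(logits.shape, logits.argmax().item())
```

Because the class is defined entirely by its descriptor embeddings, editing the text descriptors changes the classifier without any re-training.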
+
+ comment: Findings of NAACL 2024 (long paper) +
+
+
+
+
+ + ♻ ☆ Swap Attention in Spatiotemporal Diffusions for Text-to-Video Generation + + +
+ With the explosive popularity of AI-generated content (AIGC), video generation has recently received a lot of attention. Generating videos guided by text instructions poses significant challenges, such as modeling the complex relationship between space and time and the lack of large-scale text-video paired data. Existing text-video datasets suffer from limitations in both content quality and scale, or they are not open-source, rendering them inaccessible for study and use. For model design, previous approaches extend pretrained text-to-image generation models by adding temporal 1D convolution/attention modules for video generation. However, these approaches overlook the importance of jointly modeling space and time, inevitably leading to temporal distortions and misalignment between texts and videos. In this paper, we propose a novel approach that strengthens the interaction between spatial and temporal perceptions. In particular, we utilize a swapped cross-attention mechanism in 3D windows that alternates the ``query'' role between spatial and temporal blocks, enabling the two to mutually reinforce each other. Moreover, to fully unlock model capabilities for high-quality video generation and promote the development of the field, we curate a large-scale and open-source video dataset called HD-VG-130M. This dataset comprises 130 million text-video pairs from the open domain, with high-definition, widescreen, and watermark-free characteristics. A smaller-scale yet more meticulously cleaned subset further enhances the data quality, aiding models in achieving superior performance. Experimental quantitative and qualitative results demonstrate the superiority of our approach in terms of per-frame quality, temporal correlation, and text-video alignment, with clear margins. + +
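The "swapped" query role can be illustrated with a small PyTorch sketch: in one attention call the spatial tokens query the temporal tokens, in the next the roles are reversed. This is a conceptual sketch with assumed token layouts (no 3D windowing), not the paper's implementation.

```python
# Conceptual sketch of swapped cross-attention between a spatial stream and a
# temporal stream: each stream alternately plays the "query" role.
import torch
import torch.nn as nn

class SwappedCrossAttention(nn.Module):
    def __init__(self, dim: int, heads: int = 4):
        super().__init__()
        self.attn_s2t = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.attn_t2s = nn.MultiheadAttention(dim, heads, batch_first=True)

    def forward(self, spatial: torch.Tensor, temporal: torch.Tensor):
        # Step 1: spatial tokens query the temporal tokens.
        s, _ = self.attn_s2t(spatial, temporal, temporal)
        spatial = spatial + s
        # Step 2: roles swapped, temporal tokens query the updated spatial tokens.
        t, _ = self.attn_t2s(temporal, spatial, spatial)
        temporal = temporal + t
        return spatial, temporal

if __name__ == "__main__":
    s, t = SwappedCrossAttention(64)(torch.randn(2, 196, 64), torch.randn(2, 16, 64))
    print(s.shape, t.shape)
```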
+
+
+
+
+ + ♻ ☆ InstaGen: Enhancing Object Detection by Training on Synthetic Dataset CVPR2024 + + +
+ In this paper, we present a novel paradigm to enhance the ability of object detectors, e.g., expanding categories or improving detection performance, by training on synthetic datasets generated by diffusion models. Specifically, we integrate an instance-level grounding head into a pre-trained, generative diffusion model, to augment it with the ability to localise instances in the generated images. The grounding head is trained to align the text embedding of category names with the regional visual features of the diffusion model, using supervision from an off-the-shelf object detector, and a novel self-training scheme on (novel) categories not covered by the detector. We conduct thorough experiments to show that this enhanced version of the diffusion model, termed InstaGen, can serve as a data synthesizer to enhance object detectors by training on its generated samples, demonstrating superior performance over existing state-of-the-art methods in open-vocabulary (+4.5 AP) and data-sparse (+1.2 to 5.2 AP) scenarios. Project page with code: https://fcjian.github.io/InstaGen. + +
+
+ comment: CVPR2024 +
+
+
+
+
+ + ♻ ☆ SIFU: Side-view Conditioned Implicit Function for Real-world Usable + Clothed Human Reconstruction CVPR 2024 + + +
+ Creating high-quality 3D models of clothed humans from single images for real-world applications is crucial. Despite recent advancements, accurately reconstructing humans in complex poses or with loose clothing from in-the-wild images, along with predicting textures for unseen areas, remains a significant challenge. A key limitation of previous methods is their insufficient prior guidance in transitioning from 2D to 3D and in texture prediction. In response, we introduce SIFU (Side-view Conditioned Implicit Function for Real-world Usable Clothed Human Reconstruction), a novel approach combining a Side-view Decoupling Transformer with a 3D Consistent Texture Refinement pipeline. SIFU employs a cross-attention mechanism within the transformer, using SMPL-X normals as queries to effectively decouple side-view features in the process of mapping 2D features to 3D. This method improves not only the precision of the 3D models but also their robustness, especially when SMPL-X estimates are not perfect. Our texture refinement process leverages a text-to-image diffusion-based prior to generate realistic and consistent textures for invisible views. Through extensive experiments, SIFU surpasses SOTA methods in both geometry and texture reconstruction, showcasing enhanced robustness in complex scenarios and achieving unprecedented Chamfer and P2S measurements. Our approach extends to practical applications such as 3D printing and scene building, demonstrating its broad utility in real-world scenarios. Project page: https://river-zhang.github.io/SIFU-projectpage/ . + +
+
+ comment: Accepted by CVPR 2024; Project page + https://river-zhang.github.io/SIFU-projectpage/ +
+
+
+
+
+ + ♻ ☆ SAOR: Single-View Articulated Object Reconstruction CVPR 2024 + + +
+ We introduce SAOR, a novel approach for estimating the 3D shape, texture, and +viewpoint of an articulated object from a single image captured in the wild. +Unlike prior approaches that rely on pre-defined category-specific 3D templates +or tailored 3D skeletons, SAOR learns to articulate shapes from single-view +image collections with a skeleton-free part-based model without requiring any +3D object shape priors. To prevent ill-posed solutions, we propose a +cross-instance consistency loss that exploits disentangled object shape +deformation and articulation. This is helped by a new silhouette-based sampling +mechanism to enhance viewpoint diversity during training. Our method only +requires estimated object silhouettes and relative depth maps from +off-the-shelf pre-trained networks during training. At inference time, given a +single-view image, it efficiently outputs an explicit mesh representation. We +obtain improved qualitative and quantitative results on challenging quadruped +animals compared to relevant existing work. + +
+
+ comment: Accepted to CVPR 2024, website: https://mehmetaygun.github.io/saor +
+
+
+
+
+ + ♻ ☆ CA-Jaccard: Camera-aware Jaccard Distance for Person Re-identification CVPR 2024 + + +
+ Person re-identification (re-ID) is a challenging task that aims to learn +discriminative features for person retrieval. In person re-ID, Jaccard distance +is a widely used distance metric, especially in re-ranking and clustering +scenarios. However, we discover that camera variation has a significant +negative impact on the reliability of Jaccard distance. In particular, Jaccard +distance calculates the distance based on the overlap of relevant neighbors. +Due to camera variation, intra-camera samples dominate the relevant neighbors, +which reduces the reliability of the neighbors by introducing intra-camera +negative samples and excluding inter-camera positive samples. To overcome this +problem, we propose a novel camera-aware Jaccard (CA-Jaccard) distance that +leverages camera information to enhance the reliability of Jaccard distance. +Specifically, we design camera-aware k-reciprocal nearest neighbors (CKRNNs) to +find k-reciprocal nearest neighbors on the intra-camera and inter-camera +ranking lists, which improves the reliability of relevant neighbors and +guarantees the contribution of inter-camera samples in the overlap. Moreover, +we propose a camera-aware local query expansion (CLQE) to mine reliable samples +in relevant neighbors by exploiting camera variation as a strong constraint and +assign these samples higher weights in overlap, further improving the +reliability. Our CA-Jaccard distance is simple yet effective and can serve as a +general distance metric for person re-ID methods with high reliability and low +computational cost. Extensive experiments demonstrate the effectiveness of our +method. + +
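For readers unfamiliar with the baseline, the sketch below computes a plain Jaccard distance over k-reciprocal nearest-neighbor sets. The camera-aware variant described above additionally builds separate intra-camera and inter-camera ranking lists and re-weights reliable neighbors; that part is not reproduced here.

```python
# Plain-Jaccard sketch (illustration only): the distance between two samples is
# one minus the overlap ratio of their k-reciprocal nearest-neighbor sets.
import numpy as np

def k_reciprocal_neighbors(dist: np.ndarray, k: int):
    order = np.argsort(dist, axis=1)
    topk = [set(order[i, :k + 1]) for i in range(len(dist))]  # includes self
    return [{j for j in topk[i] if i in topk[j]} for i in range(len(dist))]

def jaccard_distance(dist: np.ndarray, k: int = 5) -> np.ndarray:
    nbrs = k_reciprocal_neighbors(dist, k)
    n = len(dist)
    out = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            inter = len(nbrs[i] & nbrs[j])
            union = len(nbrs[i] | nbrs[j])
            out[i, j] = 1.0 - inter / union if union else 1.0
    return out

if __name__ == "__main__":
    feats = np.random.randn(8, 16)
    d = np.linalg.norm(feats[:, None] - feats[None, :], axis=-1)
    print(jaccard_distance(d, k=3).round(2))
```

The failure mode discussed in the abstract is visible in this formulation: if intra-camera samples crowd out the top-k lists, the neighbor sets, and hence the overlap, become unreliable.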
+
+ comment: This paper is accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ SegmentAnything helps microscopy images based automatic and quantitative + organoid detection and analysis + + +
+ Organoids are self-organized 3D cell clusters that closely mimic the architecture and function of in vivo tissues and organs. Quantification of organoid morphology helps in studying organ development, drug discovery, and toxicity assessment. Recent microscopy techniques provide a potent tool to acquire organoid morphology features, but manual image analysis remains a labor- and time-intensive process. Thus, this paper proposes a comprehensive pipeline for microscopy analysis that leverages SegmentAnything to precisely demarcate individual organoids. Additionally, we introduce a set of morphological properties, including perimeter, area, radius, non-smoothness, and non-circularity, allowing researchers to analyze organoid structures quantitatively and automatically. To validate the effectiveness of our approach, we conducted tests on bright-field images of human induced pluripotent stem cell (iPSC)-derived neural-epithelial (NE) organoids. The results obtained from our automatic pipeline closely align with manual organoid detection and measurement, showcasing the capability of our proposed method in accelerating organoid morphology analysis. + +
+
+ comment: Replace Figure 4 with the correct version. The original version is + wrong due to a column name mismatch +
+
+
+
+
+ + ♻ ☆ Understanding normalization in contrastive representation learning and + out-of-distribution detection + + +
+ Contrastive representation learning has emerged as an outstanding approach +for anomaly detection. In this work, we explore the $\ell_2$-norm of +contrastive features and its applications in out-of-distribution detection. We +propose a simple method based on contrastive learning, which incorporates +out-of-distribution data by discriminating against normal samples in the +contrastive layer space. Our approach can be applied flexibly as an outlier +exposure (OE) approach, where the out-of-distribution data is a huge collective +of random images, or as a fully self-supervised learning approach, where the +out-of-distribution data is self-generated by applying distribution-shifting +transformations. The ability to incorporate additional out-of-distribution +samples enables a feasible solution for datasets where AD methods based on +contrastive learning generally underperform, such as aerial images or +microscopy images. Furthermore, the high-quality features learned through +contrastive learning consistently enhance performance in OE scenarios, even +when the available out-of-distribution dataset is not diverse enough. Our +extensive experiments demonstrate the superiority of our proposed method under +various scenarios, including unimodal and multimodal settings, with various +image datasets. + +
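As a loose illustration of how a norm-based score can be used at test time, the snippet below combines the pre-normalization feature norm with the maximum cosine similarity to training features. This is my own simplification of the idea, not the paper's exact scoring rule, and the relative weighting of the two terms is an assumption.

```python
# Hedged sketch of an OOD score built from the L2 norm of contrastive features
# and the similarity to in-distribution training features.
import torch

def ood_score(feat: torch.Tensor, train_feats: torch.Tensor) -> torch.Tensor:
    # feat: (B, D) features from the contrastive layer, before L2 normalization.
    norm_term = feat.norm(dim=-1)  # tends to be larger for in-distribution inputs
    cos = torch.nn.functional.normalize(feat, dim=-1) @ \
          torch.nn.functional.normalize(train_feats, dim=-1).T
    sim_term = cos.max(dim=-1).values
    return -(norm_term * sim_term)  # higher score = more likely out-of-distribution

if __name__ == "__main__":
    train = torch.randn(100, 128) * 3.0
    in_dist, out_dist = torch.randn(4, 128) * 3.0, torch.randn(4, 128) * 0.5
    print(ood_score(in_dist, train))
    print(ood_score(out_dist, train))
```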
+
+
+
+
+ + ♻ ☆ Confronting Ambiguity in 6D Object Pose Estimation via Score-Based + Diffusion on SE(3) CVPR2024 + + +
+ Addressing pose ambiguity in 6D object pose estimation from single RGB images +presents a significant challenge, particularly due to object symmetries or +occlusions. In response, we introduce a novel score-based diffusion method +applied to the $SE(3)$ group, marking the first application of diffusion models +to $SE(3)$ within the image domain, specifically tailored for pose estimation +tasks. Extensive evaluations demonstrate the method's efficacy in handling pose +ambiguity, mitigating perspective-induced ambiguity, and showcasing the +robustness of our surrogate Stein score formulation on $SE(3)$. This +formulation not only improves the convergence of denoising process but also +enhances computational efficiency. Thus, we pioneer a promising strategy for 6D +object pose estimation. + +
+
+ comment: CVPR2024 +
+
+
+
+
+ + ♻ ☆ Neural Implicit Morphing of Face Images CVPR 2024 + + +
+ Face morphing is a problem in computer graphics with numerous artistic and +forensic applications. It is challenging due to variations in pose, lighting, +gender, and ethnicity. This task consists of a warping for feature alignment +and a blending for a seamless transition between the warped images. We propose +to leverage coord-based neural networks to represent such warpings and +blendings of face images. During training, we exploit the smoothness and +flexibility of such networks by combining energy functionals employed in +classical approaches without discretizations. Additionally, our method is +time-dependent, allowing a continuous warping/blending of the images. During +morphing inference, we need both direct and inverse transformations of the +time-dependent warping. The first (second) is responsible for warping the +target (source) image into the source (target) image. Our neural warping stores +those maps in a single network dismissing the need for inverting them. The +results of our experiments indicate that our method is competitive with both +classical and generative models under the lens of image quality and +face-morphing detectors. Aesthetically, the resulting images present a seamless +blending of diverse faces not yet usual in the literature. + +
+
+ comment: 14 pages, 20 figures, accepted for CVPR 2024 +
+
+
+
+
+ + ♻ ☆ SegForestNet: Spatial-Partitioning-Based Aerial Image Segmentation + + +
+ Aerial image segmentation is the basis for applications such as automatically +creating maps or tracking deforestation. In true orthophotos, which are often +used in these applications, many objects and regions can be approximated well +by polygons. However, this fact is rarely exploited by state-of-the-art +semantic segmentation models. Instead, most models allow unnecessary degrees of +freedom in their predictions by allowing arbitrary region shapes. We therefore +present a refinement of our deep learning model which predicts binary space +partitioning trees, an efficient polygon representation. The refinements +include a new feature decoder architecture and a new differentiable BSP tree +renderer which both avoid vanishing gradients. Additionally, we designed a +novel loss function specifically designed to improve the spatial partitioning +defined by the predicted trees. Furthermore, our expanded model can predict +multiple trees at once and thus can predict class-specific segmentations. As an +additional contribution, we investigate the impact of a non-optimal training +process in comparison to an optimized training process. While model +architectures optimized for aerial images, such as PFNet or our own model, show +an advantage under non-optimal conditions, this advantage disappears under +optimal training conditions. Despite this observation, our model still makes +better predictions for small rectangular objects, e.g., cars. + +
+
+
+
+
+ + ♻ ☆ Synthetic data shuffling accelerates the convergence of federated + learning under data heterogeneity + + +
+ In federated learning, data heterogeneity is a critical challenge. A +straightforward solution is to shuffle the clients' data to homogenize the +distribution. However, this may violate data access rights, and how and when +shuffling can accelerate the convergence of a federated optimization algorithm +is not theoretically well understood. In this paper, we establish a precise and +quantifiable correspondence between data heterogeneity and parameters in the +convergence rate when a fraction of data is shuffled across clients. We prove +that shuffling can quadratically reduce the gradient dissimilarity with respect +to the shuffling percentage, accelerating convergence. Inspired by the theory, +we propose a practical approach that addresses the data access rights issue by +shuffling locally generated synthetic data. The experimental results show that +shuffling synthetic data improves the performance of multiple existing +federated learning algorithms by a large margin. + +
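A minimal sketch of the shuffling step, under an assumed in-memory layout where each client holds a list of locally generated synthetic samples, is given below; the fraction to shuffle is the knob that the paper's theory relates to gradient dissimilarity.

```python
# Minimal sketch: each client contributes a fraction of its locally generated
# synthetic samples to a shared pool, which is then redistributed round-robin,
# homogenizing the per-client distributions without touching private data.
import random

def shuffle_synthetic(client_data: dict, fraction: float, seed: int = 0):
    rng = random.Random(seed)
    pool = []
    for cid, samples in client_data.items():
        rng.shuffle(samples)
        k = int(len(samples) * fraction)
        pool.extend(samples[:k])            # move a fraction into the shared pool
        client_data[cid] = samples[k:]
    rng.shuffle(pool)
    clients = list(client_data)
    for i, sample in enumerate(pool):       # redistribute evenly
        client_data[clients[i % len(clients)]].append(sample)
    return client_data

if __name__ == "__main__":
    data = {c: [f"{c}_synth_{i}" for i in range(6)] for c in ["a", "b", "c"]}
    print(shuffle_synthetic(data, fraction=0.5))
```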
+
+ comment: Accepted at TMLR +
+
+
+
+
+ + ♻ ☆ Learning Optical Flow and Scene Flow with Bidirectional Camera-LiDAR + Fusion TPAMI 2023 + + +
+ In this paper, we study the problem of jointly estimating the optical flow +and scene flow from synchronized 2D and 3D data. Previous methods either employ +a complex pipeline that splits the joint task into independent stages, or fuse +2D and 3D information in an ``early-fusion'' or ``late-fusion'' manner. Such +one-size-fits-all approaches suffer from a dilemma of failing to fully utilize +the characteristic of each modality or to maximize the inter-modality +complementarity. To address the problem, we propose a novel end-to-end +framework, which consists of 2D and 3D branches with multiple bidirectional +fusion connections between them in specific layers. Different from previous +work, we apply a point-based 3D branch to extract the LiDAR features, as it +preserves the geometric structure of point clouds. To fuse dense image features +and sparse point features, we propose a learnable operator named bidirectional +camera-LiDAR fusion module (Bi-CLFM). We instantiate two types of the +bidirectional fusion pipeline, one based on the pyramidal coarse-to-fine +architecture (dubbed CamLiPWC), and the other one based on the recurrent +all-pairs field transforms (dubbed CamLiRAFT). On FlyingThings3D, both CamLiPWC +and CamLiRAFT surpass all existing methods and achieve up to a 47.9\% reduction +in 3D end-point-error from the best published result. Our best-performing +model, CamLiRAFT, achieves an error of 4.26\% on the KITTI Scene Flow +benchmark, ranking 1st among all submissions with much fewer parameters. +Besides, our methods have strong generalization performance and the ability to +handle non-rigid motion. Code is available at +https://github.com/MCG-NJU/CamLiFlow. + +
+
+ comment: Accepted to TPAMI 2023 +
+
+
+
+
+ + ♻ ☆ Burst Super-Resolution with Diffusion Models for Improving Perceptual + Quality + + +
+ While burst LR images are useful for improving the SR image quality compared +with a single LR image, prior SR networks accepting the burst LR images are +trained in a deterministic manner, which is known to produce a blurry SR image. +In addition, it is difficult to perfectly align the burst LR images, making the +SR image more blurry. Since such blurry images are perceptually degraded, we +aim to reconstruct the sharp high-fidelity boundaries. Such high-fidelity +images can be reconstructed by diffusion models. However, prior SR methods +using the diffusion model are not properly optimized for the burst SR task. +Specifically, the reverse process starting from a random sample is not +optimized for image enhancement and restoration methods, including burst SR. In +our proposed method, on the other hand, burst LR features are used to +reconstruct the initial burst SR image that is fed into an intermediate step in +the diffusion model. This reverse process from the intermediate step 1) skips +diffusion steps for reconstructing the global structure of the image and 2) +focuses on steps for refining detailed textures. Our experimental results +demonstrate that our method can improve the scores of the perceptual quality +metrics. Code: https://github.com/placerkyo/BSRD + +
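The "start from an intermediate step" idea can be written schematically in generic DDPM/DDIM notation: the initial burst-SR estimate is noised to an intermediate timestep and the reverse process runs only from there to zero. The scheduler, the chosen timestep, and the denoiser below are placeholders, not the paper's configuration.

```python
# Schematic sketch: reverse diffusion started from an intermediate step t0,
# seeded by an initial SR estimate instead of pure noise.
import torch

def reverse_from_intermediate(x_init, denoise_fn, alphas_cumprod, t0):
    a_bar = alphas_cumprod[t0]
    # Diffuse the initial estimate to step t0.
    x = a_bar.sqrt() * x_init + (1 - a_bar).sqrt() * torch.randn_like(x_init)
    for t in range(t0, 0, -1):
        a_bar_t, a_bar_prev = alphas_cumprod[t], alphas_cumprod[t - 1]
        eps = denoise_fn(x, t)                                      # predicted noise
        x0 = (x - (1 - a_bar_t).sqrt() * eps) / a_bar_t.sqrt()      # predicted clean image
        x = a_bar_prev.sqrt() * x0 + (1 - a_bar_prev).sqrt() * eps  # DDIM-style step
    return x

if __name__ == "__main__":
    alphas_cumprod = torch.linspace(0.9999, 0.02, 1000)   # toy schedule
    dummy_denoiser = lambda x, t: torch.zeros_like(x)     # stands in for the trained model
    out = reverse_from_intermediate(torch.rand(1, 3, 64, 64), dummy_denoiser,
                                    alphas_cumprod, t0=300)
    print(out.shape)
```

Skipping the early (high-noise) steps is what preserves the global structure already present in the burst estimate while the remaining steps refine texture.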
+
+ comment: Accepted to IJCNN 2024 (International Joint Conference on Neural + Networks) +
+
+
+
+
+ + ♻ ☆ Gyro-based Neural Single Image Deblurring + + +
+ In this paper, we present GyroDeblurNet, a novel single image deblurring method that utilizes a gyro sensor to effectively resolve the ill-posedness of image deblurring. The gyro sensor provides valuable information about camera motion during exposure time that can significantly improve deblurring quality. However, effectively exploiting real-world gyro data is challenging due to significant errors from various sources, including sensor noise, the disparity between the positions of the camera module and the gyro sensor, the absence of translational motion information, and moving objects whose motions cannot be captured by a gyro sensor. To handle gyro error, GyroDeblurNet is equipped with two novel neural network blocks: a gyro refinement block and a gyro deblurring block. The gyro refinement block refines the error-ridden gyro data using the blur information from the input image. The gyro deblurring block then removes blur from the input image using the refined gyro data and further compensates for gyro error by leveraging the blur information from the input image. For training a neural network with erroneous gyro data, we propose a training strategy based on curriculum learning. We also introduce a novel gyro data embedding scheme to represent real-world intricate camera shakes. Finally, we present a synthetic dataset and a real dataset for the training and evaluation of gyro-based single image deblurring. Our experiments demonstrate that our approach achieves state-of-the-art deblurring quality by effectively utilizing erroneous gyro data. + +
+
+ comment: 14 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ A Comprehensive Review of Knowledge Distillation in Computer Vision + + +
+ Deep learning techniques have been demonstrated to surpass preceding +cutting-edge machine learning techniques in recent years, with computer vision +being one of the most prominent examples. However, deep learning models suffer +from significant drawbacks when deployed in resource-constrained environments +due to their large model size and high complexity. Knowledge Distillation is +one of the prominent solutions to overcome this challenge. This review paper +examines the current state of research on knowledge distillation, a technique +for compressing complex models into smaller and simpler ones. The paper +provides an overview of the major principles and techniques associated with +knowledge distillation and reviews the applications of knowledge distillation +in the domain of computer vision. The review focuses on the benefits of +knowledge distillation, as well as the problems that must be overcome to +improve its effectiveness. + +
+
+ comment: 36 pages ,10 figures +
+
+
+
+
+ + ♻ ☆ Autoregressive Omni-Aware Outpainting for Open-Vocabulary 360-Degree + Image Generation + + +
+ A 360-degree (omni-directional) image provides an all-encompassing spherical view of a scene. Recently, there has been increasing interest in synthesising 360-degree images from conventional narrow field of view (NFoV) images captured by digital cameras and smartphones, to provide immersive experiences in various scenarios such as virtual reality. Yet, existing methods typically fall short in synthesizing intricate visual details or ensuring that the generated images align consistently with user-provided prompts. In this study, an autoregressive omni-aware generative network (AOG-Net) is proposed for 360-degree image generation by progressively outpainting an incomplete 360-degree image with NFoV and text guidance, jointly or individually. This autoregressive scheme not only allows for deriving finer-grained and text-consistent patterns by dynamically generating and adjusting the process but also offers users greater flexibility to edit their conditions throughout the generation process. A global-local conditioning mechanism is devised to comprehensively formulate the outpainting guidance in each autoregressive step. Text guidance, omni-visual cues, NFoV inputs and omni-geometry are encoded and further formulated with cross-attention-based transformers into a global stream and a local stream that condition a generative backbone model. As AOG-Net is compatible with large-scale models for the conditional encoder and the generative prior, the generation can use extensive open-vocabulary text guidance. Comprehensive experiments on two commonly used 360-degree image datasets for both indoor and outdoor settings demonstrate the state-of-the-art performance of our proposed method. Our code will be made publicly available. + +
+
+ comment: Accepted by AAAI 24 +
+
+
+
+
+ + ♻ ☆ Feature 3DGS: Supercharging 3D Gaussian Splatting to Enable Distilled + Feature Fields + + +
+ 3D scene representations have gained immense popularity in recent years. +Methods that use Neural Radiance fields are versatile for traditional tasks +such as novel view synthesis. In recent times, some work has emerged that aims +to extend the functionality of NeRF beyond view synthesis, for semantically +aware tasks such as editing and segmentation using 3D feature field +distillation from 2D foundation models. However, these methods have two major +limitations: (a) they are limited by the rendering speed of NeRF pipelines, and +(b) implicitly represented feature fields suffer from continuity artifacts +reducing feature quality. Recently, 3D Gaussian Splatting has shown +state-of-the-art performance on real-time radiance field rendering. In this +work, we go one step further: in addition to radiance field rendering, we +enable 3D Gaussian splatting on arbitrary-dimension semantic features via 2D +foundation model distillation. This translation is not straightforward: naively +incorporating feature fields in the 3DGS framework encounters significant +challenges, notably the disparities in spatial resolution and channel +consistency between RGB images and feature maps. We propose architectural and +training changes to efficiently avert this problem. Our proposed method is +general, and our experiments showcase novel view semantic segmentation, +language-guided editing and segment anything through learning feature fields +from state-of-the-art 2D foundation models such as SAM and CLIP-LSeg. Across +experiments, our distillation method is able to provide comparable or better +results, while being significantly faster to both train and render. +Additionally, to the best of our knowledge, we are the first method to enable +point and bounding-box prompting for radiance field manipulation, by leveraging +the SAM model. Project website at: https://feature-3dgs.github.io/ + +
+
+
+
+
+ + ♻ ☆ Unifying Correspondence, Pose and NeRF for Pose-Free Novel View + Synthesis from Stereo Pairs CVPR2024 + + +
+ This work delves into the task of pose-free novel view synthesis from stereo +pairs, a challenging and pioneering task in 3D vision. Our innovative +framework, unlike any before, seamlessly integrates 2D correspondence matching, +camera pose estimation, and NeRF rendering, fostering a synergistic enhancement +of these tasks. We achieve this through designing an architecture that utilizes +a shared representation, which serves as a foundation for enhanced 3D geometry +understanding. Capitalizing on the inherent interplay between the tasks, our +unified framework is trained end-to-end with the proposed training strategy to +improve overall model accuracy. Through extensive evaluations across diverse +indoor and outdoor scenes from two real-world datasets, we demonstrate that our +approach achieves substantial improvement over previous methodologies, +especially in scenarios characterized by extreme viewpoint changes and the +absence of accurate camera poses. + +
+
+ comment: Project page: https://ku-cvlab.github.io/CoPoNeRF/ CVPR2024 camera + ready version (Highlight) +
+
+
+
+
+ + ♻ ☆ UAV-Rain1k: A Benchmark for Raindrop Removal from UAV Aerial Imagery CVPR + + +
+ Raindrops adhering to the lens of UAVs can obstruct visibility of the background scene and degrade image quality. Despite recent progress in image deraining methods and datasets, there is a lack of focus on raindrop removal from UAV aerial imagery due to the unique challenges posed by varying angles and rapid movement during drone flight. To fill this gap, we first construct a new benchmark dataset for removing raindrops from UAV images, called UAV-Rain1k. In this letter, we provide a dataset generation pipeline, which includes modeling raindrop shapes using Blender, collecting background images from various UAV angles, random sampling of rain masks, and so on. Based on the proposed benchmark, we further present a comprehensive evaluation of existing representative image deraining algorithms and reveal future research opportunities worth exploring. The proposed dataset is publicly available at https://github.com/cschenxiang/UAV-Rain1k. + +
+
+ comment: Accepted by IEEE/CVF Conference on Computer Vision and Pattern + Recognition Workshops (CVPRW) 2024 +
+
+
+
+
+ + ♻ ☆ Fully Sparse 3D Occupancy Prediction + + +
+ Occupancy prediction plays a pivotal role in autonomous driving. Previous methods typically construct dense 3D volumes, neglecting the inherent sparsity of the scene and suffering high computational costs. To bridge the gap, we introduce a novel fully sparse occupancy network, termed SparseOcc. SparseOcc initially reconstructs a sparse 3D representation from visual inputs and subsequently predicts semantic/instance occupancy from the 3D sparse representation using sparse queries. A mask-guided sparse sampling is designed to enable sparse queries to interact with 2D features in a fully sparse manner, thereby circumventing costly dense features or global attention. Additionally, we design a ray-based evaluation metric, namely RayIoU, to address the depth-inconsistency penalty inherent in traditional voxel-level mIoU criteria. SparseOcc demonstrates its effectiveness by achieving a RayIoU of 34.0 while maintaining a real-time inference speed of 17.3 FPS with 7 history frames as input. By incorporating more preceding frames (up to 15), SparseOcc further improves its performance to 35.1 RayIoU without bells and whistles. Code is available at https://github.com/MCG-NJU/SparseOcc. + +
+
+ comment: Add new metric: RayIoU +
+
+
+
+
+ + ♻ ☆ Enhancing Ship Classification in Optical Satellite Imagery: Integrating + Convolutional Block Attention Module with ResNet for Improved Performance + + +
+ This study presents an advanced Convolutional Neural Network (CNN) +architecture for ship classification from optical satellite imagery, +significantly enhancing performance through the integration of the +Convolutional Block Attention Module (CBAM) and additional architectural +innovations. Building upon the foundational ResNet50 model, we first +incorporated a standard CBAM to direct the model's focus towards more +informative features, achieving an accuracy of 87% compared to the baseline +ResNet50's 85%. Further augmentations involved multi-scale feature integration, +depthwise separable convolutions, and dilated convolutions, culminating in the +Enhanced ResNet Model with Improved CBAM. This model demonstrated a remarkable +accuracy of 95%, with precision, recall, and f1-scores all witnessing +substantial improvements across various ship classes. The bulk carrier and oil +tanker classes, in particular, showcased nearly perfect precision and recall +rates, underscoring the model's enhanced capability in accurately identifying +and classifying ships. Attention heatmap analyses further validated the +improved model's efficacy, revealing a more focused attention on relevant ship +features, regardless of background complexities. These findings underscore the +potential of integrating attention mechanisms and architectural innovations in +CNNs for high-resolution satellite imagery classification. The study navigates +through the challenges of class imbalance and computational costs, proposing +future directions towards scalability and adaptability in new or rare ship type +recognition. This research lays a groundwork for the application of advanced +deep learning techniques in the domain of remote sensing, offering insights +into scalable and efficient satellite image classification. + +
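For reference, the block below sketches the standard CBAM formulation (channel attention from pooled descriptors followed by spatial attention from channel-pooled maps) applied to a ResNet-style feature map. It follows the commonly published CBAM design, not this study's exact training code or its additional architectural modifications.

```python
# Compact CBAM-style block: channel attention, then spatial attention.
import torch
import torch.nn as nn

class CBAM(nn.Module):
    def __init__(self, channels: int, reduction: int = 16, kernel: int = 7):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(channels, channels // reduction), nn.ReLU(),
            nn.Linear(channels // reduction, channels),
        )
        self.spatial = nn.Conv2d(2, 1, kernel, padding=kernel // 2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        b, c, _, _ = x.shape
        avg = self.mlp(x.mean(dim=(2, 3)))   # channel descriptor from average pooling
        mx = self.mlp(x.amax(dim=(2, 3)))    # channel descriptor from max pooling
        x = x * torch.sigmoid(avg + mx).view(b, c, 1, 1)   # channel attention
        pooled = torch.cat([x.mean(1, keepdim=True), x.amax(1, keepdim=True)], dim=1)
        return x * torch.sigmoid(self.spatial(pooled))     # spatial attention

if __name__ == "__main__":
    print(CBAM(256)(torch.randn(2, 256, 14, 14)).shape)  # torch.Size([2, 256, 14, 14])
```

Because the block preserves the feature-map shape, it can be dropped after any ResNet stage without changing the rest of the backbone.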
+
+
+
+
+ + ♻ ☆ Holistic Inverse Rendering of Complex Facade via Aerial 3D Scanning + + +
+ In this work, we use multi-view aerial images to reconstruct the geometry, lighting, and material of facades using neural signed distance fields (SDFs). Without the requirement of complex equipment, our method only takes simple RGB images captured by a drone as inputs to enable physically based and photorealistic novel-view rendering, relighting, and editing. However, a real-world facade usually has complex appearances ranging from diffuse rocks with subtle details to large-area glass windows with specular reflections, making it hard to attend to everything. As a result, previous methods can preserve the geometry details but fail to reconstruct smooth glass windows, or vice versa. In order to address this challenge, we introduce three spatial- and semantic-adaptive optimization strategies, including a semantic regularization approach based on zero-shot segmentation techniques to improve material consistency, a frequency-aware geometry regularization to balance surface smoothness and details in different surfaces, and a visibility probe-based scheme to enable efficient modeling of the local lighting in large-scale outdoor environments. In addition, we capture a real-world facade aerial 3D scanning image set and corresponding point clouds for training and benchmarking. The experiments demonstrate the superior quality of our method on facade holistic inverse rendering, novel view synthesis, and scene editing compared to state-of-the-art baselines. + +
+
+
+
+
+ + ♻ ☆ Temporally Consistent Unbalanced Optimal Transport for Unsupervised + Action Segmentation CVPR 2024 + + +
+ We propose a novel approach to the action segmentation task for long, +untrimmed videos, based on solving an optimal transport problem. By encoding a +temporal consistency prior into a Gromov-Wasserstein problem, we are able to +decode a temporally consistent segmentation from a noisy affinity/matching cost +matrix between video frames and action classes. Unlike previous approaches, our +method does not require knowing the action order for a video to attain temporal +consistency. Furthermore, our resulting (fused) Gromov-Wasserstein problem can +be efficiently solved on GPUs using a few iterations of projected mirror +descent. We demonstrate the effectiveness of our method in an unsupervised +learning setting, where our method is used to generate pseudo-labels for +self-training. We evaluate our segmentation approach and unsupervised learning +pipeline on the Breakfast, 50-Salads, YouTube Instructions and Desktop Assembly +datasets, yielding state-of-the-art results for the unsupervised video action +segmentation task. + +
+
+ comment: Accepted to CVPR 2024 (Oral) +
+
+
+
+
+ + ♻ ☆ MVSA-Net: Multi-View State-Action Recognition for Robust and Deployable + Trajectory Generation + + +
+ The learn-from-observation (LfO) paradigm is a human-inspired mode for a +robot to learn to perform a task simply by watching it being performed. LfO can +facilitate robot integration on factory floors by minimizing disruption and +reducing tedious programming. A key component of the LfO pipeline is a +transformation of the depth camera frames to the corresponding task state and +action pairs, which are then relayed to learning techniques such as imitation +or inverse reinforcement learning for understanding the task parameters. While +several existing computer vision models analyze videos for activity +recognition, SA-Net specifically targets robotic LfO from RGB-D data. However, +SA-Net and many other models analyze frame data captured from a single +viewpoint. Their analysis is therefore highly sensitive to occlusions of the +observed task, which are frequent in deployments. An obvious way of reducing +occlusions is to simultaneously observe the task from multiple viewpoints and +synchronously fuse the multiple streams in the model. Toward this, we present +multi-view SA-Net, which generalizes the SA-Net model to allow the perception +of multiple viewpoints of the task activity, integrate them, and better +recognize the state and action in each frame. Performance evaluations on two +distinct domains establish that MVSA-Net recognizes the state-action pairs +under occlusion more accurately compared to single-view MVSA-Net and other +baselines. Our ablation studies further evaluate its performance under +different ambient conditions and establish the contribution of the architecture +components. As such, MVSA-Net offers a significantly more robust and deployable +state-action trajectory generation compared to previous methods. + +
+
+ comment: Presented at Deployable AI Workshop at AAAI-2024 and 'Towards + Reliable and Deployable Learning-Based Robotic Systems' Workshop at CoRL2023 +
+
+
+
+
+ + ♻ ☆ And Then the Hammer Broke: Reflections on Machine Ethics from Feminist + Philosophy of Science + + +
+ Vision is an important metaphor in ethical and political questions of +knowledge. The feminist philosopher Donna Haraway points out the ``perverse'' +nature of an intrusive, alienating, all-seeing vision (to which we might cry +out ``stop looking at me!''), but also encourages us to embrace the embodied +nature of sight and its promises for genuinely situated knowledge. Current +technologies of machine vision -- surveillance cameras, drones (for war or +recreation), iPhone cameras -- are usually construed as instances of the former +rather than the latter, and for good reasons. However, although in no way +attempting to diminish the real suffering these technologies have brought about +in the world, I make the case for understanding technologies of computer vision +as material instances of embodied seeing and situated knowing. Furthermore, +borrowing from Iris Murdoch's concept of moral vision, I suggest that these +technologies direct our labor towards self-reflection in ethically significant +ways. My approach draws upon paradigms in computer vision research, +phenomenology, and feminist epistemology. Ultimately, this essay is an argument +for directing more philosophical attention from merely criticizing technologies +of vision as ethically deficient towards embracing them as complex, +methodologically and epistemologically important objects. + +
+
+ comment: Pacific University Philosophy Conference +
+
+
+
+
+ + ♻ ☆ 3D Diffusion Policy: Generalizable Visuomotor Policy Learning via Simple + 3D Representations + + +
+ Imitation learning provides an efficient way to teach robots dexterous skills; however, learning complex skills robustly and generalizably usually requires large amounts of human demonstrations. To tackle this challenging problem, we present 3D Diffusion Policy (DP3), a novel visual imitation learning approach that incorporates the power of 3D visual representations into diffusion policies, a class of conditional action generative models. The core design of DP3 is the utilization of a compact 3D visual representation, extracted from sparse point clouds with an efficient point encoder. In our experiments involving 72 simulation tasks, DP3 successfully handles most tasks with just 10 demonstrations and surpasses baselines with a 24.2% relative improvement. In 4 real robot tasks, DP3 demonstrates precise control with a high success rate of 85%, given only 40 demonstrations of each task, and shows excellent generalization abilities in diverse aspects, including space, viewpoint, appearance, and instance. Interestingly, in real robot experiments, DP3 rarely violates safety requirements, in contrast to baseline methods, which frequently do and necessitate human intervention. Our extensive evaluation highlights the critical importance of 3D representations in real-world robot learning. Videos, code, and data are available at https://3d-diffusion-policy.github.io . + +
+
+ comment: Videos, code, and data: https://3d-diffusion-policy.github.io +
+
+
+
+
+ + ♻ ☆ 360+x: A Panoptic Multi-modal Scene Understanding Dataset CVPR 2024 + + +
+ Human perception of the world is shaped by a multitude of viewpoints and +modalities. While many existing datasets focus on scene understanding from a +certain perspective (e.g. egocentric or third-person views), our dataset offers +a panoptic perspective (i.e. multiple viewpoints with multiple data +modalities). Specifically, we encapsulate third-person panoramic and front +views, as well as egocentric monocular/binocular views with rich modalities +including video, multi-channel audio, directional binaural delay, location data +and textual scene descriptions within each scene captured, presenting +comprehensive observation of the world. Figure 1 offers a glimpse of all 28 +scene categories of our 360+x dataset. To the best of our knowledge, this is +the first database that covers multiple viewpoints with multiple data +modalities to mimic how daily information is accessed in the real world. +Through our benchmark analysis, we presented 5 different scene understanding +tasks on the proposed 360+x dataset to evaluate the impact and benefit of each +data modality and perspective in panoptic scene understanding. We hope this +unique dataset could broaden the scope of comprehensive scene understanding and +encourage the community to approach these problems from more diverse +perspectives. + +
+
+ comment: CVPR 2024 (Oral Presentation), Project page: + https://x360dataset.github.io/ +
+
+
+
+
+ + ♻ ☆ A Benchmark Grocery Dataset of Realworld Point Clouds From Single View + + +
+ Fine-grained grocery object recognition is an important computer vision
+problem with broad applications in automatic checkout, in-store robotic
+navigation, and assistive technologies for the visually impaired. Existing
+datasets on groceries are mainly 2D images. Models trained on these datasets
+are limited to learning features from the regular 2D grids. While portable 3D
+sensors such as the Kinect have long been commonly available, sensors such as
+LiDAR and TrueDepth have only recently been integrated into mobile phones.
+Despite the availability of mobile 3D sensors, there are currently no dedicated
+real-world large-scale benchmark 3D datasets for groceries. In addition, existing
+3D datasets lack fine-grained grocery categories and have limited training
+samples. Furthermore, collecting data by going around the object, rather than
+with traditional photo capture, makes data collection cumbersome. Thus, we introduce
+a large-scale grocery dataset called 3DGrocery100. It comprises 100 classes,
+with a total of 87,898 3D point clouds created from 10,755 RGB-D single-view
+images. We benchmark our dataset on six recent state-of-the-art 3D point cloud
+classification models. We also benchmark the dataset on few-shot
+and continual learning point cloud classification tasks. Project Page:
+https://bigdatavision.org/3DGrocery100/.
+
+
+
+
+
+ + ♻ ☆ Linear Combination of Saved Checkpoints Makes Consistency and Diffusion + Models Better + + +
+ Diffusion Models (DM) and Consistency Models (CM) are two types of popular
+generative models with good generation quality on various tasks. When training
+DM and CM, intermediate weight checkpoints are not fully utilized and only the
+last converged checkpoint is used. In this work, we find that high-quality
+model weights often lie in a basin which cannot be reached by SGD but can be
+obtained by proper checkpoint averaging. Based on these observations, we
+propose LCSC, a simple but effective and efficient method to enhance the
+performance of DM and CM, by combining checkpoints along the training
+trajectory with coefficients deduced from evolutionary search. We demonstrate
+the value of LCSC through two use cases: $\textbf{(a) Reducing training cost.}$
+With LCSC, we only need to train DM/CM with fewer iterations and/or
+smaller batch sizes to obtain comparable sample quality with the fully trained
+model. For example, LCSC achieves considerable training speedups for CM
+(23$\times$ on CIFAR-10 and 15$\times$ on ImageNet-64). $\textbf{(b) Enhancing
+pre-trained models.}$ Assuming full training is already done, LCSC can further
+improve the generation quality or speed of the final converged models. For
+example, LCSC achieves better performance using a single function evaluation
+(NFE) than the base model with 2 NFEs on consistency distillation, and decreases
+the NFE of DM from 15 to 9 while maintaining the generation quality on
+CIFAR-10. Our code is available at
+https://github.com/imagination-research/LCSC.
+
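+
+ The checkpoint-combination step described above can be sketched as follows.
+This is a minimal Python sketch of a weighted average over saved state dicts;
+the evolutionary search that produces the coefficients is omitted, and the
+file names and coefficients shown are hypothetical, not the authors' code.
+
+import copy
+import torch
+
+def combine_checkpoints(state_dicts, coeffs):
+    """Linearly combine saved checkpoints with the given coefficients."""
+    combined = copy.deepcopy(state_dicts[0])
+    for key in combined:
+        # Weighted sum of the same parameter across all checkpoints.
+        combined[key] = sum(c * sd[key] for c, sd in zip(coeffs, state_dicts))
+    return combined
+
+# Hypothetical usage: combine the last few checkpoints of a training run.
+# ckpts = [torch.load(f"ckpt_{i}.pt") for i in (80, 90, 100)]
+# model.load_state_dict(combine_checkpoints(ckpts, coeffs=[0.2, 0.3, 0.5]))
+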
+
+
+
+
+ + ♻ ☆ S$^{5}$Mars: Semi-Supervised Learning for Mars Semantic Segmentation + + +
+ Deep learning has become a powerful tool for Mars exploration. Mars terrain
+semantic segmentation is an important Martian vision task, which is the basis of
+rover autonomous planning and safe driving. However, there is a lack of
+sufficient detailed and high-confidence data annotations, which are exactly
+required by most deep learning methods to obtain a good model. To address this
+problem, we propose our solution from the perspective of joint data and method
+design. We first present a new dataset, S5Mars, for Semi-SuperviSed learning on
+Mars Semantic Segmentation, which contains 6K high-resolution images and is
+sparsely annotated based on confidence, ensuring the high quality of labels.
+Then to learn from this sparse data, we propose a semi-supervised learning
+(SSL) framework for Mars image semantic segmentation, to learn representations
+from limited labeled data. Different from the existing SSL methods which are
+mostly targeted at the Earth image data, our method takes into account Mars
+data characteristics. Specifically, we first investigate the impact of current
+widely used natural image augmentations on Mars images. Based on the analysis,
+we then propose two novel and effective augmentations for SSL of Mars
+segmentation, AugIN and SAM-Mix, which serve as strong augmentations to boost
+the model performance. Meanwhile, to fully leverage the unlabeled data, we
+introduce a soft-to-hard consistency learning strategy, learning from different
+targets based on prediction confidence. Experimental results show that our
+method can outperform state-of-the-art SSL approaches remarkably. Our proposed
+dataset is available at https://jhang2020.github.io/S5Mars.github.io/.
+
+
+ comment: IEEE TGRS 2024 +
+
+
+
+
+ + ♻ ☆ OmniGS: Omnidirectional Gaussian Splatting for Fast Radiance Field + Reconstruction using Omnidirectional Images + + +
+ Photorealistic reconstruction relying on 3D Gaussian Splatting has shown +promising potential in robotics. However, the current 3D Gaussian Splatting +system only supports radiance field reconstruction using undistorted +perspective images. In this paper, we present OmniGS, a novel omnidirectional +Gaussian splatting system, to take advantage of omnidirectional images for fast +radiance field reconstruction. Specifically, we conduct a theoretical analysis +of spherical camera model derivatives in 3D Gaussian Splatting. According to +the derivatives, we then implement a new GPU-accelerated omnidirectional +rasterizer that directly splats 3D Gaussians onto the equirectangular screen +space for omnidirectional image rendering. As a result, we realize +differentiable optimization of the radiance field without the requirement of +cube-map rectification or tangent-plane approximation. Extensive experiments +conducted in egocentric and roaming scenarios demonstrate that our method +achieves state-of-the-art reconstruction quality and high rendering speed using +omnidirectional images. To benefit the research community, the code will be +made publicly available once the paper is published. + +
+
+ comment: 7 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Single Mesh Diffusion Models with Field Latents for Texture Generation CVPR 2024 + + +
+ We introduce a framework for intrinsic latent diffusion models operating
+directly on the surfaces of 3D shapes, with the goal of synthesizing
+high-quality textures. Our approach is underpinned by two contributions: field
+latents, a latent representation encoding textures as discrete vector fields on
+the mesh vertices, and field latent diffusion models, which learn to denoise a
+diffusion process in the learned latent space on the surface. We consider a
+single-textured-mesh paradigm, where our models are trained to generate
+variations of a given texture on a mesh. We show the synthesized textures are
+of superior fidelity compared to those from existing single-textured-mesh
+generative models. Our models can also be adapted for user-controlled editing
+tasks such as inpainting and label-guided generation. The efficacy of our
+approach is due in part to the equivariance of our proposed framework under
+isometries, allowing our models to seamlessly reproduce details across locally
+similar regions and opening the door to a notion of generative texture
+transfer.
+
+
+ comment: CVPR 2024. Code and additional visualizations available: + https://single-mesh-diffusion.github.io/ +
+
+
+
+
+ + ♻ ☆ GMISeg: General Medical Image Segmentation without Re-Training + + +
+ Although deep learning models have become the main method for medical image +segmentation, they often cannot be extended to unknown segmentation tasks +involving new anatomical structures, image shapes, or labels. For new +segmentation tasks, researchers often have to retrain or fine-tune the model, +which is time-consuming and poses a significant obstacle to clinical +researchers, who often lack the resources and professional knowledge to train +neural networks. Therefore, we proposed a general method that can solve unknown +medical image segmentation tasks without requiring additional training. Given +an example set of images and prompts for defining new segmentation tasks, +GMISeg applies a novel low-rank fine-tuning strategy based on the proposed +approach to the SAM (Segment Anything Model) image encoder, and works with the +prompt encoder and mask decoder to fine-tune the labeled dataset without the +need for additional training. To achieve generalization of new tasks, we used +medical image datasets with different imaging modes for different parts. We +trained and generalized GMISeg on a different set of anatomical and imaging +modes using cardiac images on other site datasets. We have demonstrated that +GMISeg outperforms the latest methods on unknown tasks and have conducted a +comprehensive analysis and summary of the important performance of the proposed +method. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Robotics 21 + +
+
+
+ + ☆ Legibot: Generating Legible Motions for Service Robots Using Cost-Based + Local Planners + + +
+ With the increasing presence of social robots in various environments and +applications, there is an increasing need for these robots to exhibit +socially-compliant behaviors. Legible motion, characterized by the ability of a +robot to clearly and quickly convey intentions and goals to the individuals in +its vicinity, through its motion, holds significant importance in this context. +This will improve the overall user experience and acceptance of robots in human +environments. In this paper, we introduce a novel approach to incorporate +legibility into local motion planning for mobile robots. This can enable robots +to generate legible motions in real-time and dynamic environments. To +demonstrate the effectiveness of our proposed methodology, we also provide a +robotic stack designed for deploying legibility-aware motion planning in a +social robot, by integrating perception and localization components. + +
+
+
+
+
+ + ☆ PCBot: a Minimalist Robot Designed for Swarm Applications IROS 2022 + + +
+ Complexity, cost, and power requirements for the actuation of individual +robots can play a large factor in limiting the size of robotic swarms. Here we +present PCBot, a minimalist robot that can precisely move on an orbital shake +table using a bi-stable solenoid actuator built directly into its PCB. This +allows the actuator to be built as part of the automated PCB manufacturing +process, greatly reducing the impact it has on manual assembly. Thanks to this +novel actuator design, PCBot has merely five major components and can be +assembled in under 20 seconds, potentially enabling them to be easily +mass-manufactured. Here we present the electro-magnetic and mechanical design +of PCBot. Additionally, a prototype robot is used to demonstrate its ability to +move in a straight line as well as follow given paths. + +
+
+ comment: Accepted by IROS 2022, best paper and best mechanism design paper + finalist +
+
+
+
+
+ + ☆ On the Uniqueness of Solution for the Bellman Equation of LTL Objectives + + +
+ Surrogate rewards for linear temporal logic (LTL) objectives are commonly
+utilized in planning problems for LTL objectives. In a widely-adopted surrogate
+reward approach, two discount factors are used to ensure that the expected
+return approximates the satisfaction probability of the LTL objective. The
+expected return can then be estimated by methods based on Bellman updates, such
+as reinforcement learning. However, the uniqueness of the solution to the
+Bellman equation with two discount factors has not been explicitly discussed.
+We demonstrate with an example that when one of the discount factors is set to
+one, as allowed in many previous works, the Bellman equation may have multiple
+solutions, leading to inaccurate evaluation of the expected return. We then
+propose a condition for the Bellman equation to have the expected return as the
+unique solution, requiring the solutions for states inside a rejecting bottom
+strongly connected component (BSCC) to be 0. We prove this condition is
+sufficient by showing that the solutions for the states with discounting can be
+separated from those for the states without discounting under this condition.
+
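+
+ For orientation, one common two-discount surrogate from the literature can be
+written as below; this is only an illustration of where the second discount
+factor enters the Bellman equation, and the exact construction analyzed in the
+paper may differ in details. The uniqueness issue arises when $\gamma$ is set
+to one for the non-accepting states.
+
+% Accepting states of the product MDP receive reward 1 - \gamma_B and are
+% discounted by \gamma_B; all other states receive reward 0 and discount \gamma.
+V(s) = R(s) + \Gamma(s) \sum_{s'} P(s' \mid s, \pi(s))\, V(s'), \qquad
+R(s) = \begin{cases} 1-\gamma_B & s \in \mathrm{Acc} \\ 0 & \text{otherwise,} \end{cases}
+\quad
+\Gamma(s) = \begin{cases} \gamma_B & s \in \mathrm{Acc} \\ \gamma & \text{otherwise.} \end{cases}
+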
+
+ comment: Accepted for the 2024 Learning for Dynamics and Control Conference + (L4DC) +
+
+
+
+
+ + ☆ Adaptive Anchor Pairs Selection in a TDOA-based System Through Robot + Localization Error Minimization + + +
+ The following paper presents an adaptive anchor pairs selection method for +ultra-wideband (UWB) Time Difference of Arrival (TDOA) based positioning +systems. The method divides the area covered by the system into several zones +and assigns them anchor pair sets. The pair sets are determined during +calibration based on localization root mean square error (RMSE). The +calibration assumes driving a mobile platform equipped with a LiDAR sensor and +a UWB tag through the specified zones. The robot is localized separately based +on a large set of different TDOA pairs and using a LiDAR, which acts as the +reference. For each zone, the TDOA pairs set for which the registered RMSE is +lowest is selected and used for localization in the routine system work. The +proposed method has been tested with simulations and experiments. The results +for both simulated static and experimental dynamic scenarios have proven that +the adaptive selection of the anchor nodes leads to an increase in localization +accuracy. In the experiment, the median trajectory error for a moving person +localization was at a level of 25 cm. + +
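+
+ The per-zone selection rule described above can be sketched in a few lines of
+Python: for each zone, compute the RMSE of every candidate anchor-pair set
+against the LiDAR reference and keep the one with the lowest error. Names and
+array shapes are illustrative assumptions, not the paper's implementation.
+
+import numpy as np
+
+def select_anchor_pairs(zones, tdoa_estimates, lidar_reference):
+    """Pick, for each zone, the anchor-pair set with the lowest calibration RMSE.
+
+    zones:           dict zone_id -> indices of calibration samples in that zone
+    tdoa_estimates:  dict pair_set_id -> (N, 2) positions estimated from that pair set
+    lidar_reference: (N, 2) reference positions from the LiDAR-based localization
+    """
+    selection = {}
+    for zone_id, idx in zones.items():
+        rmse = {
+            pair_set: np.sqrt(np.mean(np.sum(
+                (est[idx] - lidar_reference[idx]) ** 2, axis=1)))
+            for pair_set, est in tdoa_estimates.items()
+        }
+        selection[zone_id] = min(rmse, key=rmse.get)  # lowest-RMSE pair set
+    return selection
+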
+
+ comment: Originally presented at: 2021 Signal Processing Symposium (SPSympo), + LODZ, Poland, 2021 +
+
+
+
+
+ + ☆ StaccaToe: A Single-Leg Robot that Mimics the Human Leg and Toe IROS 2024 + + +
+ We introduce StaccaToe, a human-scale, electric motor-powered single-leg +robot designed to rival the agility of human locomotion through two distinctive +attributes: an actuated toe and a co-actuation configuration inspired by the +human leg. Leveraging the foundational design of HyperLeg's lower leg +mechanism, we develop a stand-alone robot by incorporating new link designs, +custom-designed power electronics, and a refined control system. Unlike +previous jumping robots that rely on either special mechanisms (e.g., springs +and clutches) or hydraulic/pneumatic actuators, StaccaToe employs electric +motors without energy storage mechanisms. This choice underscores our ultimate +goal of developing a practical, high-performance humanoid robot capable of +human-like, stable walking as well as explosive dynamic movements. In this +paper, we aim to empirically evaluate the balance capability and the exertion +of explosive ground reaction forces of our toe and co-actuation mechanisms. +Throughout extensive hardware and controller development, StaccaToe showcases +its control fidelity by demonstrating a balanced tip-toe stance and dynamic +jump. This study is significant for three key reasons: 1) StaccaToe represents +the first human-scale, electric motor-driven single-leg robot to execute +dynamic maneuvers without relying on specialized mechanisms; 2) our research +provides empirical evidence of the benefits of replicating critical human leg +attributes in robotic design; and 3) we explain the design process for creating +agile legged robots, the details that have been scantily covered in academic +literature. + +
+
+ comment: Submitted to 2024 IEEE/RSJ International Conference on Intelligent + Robots and Systems (IROS 2024) +
+
+
+
+
+ + ☆ PathFinder: Attention-Driven Dynamic Non-Line-of-Sight Tracking with a + Mobile Robot + + +
+ The study of non-line-of-sight (NLOS) imaging is growing due to its many +potential applications, including rescue operations and pedestrian detection by +self-driving cars. However, implementing NLOS imaging on a moving camera +remains an open area of research. Existing NLOS imaging methods rely on +time-resolved detectors and laser configurations that require precise optical +alignment, making it difficult to deploy them in dynamic environments. This +work proposes a data-driven approach to NLOS imaging, PathFinder, that can be +used with a standard RGB camera mounted on a small, power-constrained mobile +robot, such as an aerial drone. Our experimental pipeline is designed to +accurately estimate the 2D trajectory of a person who moves in a +Manhattan-world environment while remaining hidden from the camera's +field-of-view. We introduce a novel approach to process a sequence of dynamic +successive frames in a line-of-sight (LOS) video using an attention-based +neural network that performs inference in real-time. The method also includes a +preprocessing selection metric that analyzes images from a moving camera which +contain multiple vertical planar surfaces, such as walls and building facades, +and extracts planes that return maximum NLOS information. We validate the +approach on in-the-wild scenes using a drone for video capture, thus +demonstrating low-cost NLOS imaging in dynamic capture environments. + +
+
+ comment: First two authors have equal contribution +
+
+
+
+
+ + ☆ Scalable and Efficient Hierarchical Visual Topological Mapping + + +
+ Hierarchical topological representations can significantly reduce search +times within mapping and localization algorithms. Although recent research has +shown the potential for such approaches, limited consideration has been given +to the suitability and comparative performance of different global feature +representations within this context. In this work, we evaluate state-of-the-art +hand-crafted and learned global descriptors using a hierarchical topological +mapping technique on benchmark datasets and present results of a comprehensive +evaluation of the impact of the global descriptor used. Although learned +descriptors have been incorporated into place recognition methods to improve +retrieval accuracy and enhance overall recall, the problem of scalability and +efficiency when applied to longer trajectories has not been adequately +addressed in a majority of research studies. Based on our empirical analysis of +multiple runs, we identify that continuity and distinctiveness are crucial +characteristics for an optimal global descriptor that enable efficient and +scalable hierarchical mapping, and present a methodology for quantifying and +contrasting these characteristics across different global descriptors. Our +study demonstrates that the use of global descriptors based on an unsupervised +learned Variational Autoencoder (VAE) excels in these characteristics and +achieves significantly lower runtime. It runs on a consumer grade desktop, up +to 2.3x faster than the second best global descriptor, NetVLAD, and up to 9.5x +faster than the hand-crafted descriptor, PHOG, on the longest track evaluated +(St Lucia, 17.6 km), without sacrificing overall recall performance. + +
+
+ comment: Published in the 21st International Conference on Advanced Robotics + (ICAR 2023) +
+
+
+
+
+ + ☆ RoboMP$^2$: A Robotic Multimodal Perception-Planning Framework with + Multimodal Large Language Models + + +
+ Multimodal Large Language Models (MLLMs) have shown impressive reasoning
+abilities and general intelligence in various domains. This inspires researchers
+to train end-to-end MLLMs or utilize large models to generate policies with
+human-selected prompts for embodied agents. However, these methods exhibit
+limited generalization capabilities on unseen tasks or scenarios, and overlook
+the multimodal environment information which is critical for robots to make
+decisions. In this paper, we introduce a novel Robotic Multimodal
+Perception-Planning (RoboMP$^2$) framework for robotic manipulation which
+consists of a Goal-Conditioned Multimodal Preceptor (GCMP) and a
+Retrieval-Augmented Multimodal Planner (RAMP). Specifically, GCMP captures
+environment states by employing a tailored MLLM for embodied agents with the
+abilities of semantic reasoning and localization. RAMP utilizes a coarse-to-fine
+retrieval method to find the $k$ most relevant policies as in-context
+demonstrations to enhance the planner. Extensive experiments demonstrate the
+superiority of RoboMP$^2$ on both the VIMA benchmark and real-world tasks, with
+around 10% improvement over the baselines.
+
+
+ comment: Project page: https://aopolin-lv.github.io/RoboMP2.github.io/ +
+
+
+
+
+ + ☆ Multi-Type Map Construction via Semantics-Aware Autonomous Exploration + in Unknown Indoor Environments + + +
+ This paper proposes a novel semantics-aware autonomous exploration model to
+handle a long-standing issue: mainstream RRT (Rapidly-exploring Random
+Tree) based exploration models usually make the mobile robot switch frequently
+between different regions, leading to excessively repeated explorations of
+the same region. Our proposed semantics-aware model encourages a mobile robot
+to fully explore the current region before moving to the next region, which
+avoids excessively repeated explorations and makes the exploration
+faster. The core idea of the semantics-aware autonomous exploration model is to
+optimize the sampling point selection mechanism and the frontier point evaluation
+function by considering the semantic information of regions. In addition,
+compared with existing autonomous exploration methods that usually construct
+a single type or 2-3 types of maps, our model can construct four kinds
+of maps including a point cloud map, an occupancy grid map, a topological map, and
+a semantic map. To test the performance of our model, we conducted experiments in
+three simulated environments. The experiment results demonstrate that compared
+to Improved RRT, our model achieved a 33.0% exploration time reduction and a 39.3%
+exploration trajectory length reduction while maintaining a >98% exploration rate.
+
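+
+ A minimal sketch of a semantics-aware frontier evaluation of the kind
+described above is given below: frontiers inside the robot's current semantic
+region receive a bonus so that the region is finished before the robot moves
+on. The linear form and the weights are illustrative assumptions, not the
+paper's actual evaluation function.
+
+import math
+
+def evaluate_frontier(frontier, robot_pose, info_gain, current_region,
+                      region_of, w_gain=1.0, w_dist=0.5, w_region=2.0):
+    """Score a frontier point as a candidate for the next exploration goal."""
+    dist = math.dist(frontier, robot_pose[:2])                    # travel cost proxy
+    same_region = 1.0 if region_of(frontier) == current_region else 0.0
+    return w_gain * info_gain - w_dist * dist + w_region * same_region
+
+# The frontier with the highest score becomes the next goal, e.g.:
+# best = max(frontiers, key=lambda f: evaluate_frontier(f, pose, gain[f],
+#                                                       region, region_of))
+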
+
+
+
+
+ + ☆ Prompting Multi-Modal Tokens to Enhance End-to-End Autonomous Driving + Imitation Learning with LLMs + + +
+ The utilization of Large Language Models (LLMs) within the realm of
+reinforcement learning, particularly as planners, has garnered a significant
+degree of attention in recent scholarly literature. However, a substantial
+proportion of existing research predominantly focuses on planning models for
+robotics that transmute the outputs derived from perception models into
+linguistic forms, thus adopting a `pure-language' strategy. In this research,
+we propose a hybrid End-to-End learning framework for autonomous driving by
+combining basic driving imitation learning with LLMs based on multi-modality
+prompt tokens. Instead of simply converting perception results from a
+separately trained model into pure language input, our novelty lies in two
+aspects. 1) The end-to-end integration of visual and LiDAR sensory input into
+learnable multi-modality tokens, thereby intrinsically alleviating the description
+bias introduced by separately pre-trained perception models. 2) Instead of directly
+letting LLMs drive, this paper explores a hybrid setting of letting LLMs help the
+driving model correct mistakes and handle complicated scenarios. The results of our
+experiments suggest that the proposed methodology can attain driving scores of
+49.21%, coupled with an impressive route completion rate of 91.34% in the
+offline evaluation conducted via CARLA. These performance metrics are
+comparable to the most advanced driving models.
+
+
+
+
+
+ + ☆ Learning Adaptive Multi-Objective Robot Navigation with Demonstrations + + +
+ Preference-aligned robot navigation in human environments is typically +achieved through learning-based approaches, utilizing demonstrations and user +feedback for personalization. However, personal preferences are subject to +change and might even be context-dependent. Yet traditional reinforcement +learning (RL) approaches with a static reward function often fall short in +adapting to these varying user preferences. This paper introduces a framework +that combines multi-objective reinforcement learning (MORL) with +demonstration-based learning. Our approach allows for dynamic adaptation to +changing user preferences without retraining. Through rigorous evaluations, +including sim-to-real and robot-to-robot transfers, we demonstrate our +framework's capability to reflect user preferences accurately while achieving +high navigational performance in terms of collision avoidance and goal +pursuance. + +
+
+
+
+
+ + ☆ EnQuery: Ensemble Policies for Diverse Query-Generation in Preference + Alignment of Robot Navigation + + +
+ To align mobile robot navigation policies with user preferences through +reinforcement learning from human feedback (RLHF), reliable and +behavior-diverse user queries are required. However, deterministic policies +fail to generate a variety of navigation trajectory suggestions for a given +navigation task configuration. We introduce EnQuery, a query generation +approach using an ensemble of policies that achieve behavioral diversity +through a regularization term. For a given navigation task, EnQuery produces +multiple navigation trajectory suggestions, thereby optimizing the efficiency +of preference data collection with fewer queries. Our methodology demonstrates +superior performance in aligning navigation policies with user preferences in +low-query regimes, offering enhanced policy convergence from sparse preference +queries. The evaluation is complemented with a novel explainability +representation, capturing full scene navigation behavior of the mobile robot in +a single plot. + +
+
+
+
+
+ + ☆ Efficient Reinforcement Learning of Task Planners for Robotic + Palletization through Iterative Action Masking Learning + + +
+ The development of robotic systems for palletization in logistics scenarios
+is of paramount importance, addressing critical efficiency and precision
+demands in supply chain management. This paper investigates the application of
+Reinforcement Learning (RL) in enhancing task planning for such robotic
+systems. Confronted with the substantial challenge of a vast action space,
+which is a significant impediment to efficiently applying off-the-shelf RL
+methods, our study introduces a novel method of utilizing supervised learning
+to iteratively prune and manage the action space effectively. By reducing the
+complexity of the action space, our approach not only accelerates the learning
+phase but also ensures the effectiveness and reliability of the task planning
+in robotic palletization. The experimental results underscore the efficacy of
+this method, highlighting its potential in improving the performance of RL
+applications in complex and high-dimensional environments like logistics
+palletization.
+
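+
+ The action-pruning idea described above can be illustrated with a short
+Python/PyTorch sketch: a learned feasibility mask zeroes out infeasible
+placements before the policy samples an action. The mask is simply assumed to
+be given here; in the paper it is produced and refined iteratively by a
+supervised model, which this sketch does not reproduce.
+
+import torch
+
+def masked_action_distribution(logits, action_mask):
+    """Restrict sampling to actions marked feasible by the mask."""
+    masked_logits = logits.masked_fill(~action_mask, float("-inf"))
+    return torch.distributions.Categorical(logits=masked_logits)
+
+# Hypothetical usage inside an RL rollout:
+# dist = masked_action_distribution(policy(obs), mask_net(obs) > 0.5)
+# action = dist.sample()   # only feasible placements can be drawn
+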
+
+ comment: 8 pages, 8 figures +
+
+
+
+
+ + ☆ Skill Transfer and Discovery for Sim-to-Real Learning: A + Representation-Based Viewpoint + + +
+ We study sim-to-real skill transfer and discovery in the context of robotics +control using representation learning. We draw inspiration from spectral +decomposition of Markov decision processes. The spectral decomposition brings +about representation that can linearly represent the state-action value +function induced by any policies, thus can be regarded as skills. The skill +representations are transferable across arbitrary tasks with the same +transition dynamics. Moreover, to handle the sim-to-real gap in the dynamics, +we propose a skill discovery algorithm that learns new skills caused by the +sim-to-real gap from real-world data. We promote the discovery of new skills by +enforcing orthogonal constraints between the skills to learn and the skills +from simulators, and then synthesize the policy using the enlarged skill sets. +We demonstrate our methodology by transferring quadrotor controllers from +simulators to Crazyflie 2.1 quadrotors. We show that we can learn the skill +representations from a single simulator task and transfer these to multiple +different real-world tasks including hovering, taking off, landing and +trajectory tracking. Our skill discovery approach helps narrow the sim-to-real +gap and improve the real-world controller performance by up to 30.2%. + +
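+
+ The orthogonality constraint mentioned above, between newly discovered skills
+and the skills learned in simulation, can be encouraged with a simple penalty
+term such as the one sketched below. The squared-inner-product form and the
+variable names are illustrative choices, not the paper's exact loss.
+
+import torch
+
+def orthogonality_penalty(new_skills, sim_skills):
+    """Encourage new skill vectors to be orthogonal to simulator skill vectors.
+
+    new_skills: (k_new, d) learnable skill/feature vectors for the real system
+    sim_skills: (k_sim, d) fixed skill vectors learned in simulation
+    """
+    cross = new_skills @ sim_skills.detach().T   # (k_new, k_sim) inner products
+    return (cross ** 2).mean()
+
+# total_loss = representation_loss + lambda_orth * orthogonality_penalty(new, sim)
+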
+
+ comment: 9 pages, 6 figures. Project page: + https://congharvard.github.io/steady-sim-to-real/ +
+
+
+
+
+ + ♻ ☆ Wearable Roller Rings to Enable Robot Dexterous In-Hand Manipulation + through Active Surfaces + + +
+ In-hand manipulation is a crucial ability for reorienting and repositioning +objects within grasps. The main challenges are not only the complexity in the +computational models, but also the risks of grasp instability caused by active +finger motions, such as rolling, sliding, breaking, and remaking contacts. +Based on the idea of manipulation without lifting a finger, this paper presents +the development of Roller Rings (RR), a modular robotic attachment with active +surfaces that is wearable by both robot and human hands. By installing and +angling the RRs on grasping systems, such that their spatial motions are not +co-linear, we derive a general differential motion model for the object +actuated by the active surfaces. Our motion model shows that complete in-hand +manipulation skill sets can be provided by as few as only 2 RRs through +non-holonomic object motions, while more RRs can enable enhanced manipulation +dexterity with fewer motion constraints. Through extensive experiments, we wear +RRs on both a robot hand and a human hand to evaluate their manipulation +capabilities, and show that the RRs can be employed to manipulate arbitrary +object shapes to provide dexterous in-hand manipulation. + +
+
+
+
+
+ + ♻ ☆ Federated reinforcement learning for robot motion planning with + zero-shot generalization + + +
+ This paper considers the problem of learning a control policy for robot
+motion planning with zero-shot generalization, i.e., no data collection or
+policy adaptation is needed when the learned policy is deployed in new
+environments. We develop a federated reinforcement learning framework that
+enables collaborative learning of multiple learners and a central server, i.e.,
+the Cloud, without sharing their raw data. In each iteration, each learner
+uploads its local control policy and the corresponding estimated normalized
+arrival time to the Cloud, which then computes the global optimum among the
+learners and broadcasts the optimal policy to the learners. Each learner then
+selects between its local control policy and that from the Cloud for the next
+iteration. The proposed framework leverages the derived zero-shot
+generalization guarantees on arrival time and safety. Theoretical guarantees on
+almost-sure convergence, almost consensus, Pareto improvement and optimality
+gap are also provided. Monte Carlo simulation is conducted to evaluate the
+proposed framework.
+
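+
+ The high-level server/learner loop described above is sketched below in plain
+Python. The selection criterion (smallest estimated normalized arrival time)
+and the learner's keep-or-adopt rule are simplifications of the paper's
+framework, which additionally handles ties, safety guarantees, and consensus;
+all names here are illustrative.
+
+def server_round(uploads):
+    """One aggregation round on the Cloud.
+
+    uploads: list of (policy, estimated_normalized_arrival_time) pairs sent by
+    the learners. The policy with the smallest estimated arrival time is
+    broadcast back to all learners.
+    """
+    best_policy, best_time = min(uploads, key=lambda u: u[1])
+    return best_policy, best_time
+
+def learner_update(local_policy, local_time, global_policy, global_time):
+    # Each learner keeps its own policy unless the broadcast one is better.
+    if global_time < local_time:
+        return global_policy, global_time
+    return local_policy, local_time
+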
+
+
+
+
+ + ♻ ☆ Multi-fingered Dynamic Grasping for Unknown Objects + + +
+ Dexterous grasping of unseen objects in dynamic environments is an essential
+prerequisite for the advanced manipulation of autonomous robots. Prior advances
+rely on several assumptions that simplify the setup, including environment
+stationarity, pre-defined objects, and low-dimensional end-effectors. Though
+these assumptions ease the problem and enable progress, they understate the
+complexity of the real world. Aiming to relax these assumptions, we present a
+dynamic grasping framework for unknown objects in this work, which uses a
+five-fingered hand with visual servo control and can compensate for external
+disturbances. To establish such a system on real hardware, we leverage the recent
+advances in real-time dexterous generative grasp synthesis and introduce several
+techniques to secure the robustness and performance of the overall system. Our
+experiments on real hardware verify the ability of the proposed system to reliably
+grasp unknown dynamic objects in two realistic scenarios: objects on a conveyor
+belt and human-robot handover. To our knowledge, no prior work has achieved
+dynamic multi-fingered grasping of unknown objects like ours at the time of
+writing this paper. We hope our pioneering work in this direction can
+provide inspiration to the community and pave the way for further algorithmic
+and engineering advances on this challenging task. A video of the experiments
+is available at https://youtu.be/b87zGNoKELg.
+
+
+
+
+
+ + ♻ ☆ 3QFP: Efficient neural implicit surface reconstruction using + Tri-Quadtrees and Fourier feature Positional encoding ICRA2024 + + +
+ Neural implicit surface representations are currently receiving a lot of
+interest as a means to achieve high-fidelity surface reconstruction at a low
+memory cost, compared to traditional explicit representations. However,
+state-of-the-art methods still struggle with excessive memory usage and
+non-smooth surfaces. This is particularly problematic in large-scale
+applications with sparse inputs, as is common in robotics use cases. To address
+these issues, we first introduce a sparse structure, \emph{tri-quadtrees},
+which represents the environment using learnable features stored in three
+planar quadtree projections. Secondly, we concatenate the learnable features
+with a Fourier feature positional encoding. The combined features are then
+decoded into signed distance values through a small multi-layer perceptron. We
+demonstrate that this approach facilitates smoother reconstruction with a
+higher completion ratio and fewer holes. Compared to two recent baselines, one
+implicit and one explicit, our approach requires only 10\%--50\% as much
+memory, while achieving competitive quality.
+
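+
+ The decoding path described above (learnable features concatenated with a
+Fourier positional encoding and mapped to a signed distance by a small MLP)
+can be sketched as follows. Feature dimensions, frequency count, and layer
+widths are illustrative assumptions, and the tri-quadtree feature lookup
+itself is not reproduced here.
+
+import torch
+import torch.nn as nn
+
+def fourier_encode(xyz, num_freqs=6):
+    # Standard Fourier feature positional encoding of 3D coordinates.
+    freqs = 2.0 ** torch.arange(num_freqs) * torch.pi   # (num_freqs,)
+    angles = xyz.unsqueeze(-1) * freqs                  # (N, 3, num_freqs)
+    return torch.cat([torch.sin(angles), torch.cos(angles)], dim=-1).flatten(1)
+
+class SDFDecoder(nn.Module):
+    # Small MLP mapping [interpolated planar features, Fourier encoding] to a
+    # signed distance value.
+    def __init__(self, feat_dim=24, num_freqs=6):
+        super().__init__()
+        in_dim = feat_dim + 3 * 2 * num_freqs
+        self.net = nn.Sequential(nn.Linear(in_dim, 64), nn.ReLU(),
+                                 nn.Linear(64, 64), nn.ReLU(),
+                                 nn.Linear(64, 1))
+
+    def forward(self, planar_feats, xyz):
+        return self.net(torch.cat([planar_feats, fourier_encode(xyz)], dim=-1))
+
+# sdf = SDFDecoder()(feats, points)   # feats: (N, 24) gathered from the quadtrees
+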
+
+ comment: ICRA2024 +
+
+
+
+
+ + ♻ ☆ D-VAT: End-to-End Visual Active Tracking for Micro Aerial Vehicles + + +
+ Visual active tracking is a growing research topic in robotics due to its key +role in applications such as human assistance, disaster recovery, and +surveillance. In contrast to passive tracking, active tracking approaches +combine vision and control capabilities to detect and actively track the +target. Most of the work in this area focuses on ground robots, while the very +few contributions on aerial platforms still pose important design constraints +that limit their applicability. To overcome these limitations, in this paper we +propose D-VAT, a novel end-to-end visual active tracking methodology based on +deep reinforcement learning that is tailored to micro aerial vehicle platforms. +The D-VAT agent computes the vehicle thrust and angular velocity commands +needed to track the target by directly processing monocular camera +measurements. We show that the proposed approach allows for precise and +collision-free tracking operations, outperforming different state-of-the-art +baselines on simulated environments which differ significantly from those +encountered during training. Moreover, we demonstrate a smooth real-world +transition to a quadrotor platform with mixed-reality. + +
+
+
+
+
+ + ♻ ☆ Adaptive Force-Based Control of Dynamic Legged Locomotion over Uneven + Terrain + + +
+ Agile-legged robots have proven to be highly effective in navigating and +performing tasks in complex and challenging environments, including disaster +zones and industrial settings. However, these applications normally require the +capability of carrying heavy loads while maintaining dynamic motion. Therefore, +this paper presents a novel methodology for incorporating adaptive control into +a force-based control system. Recent advancements in the control of quadruped +robots show that force control can effectively realize dynamic locomotion over +rough terrain. By integrating adaptive control into the force-based controller, +our proposed approach can maintain the advantages of the baseline framework +while adapting to significant model uncertainties and unknown terrain impact +models. Experimental validation was successfully conducted on the Unitree A1 +robot. With our approach, the robot can carry heavy loads (up to 50% of its +weight) while performing dynamic gaits such as fast trotting and bounding +across uneven terrains. + +
+
+ comment: This work has been published in IEEE Transaction on Robotics (T-RO) +
+
+
+
+
+ + ♻ ☆ Metarobotics for Industry and Society: Vision, Technologies, and + Opportunities + + +
+ Metarobotics aims to combine next generation wireless communication, +multi-sense immersion, and collective intelligence to provide a pervasive, +itinerant, and non-invasive access and interaction with distant robotized +applications. Industry and society are expected to benefit from these +functionalities. For instance, robot programmers will no longer travel +worldwide to plan and test robot motions, even collaboratively. Instead, they +will have a personalized access to robots and their environments from anywhere, +thus spending more time with family and friends. Students enrolled in robotics +courses will be taught under authentic industrial conditions in real-time. This +paper describes objectives of Metarobotics in society, industry, and +in-between. It identifies and surveys technologies likely to enable their +completion and provides an architecture to put forward the interplay of key +components of Metarobotics. Potentials for self-determination, self-efficacy, +and work-life-flexibility in robotics-related applications in Society 5.0, +Industry 4.0, and Industry 5.0 are outlined. + +
+
+ comment: Published on IEEE Transactions on Industrial Informatics, Volume 20, + Issue 4, April 2024 +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 5 + +
+
+
+ + ☆ Reconstructing Retinal Visual Images from 3T fMRI Data Enhanced by + Unsupervised Learning + + +
+ The reconstruction of human visual inputs from brain activity, particularly +through functional Magnetic Resonance Imaging (fMRI), holds promising avenues +for unraveling the mechanisms of the human visual system. Despite the +significant strides made by deep learning methods in improving the quality and +interpretability of visual reconstruction, there remains a substantial demand +for high-quality, long-duration, subject-specific 7-Tesla fMRI experiments. The +challenge arises in integrating diverse smaller 3-Tesla datasets or +accommodating new subjects with brief and low-quality fMRI scans. In response +to these constraints, we propose a novel framework that generates enhanced 3T +fMRI data through an unsupervised Generative Adversarial Network (GAN), +leveraging unpaired training across two distinct fMRI datasets in 7T and 3T, +respectively. This approach aims to overcome the limitations of the scarcity of +high-quality 7-Tesla data and the challenges associated with brief and +low-quality scans in 3-Tesla experiments. In this paper, we demonstrate the +reconstruction capabilities of the enhanced 3T fMRI data, highlighting its +proficiency in generating superior input visual images compared to +data-intensive methods trained and tested on a single subject. + +
+
+ comment: Accepted by ISBI 2024 +
+
+
+
+
+ + ☆ VMambaMorph: a Visual Mamba-based Framework with Cross-Scan Module for + Deformable 3D Image Registration + + +
+ Image registration, a critical process in medical imaging, involves aligning +different sets of medical imaging data into a single unified coordinate system. +Deep learning networks, such as the Convolutional Neural Network (CNN)-based +VoxelMorph, Vision Transformer (ViT)-based TransMorph, and State Space Model +(SSM)-based MambaMorph, have demonstrated effective performance in this domain. +The recent Visual State Space Model (VMamba), which incorporates a cross-scan +module with SSM, has exhibited promising improvements in modeling global-range +dependencies with efficient computational cost in computer vision tasks. This +paper hereby introduces an exploration of VMamba with image registration, named +VMambaMorph. This novel hybrid VMamba-CNN network is designed specifically for +3D image registration. Utilizing a U-shaped network architecture, VMambaMorph +computes the deformation field based on target and source volumes. The +VMamba-based block with 2D cross-scan module is redesigned for 3D volumetric +feature processing, and a fine-grained feature extraction module is proposed +for high-dimensional feature learning. We validate VMambaMorph using a public +benchmark brain MR-CT registration dataset, comparing its performance against +current state-of-the-art methods. The results indicate that VMambaMorph +achieves competitive registration quality. The code for VMambaMorph is +available on GitHub. + +
+
+
+
+
+ + ☆ LHU-Net: A Light Hybrid U-Net for Cost-Efficient, High-Performance + Volumetric Medical Image Segmentation + + +
+ As a result of the rise of Transformer architectures in medical image +analysis, specifically in the domain of medical image segmentation, a multitude +of hybrid models have been created that merge the advantages of Convolutional +Neural Networks (CNNs) and Transformers. These hybrid models have achieved +notable success by significantly improving segmentation accuracy. Yet, this +progress often comes at the cost of increased model complexity, both in terms +of parameters and computational demand. Moreover, many of these models fail to +consider the crucial interplay between spatial and channel features, which +could further refine and improve segmentation outcomes. To address this, we +introduce LHU-Net, a Light Hybrid U-Net architecture optimized for volumetric +medical image segmentation. LHU-Net is meticulously designed to prioritize +spatial feature analysis in its initial layers before shifting focus to +channel-based features in its deeper layers, ensuring a comprehensive feature +extraction process. Rigorous evaluation across five benchmark datasets - +Synapse, LA, Pancreas, ACDC, and BRaTS 2018 - underscores LHU-Net's superior +performance, showcasing its dual capacity for efficiency and accuracy. Notably, +LHU-Net sets new performance benchmarks, such as attaining a Dice score of +92.66 on the ACDC dataset, while simultaneously reducing parameters by 85% and +quartering the computational load compared to existing state-of-the-art models. +Achieved without any reliance on pre-training, additional data, or model +ensemble, LHU-Net's effectiveness is further evidenced by its state-of-the-art +performance across all evaluated datasets, utilizing fewer than 11 million +parameters. This achievement highlights that balancing computational efficiency +with high accuracy in medical image segmentation is feasible. Our +implementation of LHU-Net is freely accessible to the research community on +GitHub. + +
+
+
+
+
+ + ♻ ☆ PIGEON: Predicting Image Geolocations + + +
+ Planet-scale image geolocalization remains a challenging problem due to the +diversity of images originating from anywhere in the world. Although approaches +based on vision transformers have made significant progress in geolocalization +accuracy, success in prior literature is constrained to narrow distributions of +images of landmarks, and performance has not generalized to unseen places. We +present a new geolocalization system that combines semantic geocell creation, +multi-task contrastive pretraining, and a novel loss function. Additionally, +our work is the first to perform retrieval over location clusters for guess +refinements. We train two models for evaluations on street-level data and +general-purpose image geolocalization; the first model, PIGEON, is trained on +data from the game of Geoguessr and is capable of placing over 40% of its +guesses within 25 kilometers of the target location globally. We also develop a +bot and deploy PIGEON in a blind experiment against humans, ranking in the top +0.01% of players. We further challenge one of the world's foremost professional +Geoguessr players to a series of six matches with millions of viewers, winning +all six games. Our second model, PIGEOTTO, differs in that it is trained on a +dataset of images from Flickr and Wikipedia, achieving state-of-the-art results +on a wide range of image geolocalization benchmarks, outperforming the previous +SOTA by up to 7.7 percentage points on the city accuracy level and up to 38.8 +percentage points on the country level. Our findings suggest that PIGEOTTO is +the first image geolocalization model that effectively generalizes to unseen +places and that our approach can pave the way for highly accurate, planet-scale +image geolocalization systems. Our code is available on GitHub. + +
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ MMSFormer: Multimodal Transformer for Material and Semantic Segmentation + + +
+ Leveraging information across diverse modalities is known to enhance +performance on multimodal segmentation tasks. However, effectively fusing +information from different modalities remains challenging due to the unique +characteristics of each modality. In this paper, we propose a novel fusion +strategy that can effectively fuse information from different modality +combinations. We also propose a new model named Multi-Modal Segmentation +TransFormer (MMSFormer) that incorporates the proposed fusion strategy to +perform multimodal material and semantic segmentation tasks. MMSFormer +outperforms current state-of-the-art models on three different datasets. As we +begin with only one input modality, performance improves progressively as +additional modalities are incorporated, showcasing the effectiveness of the +fusion block in combining useful information from diverse input modalities. +Ablation studies show that different modules in the fusion block are crucial +for overall model performance. Furthermore, our ablation studies also highlight +the capacity of different input modalities to improve performance in the +identification of different types of materials. The code and pretrained models +will be made available at https://github.com/csiplab/MMSFormer. + +
+
+ comment: Accepted by IEEE Open Journal of Signal Processing. 15 pages, 3 + figures, 9 tables +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Robotics 16 + +
+
+
+ + ☆ EAGLE: The First Event Camera Dataset Gathered by an Agile Quadruped + Robot + + +
+ When legged robots perform agile movements, traditional RGB cameras often +produce blurred images, posing a challenge for accurate state estimation. Event +cameras, inspired by biological vision mechanisms, have emerged as a promising +solution for capturing high-speed movements and coping with challenging +lighting conditions, owing to their significant advantages, such as low +latency, high temporal resolution, and a high dynamic range. However, the +integration of event cameras into agile-legged robots is still largely +unexplored. Notably, no event camera-based dataset has yet been specifically +developed for dynamic legged robots. To bridge this gap, we introduce EAGLE +(Event dataset of an AGile LEgged robot), a new dataset comprising data from an +event camera, an RGB-D camera, an IMU, a LiDAR, and joint angle encoders, all +mounted on a quadruped robotic platform. This dataset features more than 100 +sequences from real-world environments, encompassing various indoor and outdoor +environments, different lighting conditions, a range of robot gaits (e.g., +trotting, bounding, pronking), as well as acrobatic movements such as +backflipping. To our knowledge, this is the first event camera dataset to +include multi-sensory data collected by an agile quadruped robot. + +
+
+ comment: 8 pages, 7 figures +
+
+
+
+
+ + OmniColor: A Global Camera Pose Optimization Approach of LiDAR-360Camera + Fusion for Colorizing Point Clouds + + +
+ A colored point cloud, as a simple and efficient 3D representation, has many
+advantages in various fields, including robotic navigation and scene
+reconstruction. This representation is now commonly used in 3D reconstruction
+tasks relying on cameras and LiDARs. However, many existing frameworks fuse data
+from these two types of sensors poorly, leading to unsatisfactory mapping results,
+mainly due to inaccurate camera poses. This paper presents OmniColor, a novel and
+efficient algorithm to colorize point clouds using an independent 360-degree
+camera. Given a LiDAR-based point cloud and a sequence of panorama images with
+initial coarse camera poses, our objective is to jointly optimize the poses of
+all frames for mapping images onto geometric reconstructions. Our pipeline works
+in an off-the-shelf manner that does not require any feature extraction or
+matching process. Instead, we find optimal poses by directly maximizing the
+photometric consistency of LiDAR maps. In experiments, we show that our method
+can overcome the severe visual distortion of omnidirectional images and greatly
+benefit from the wide field of view (FOV) of 360-degree cameras to reconstruct
+various scenarios with accuracy and stability. The code will be released at
+https://github.com/liubonan123/OmniColor/.
+
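+
+ A heavily simplified sketch of a photometric-consistency objective of the kind
+described above is given below: LiDAR points are projected into each panorama
+under the current pose estimates, and the per-point color variance across
+frames is summed. The equirectangular projection conventions, the camera-to-world
+rotation assumption, and the omission of visibility handling and robust losses
+are all simplifications, not the authors' pipeline.
+
+import numpy as np
+
+def equirect_project(points_cam, width, height):
+    # Project 3D points (camera frame) onto an equirectangular image plane.
+    x, y, z = points_cam[:, 0], points_cam[:, 1], points_cam[:, 2]
+    lon = np.arctan2(x, z)                                   # [-pi, pi]
+    lat = np.arcsin(y / np.linalg.norm(points_cam, axis=1))  # [-pi/2, pi/2]
+    u = (lon / (2 * np.pi) + 0.5) * width
+    v = (lat / np.pi + 0.5) * height
+    return u.astype(int) % width, np.clip(v.astype(int), 0, height - 1)
+
+def photometric_inconsistency(points_world, frames):
+    """Sum of per-point color variance across frames; lower means more consistent.
+
+    frames: list of (R, t, panorama) with the current pose estimates, where R is
+    assumed to be the camera-to-world rotation and t the camera position.
+    """
+    colors = []
+    for R, t, img in frames:
+        pts_cam = (points_world - t) @ R                     # world -> camera frame
+        u, v = equirect_project(pts_cam, img.shape[1], img.shape[0])
+        colors.append(img[v, u].astype(float))
+    return np.var(np.stack(colors), axis=0).sum()
+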
+
+ comment: 2024 IEEE International Conference on Robotics and Automation +
+
+
+
+
+ + ☆ Compositional Conservatism: A Transductive Approach in Offline + Reinforcement Learning ICLR 2024 + + +
+ Offline reinforcement learning (RL) is a compelling framework for learning +optimal policies from past experiences without additional interaction with the +environment. Nevertheless, offline RL inevitably faces the problem of +distributional shifts, where the states and actions encountered during policy +execution may not be in the training dataset distribution. A common solution +involves incorporating conservatism into the policy or the value function to +safeguard against uncertainties and unknowns. In this work, we focus on +achieving the same objectives of conservatism but from a different perspective. +We propose COmpositional COnservatism with Anchor-seeking (COCOA) for offline +RL, an approach that pursues conservatism in a compositional manner on top of +the transductive reparameterization (Netanyahu et al., 2023), which decomposes +the input variable (the state in our case) into an anchor and its difference +from the original input. Our COCOA seeks both in-distribution anchors and +differences by utilizing the learned reverse dynamics model, encouraging +conservatism in the compositional input space for the policy or value function. +Such compositional conservatism is independent of and agnostic to the prevalent +behavioral conservatism in offline RL. We apply COCOA to four state-of-the-art +offline RL algorithms and evaluate them on the D4RL benchmark, where COCOA +generally improves the performance of each algorithm. The code is available at +https://github.com/runamu/compositional-conservatism. + +
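+
+ The compositional input described above (an in-distribution anchor plus the
+difference from it, instead of the raw state) is sketched below. Anchor
+selection here is a simple nearest-neighbor lookup over dataset states; COCOA's
+learned anchor-seeking with a reverse dynamics model is richer, so this is only
+an illustration of the reparameterization, with hypothetical sizes.
+
+import torch
+import torch.nn as nn
+
+class AnchorSeekingPolicy(nn.Module):
+    def __init__(self, state_dim, act_dim, train_states):
+        super().__init__()
+        self.train_states = train_states                  # (M, state_dim) dataset states
+        self.net = nn.Sequential(nn.Linear(2 * state_dim, 256), nn.ReLU(),
+                                 nn.Linear(256, act_dim))
+
+    def forward(self, state):                             # state: (B, state_dim)
+        d = torch.cdist(state, self.train_states)         # distances to dataset states
+        anchor = self.train_states[d.argmin(dim=1)]       # nearest in-distribution anchor
+        delta = state - anchor                            # difference from the anchor
+        return self.net(torch.cat([anchor, delta], dim=-1))
+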
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ☆ Salient Sparse Visual Odometry With Pose-Only Supervision + + +
+ Visual Odometry (VO) is vital for the navigation of autonomous systems, +providing accurate position and orientation estimates at reasonable costs. +While traditional VO methods excel in some conditions, they struggle with +challenges like variable lighting and motion blur. Deep learning-based VO, +though more adaptable, can face generalization problems in new environments. +Addressing these drawbacks, this paper presents a novel hybrid visual odometry +(VO) framework that leverages pose-only supervision, offering a balanced +solution between robustness and the need for extensive labeling. We propose two +cost-effective and innovative designs: a self-supervised homographic +pre-training for enhancing optical flow learning from pose-only labels and a +random patch-based salient point detection strategy for more accurate optical +flow patch extraction. These designs eliminate the need for dense optical flow +labels for training and significantly improve the generalization capability of +the system in diverse and challenging environments. Our pose-only supervised +method achieves competitive performance on standard datasets and greater +robustness and generalization ability in extreme and unseen scenarios, even +compared to dense optical flow-supervised state-of-the-art methods. + +
+
+ comment: Accepted by IEEE Robotics and Automation Letters +
+
+
+
+
+ + ☆ HawkDrive: A Transformer-driven Visual Perception System for Autonomous + Driving in Night Scene + + +
+ Many established vision perception systems for autonomous driving scenarios +ignore the influence of light conditions, one of the key elements for driving +safety. To address this problem, we present HawkDrive, a novel perception +system with hardware and software solutions. Hardware that utilizes stereo +vision perception, which has been demonstrated to be a more reliable way of +estimating depth information than monocular vision, is partnered with the edge +computing device Nvidia Jetson Xavier AGX. Our software for low light +enhancement, depth estimation, and semantic segmentation tasks, is a +transformer-based neural network. Our software stack, which enables fast +inference and noise reduction, is packaged into system modules in Robot +Operating System 2 (ROS2). Our experimental results have shown that the +proposed end-to-end system is effective in improving the depth estimation and +semantic segmentation performance. Our dataset and codes will be released at +https://github.com/ZionGo6/HawkDrive. + +
+
+ comment: Accepted by IEEE IV 2024 +
+
+
+
+
+ + ☆ Constrained 6-DoF Grasp Generation on Complex Shapes for Improved + Dual-Arm Manipulation + + +
+ Efficiently generating grasp poses tailored to specific regions of an object +is vital for various robotic manipulation tasks, especially in a dual-arm +setup. This scenario presents a significant challenge due to the complex +geometries involved, requiring a deep understanding of the local geometry to +generate grasps efficiently on the specified constrained regions. Existing +methods only explore settings involving table-top/small objects and require +augmented datasets to train, limiting their performance on complex objects. We +propose CGDF: Constrained Grasp Diffusion Fields, a diffusion-based grasp +generative model that generalizes to objects with arbitrary geometries, as well +as generates dense grasps on the target regions. CGDF uses a part-guided +diffusion approach that enables it to get high sample efficiency in constrained +grasping without explicitly training on massive constraint-augmented datasets. +We provide qualitative and quantitative comparisons using analytical metrics +and in simulation, in both unconstrained and constrained settings to show that +our method can generalize to generate stable grasps on complex objects, +especially useful for dual-arm manipulation settings, while existing methods +struggle to do so. + +
+
+ comment: Project Page: https://constrained-grasp-diffusion.github.io/ +
+
+
+
+
+ + ☆ ars548_ros. An ARS 548 RDI radar driver for ROS2 + + +
+ The ARS 548 RDI Radar is a premium model of the fifth generation of 77 GHz
+long range radar sensors with new RF antenna arrays, which offer digital beam
+forming. This radar independently measures the distance, speed and angle of
+objects without any reflectors in one measurement cycle, based on Pulse
+Compression with New Frequency Modulation [1]. Unfortunately, to the best of our
+knowledge, no drivers were available for Linux systems that would let users
+analyze the data acquired from this sensor. In this paper, we
+present a driver that is able to interpret the data from the ARS 548 RDI sensor
+and produce data in Robot Operating System 2 (ROS2). Thus, this data
+can be stored, represented and analyzed by using the powerful tools offered by
+ROS2. In addition, our driver exposes advanced object features provided by the
+sensor, such as the relative estimated velocity and acceleration of each object,
+its orientation and its angular velocity. We focus on the configuration of the
+sensor and the use of our driver and advanced filtering and representation
+tools, offering a video tutorial for these purposes. Finally, a dataset
+acquired with this sensor and an Ouster OS1-32 LiDAR sensor for baseline
+purposes is available, so that the user can check the correctness of our
+driver.
+
+
+ comment: 7 pages, 6 figures and 17 references +
+
+
+
+
+ + ☆ Self-organizing Multiagent Target Enclosing under Limited Information + and Safety Guarantees + + +
+ This paper introduces an approach to address the target enclosing problem +using non-holonomic multiagent systems, where agents autonomously self-organize +into the desired formation around a fixed target. Our approach +combines global enclosing behavior and local collision avoidance mechanisms by +devising a novel potential function and sliding manifold. In our approach, +agents independently move toward the desired enclosing geometry when apart and +activate the collision avoidance mechanism when a collision is imminent, +thereby guaranteeing inter-agent safety. We rigorously show that an agent does +not need to ensure safety with every other agent and put forth a concept of the +nearest colliding agent (for any arbitrary agent) with whom ensuring safety is +sufficient to avoid collisions in the entire swarm. The proposed control +eliminates the need for a fixed or pre-established agent arrangement around the +target and requires only relative information between an agent and the target. +This makes our design particularly appealing for scenarios with limited global +information, hence significantly reducing communication requirements. We +finally present simulation results to demonstrate the efficacy of the proposed +method. + 
+
+
+
+
+ + ☆ Automated Lane Change Behavior Prediction and Environmental Perception + Based on SLAM Technology + + +
+ Beyond environmental perception sensors such as cameras and radars, which +perceive the external environment of the vehicle, an automatic driving system +also relies on a perception sensor that quietly works in the background: the +positioning module. This paper explores +the application of SLAM (Simultaneous Localization and Mapping) technology in +the context of automatic lane change behavior prediction and environment +perception for autonomous vehicles. It discusses the limitations of traditional +positioning methods, introduces SLAM technology, and compares LIDAR SLAM with +visual SLAM. Real-world examples from companies like Tesla, Waymo, and Mobileye +showcase the integration of AI-driven technologies, sensor fusion, and SLAM in +autonomous driving systems. The paper then delves into the specifics of SLAM +algorithms, sensor technologies, and the importance of automatic lane changes +in driving safety and efficiency. It highlights Tesla's recent update to its +Autopilot system, which incorporates automatic lane change functionality using +SLAM technology. The paper concludes by emphasizing the crucial role of SLAM in +enabling accurate environment perception, positioning, and decision-making for +autonomous vehicles, ultimately enhancing safety and the driving experience. + 
+
+
+
+
+ + ♻ ☆ Which One? Leveraging Context Between Objects and Multiple Views for + Language Grounding + + +
+ When connecting objects and their language referents in an embodied 3D +environment, it is important to note that: (1) an object can be better +characterized by leveraging comparative information between itself and other +objects, and (2) an object's appearance can vary with camera position. As such, +we present the Multi-view Approach to Grounding in Context (MAGiC), which +selects an object referent based on language that distinguishes between two +similar objects. By pragmatically reasoning over both objects and across +multiple views of those objects, MAGiC improves over the state-of-the-art model +on the SNARE object reference task with a relative error reduction of 12.9\% +(representing an absolute improvement of 2.7\%). Ablation studies show that +reasoning jointly over object referent candidates and multiple views of each +object both contribute to improved accuracy. Code: +https://github.com/rcorona/magic_snare/ + +
+
+
+
+
+ + ♻ ☆ Spatial Assisted Human-Drone Collaborative Navigation and Interaction + through Immersive Mixed Reality ICRA + + +
+ Aerial robots have the potential to play a crucial role in assisting humans +with complex and dangerous tasks. Nevertheless, the future industry demands +innovative solutions to streamline the interaction process between humans and +drones to enable seamless collaboration and efficient co-working. In this +paper, we present a novel tele-immersive framework that promotes cognitive and +physical collaboration between humans and robots through Mixed Reality (MR). +This framework incorporates a novel bi-directional spatial awareness approach +and a multi-modal virtual-physical interaction approach. The former seamlessly +integrates the physical and virtual worlds, offering bidirectional egocentric +and exocentric environmental representations. The latter, leveraging the +proposed spatial representation, further enhances the collaboration by combining a +robot planning algorithm for obstacle avoidance with variable admittance +control. This allows users to issue commands based on virtual forces while +maintaining compatibility with the environment map. We validate the proposed +approach by performing several collaborative planning and exploration tasks +involving a drone and a user equipped with an MR headset. + 
+
+ comment: Currently Accepted at International Conference on Robotics and + Automation (ICRA) 2024, Nominated as Finalist for IEEE ICRA 2024 Best Paper + Award on Unmanned Aerial Vehicles +
+
+
+
+
+ + ♻ ☆ Comprehensive Robotic Cholecystectomy Dataset (CRCD): Integrating + Kinematics, Pedal Signals, and Endoscopic Videos + + +
+ In recent years, the potential applications of machine learning to Minimally +Invasive Surgery (MIS) have spurred interest in data sets that can be used to +develop data-driven tools. This paper introduces a novel dataset recorded +during ex vivo pseudo-cholecystectomy procedures on pig livers, utilizing the +da Vinci Research Kit (dVRK). Unlike current datasets, ours bridges a critical +gap by offering not only full kinematic data but also capturing all pedal +inputs used during the procedure and providing a time-stamped record of the +endoscope's movements. Contributed by seven surgeons, this data set introduces +a new dimension to surgical robotics research, allowing the creation of +advanced models for automating console functionalities. Our work addresses the +existing limitation of incomplete recordings and imprecise kinematic data, +common in other datasets. By introducing two models, dedicated to predicting +clutch usage and camera activation, we highlight the dataset's potential for +advancing automation in surgical robotics. The comparison of methodologies and +time windows provides insights into the models' boundaries and limitations. + +
+
+ comment: 6 pages, 8 figures, 5 tables. Accepted for presentation at the 2024 + International Symposium on Medical Robotics +
+
+
+
+
+ + ♻ ☆ Probabilistic Uncertainty Quantification of Prediction Models with + Application to Visual Localization ICRA2023 + + +
+ The uncertainty quantification of prediction models (e.g., neural networks) +is crucial for their adoption in many robotics applications. This is arguably +as important as making accurate predictions, especially for safety-critical +applications such as self-driving cars. This paper proposes our approach to +uncertainty quantification in the context of visual localization for autonomous +driving, where we predict locations from images. Our proposed framework +estimates probabilistic uncertainty by creating a sensor error model that maps +an internal output of the prediction model to the uncertainty. The sensor error +model is created using multiple image databases of visual localization, each +with ground-truth locations. We demonstrate the accuracy of our uncertainty +prediction framework using the Ithaca365 dataset, which includes variations in +lighting, weather (sunny, snowy, night), and alignment errors between +databases. We analyze both the predicted uncertainty and its incorporation into +a Kalman-based localization filter. Our results show that prediction error +variations increase with poor weather and lighting conditions, leading to +greater uncertainty and outliers, which can be predicted by our proposed +uncertainty model. Additionally, our probabilistic error model enables the +filter to remove ad hoc sensor gating, as the uncertainty automatically adjusts +the model to the input data. + 
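+ To make the role of a learned, input-dependent measurement uncertainty concrete,
+here is a minimal one-dimensional sketch (not the paper's implementation; the error
+model predicted_sigma and all numeric values are illustrative assumptions):
+
+import numpy as np
+
+def kalman_update(x, P, z, R):
+    """Scalar Kalman measurement update with measurement variance R."""
+    K = P / (P + R)          # Kalman gain
+    x_new = x + K * (z - x)  # corrected state
+    P_new = (1.0 - K) * P    # corrected covariance
+    return x_new, P_new
+
+# Hypothetical learned error model: maps an internal score of the localizer
+# (e.g., a matching-quality measure) to a predicted measurement standard deviation.
+def predicted_sigma(internal_score):
+    return 0.5 + 4.0 * (1.0 - internal_score)   # illustrative shape only
+
+x, P = 0.0, 1.0                       # prior position estimate and variance
+for z, score in [(0.3, 0.9), (5.0, 0.1), (0.4, 0.8)]:
+    R = predicted_sigma(score) ** 2   # measurement variance from the error model
+    x, P = kalman_update(x, P, z, R)  # low-confidence fixes are down-weighted automatically
+    print(f"z={z:4.1f} score={score:.1f} -> x={x:.2f} P={P:.2f}")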
+
+ comment: Extended version of our ICRA2023 paper +
+
+
+
+
+ + ♻ ☆ Learning-Based Modeling of Human-Autonomous Vehicle Interaction for + Improved Safety in Mixed-Vehicle Platooning Control + + +
+ The rising presence of autonomous vehicles (AVs) on public roads necessitates +the development of advanced control strategies that account for the +unpredictable nature of human-driven vehicles (HVs). This study introduces a +learning-based method for modeling HV behavior, combining a traditional +first-principles approach with a Gaussian process (GP) learning component. This +hybrid model enhances the accuracy of velocity predictions and provides +measurable uncertainty estimates. We leverage this model to develop a GP-based +model predictive control (GP-MPC) strategy to improve safety in mixed vehicle +platoons by integrating uncertainty assessments into distance constraints. +Comparative simulations between our GP-MPC approach and a conventional model +predictive control (MPC) strategy reveal that the GP-MPC ensures safer +distancing and more efficient travel within the mixed platoon. By incorporating +sparse GP modeling for HVs and a dynamic GP prediction in MPC, we significantly +reduce the computation time of GP-MPC, making it only marginally longer than +standard MPC and approximately 100 times faster than previous models not +employing these techniques. Our findings underscore the effectiveness of +learning-based HV modeling in enhancing safety and efficiency in mixed-traffic +environments involving AV and HV interactions. + +
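+ As an illustrative sketch of the core idea only (not the authors' GP-MPC code), the
+snippet below fits a GP to toy human-driver velocity residuals with scikit-learn and
+tightens a distance constraint by the GP's predictive standard deviation; the kernel,
+confidence multiplier, and look-ahead horizon are assumptions:
+
+import numpy as np
+from sklearn.gaussian_process import GaussianProcessRegressor
+from sklearn.gaussian_process.kernels import RBF, WhiteKernel
+
+# Toy data: residual between observed human-driver velocity and a first-principles
+# car-following model, as a function of headway (all values are made up).
+rng = np.random.default_rng(0)
+headway = rng.uniform(5.0, 40.0, size=(40, 1))                                  # [m]
+residual = 0.1 * np.sin(0.2 * headway[:, 0]) + 0.05 * rng.standard_normal(40)   # [m/s]
+
+gp = GaussianProcessRegressor(kernel=RBF(length_scale=10.0) + WhiteKernel(1e-3))
+gp.fit(headway, residual)
+
+d_min = 10.0       # nominal minimum-distance constraint [m]
+kappa = 2.0        # assumed confidence multiplier
+t_horizon = 2.0    # assumed look-ahead over which the velocity error accumulates [s]
+
+mu, sigma = gp.predict(np.array([[12.0]]), return_std=True)
+# Tightened constraint: enlarge the required gap in proportion to the GP's predictive
+# uncertainty, mirroring the idea of uncertainty-aware distance constraints in the MPC.
+d_required = d_min + kappa * sigma[0] * t_horizon
+print(f"predicted residual {mu[0]:+.3f} m/s, required gap {d_required:.2f} m")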
+
+
+
+
+ + ♻ ☆ Task-Oriented Dexterous Hand Pose Synthesis Using Differentiable Grasp + Wrench Boundary Estimator + + +
+ This work tackles the problem of task-oriented dexterous hand pose synthesis, +which involves generating a static hand pose capable of applying a +task-specific set of wrenches to manipulate objects. Unlike previous approaches +that focus solely on force-closure grasps, which are unsuitable for +non-prehensile manipulation tasks (\textit{e.g.}, turning a knob or pressing a +button), we introduce a unified framework covering force-closure grasps, +non-force-closure grasps, and a variety of non-prehensile poses. Our key idea +is a novel optimization objective quantifying the disparity between the Task +Wrench Space (TWS, the desired wrenches predefined as a task prior) and the +Grasp Wrench Space (GWS, the achievable wrenches computed from the current hand +pose). By minimizing this objective, gradient-based optimization algorithms can +synthesize task-oriented hand poses without additional human demonstrations. +Our specific contributions include 1) a fast, accurate, and differentiable +technique for estimating the GWS boundary; 2) a task-oriented objective +function based on the disparity between the estimated GWS boundary and the +provided TWS boundary; and 3) an efficient implementation of the synthesis +pipeline that leverages CUDA accelerations and supports large-scale +paralleling. Experimental results on 10 diverse tasks demonstrate a 72.6\% +success rate in simulation. Furthermore, real-world validation for 4 tasks +confirms the effectiveness of synthesized poses for manipulation. Notably, +despite being primarily tailored for task-oriented hand pose synthesis, our +pipeline can generate force-closure grasps 50 times faster than DexGraspNet +while maintaining comparable grasp quality. Project page: +https://pku-epic.github.io/TaskDexGrasp/. + +
+
+
+
+
+ + ♻ ☆ Natural Language as Policies: Reasoning for Coordinate-Level Embodied + Control with LLMs + + +
+ We demonstrate experimental results with LLMs that address robotics task +planning problems. Recently, LLMs have been applied in robotics task planning, +particularly using a code generation approach that converts complex high-level +instructions into mid-level policy codes. In contrast, our approach acquires +text descriptions of the task and scene objects, then formulates task planning +through natural language reasoning, and outputs coordinate level control +commands, thus reducing the necessity for intermediate representation code as +policies with pre-defined APIs. Our approach is evaluated on a multi-modal +prompt simulation benchmark, demonstrating that our prompt engineering +experiments with natural language reasoning significantly enhance success rates +compared to its absence. Furthermore, our approach illustrates the potential +for natural language descriptions to transfer robotics skills from known tasks +to previously unseen tasks. The project website: +https://natural-language-as-policies.github.io/ + +
+
+ comment: 8 pages, 2 figures +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Robotics 39 + +
+
+
+ + ☆ Growing Q-Networks: Solving Continuous Control Tasks with Adaptive + Control Resolution + + +
+ Recent reinforcement learning approaches have shown surprisingly strong +capabilities of bang-bang policies for solving continuous control benchmarks. +The underlying coarse action space discretizations often yield favourable +exploration characteristics, while final performance does not visibly suffer in +the absence of action penalization, in line with optimal control theory. In +robotics applications, smooth control signals are commonly preferred to reduce +system wear and energy consumption, but action costs can be detrimental to +exploration during early training. In this work, we aim to bridge this +performance gap by growing discrete action spaces from coarse to fine control +resolution, taking advantage of recent results in decoupled Q-learning to scale +our approach to high-dimensional action spaces up to dim(A) = 38. Our work +indicates that an adaptive control resolution in combination with value +decomposition yields simple critic-only algorithms that obtain surprisingly +strong performance on continuous control tasks. + 
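+ A toy sketch of the coarse-to-fine idea (not the paper's algorithm): each action
+dimension starts with a bang-bang set and is refined on an assumed schedule, so the
+per-dimension discrete action set grows over training:
+
+import numpy as np
+
+def action_grid(level, a_min=-1.0, a_max=1.0):
+    """Discrete per-dimension action set at a given resolution level.
+    Level 0 is bang-bang {a_min, a_max}; each level doubles the number of bins."""
+    n = 2 ** (level + 1)
+    return np.linspace(a_min, a_max, n)
+
+# Illustrative growth schedule: refine the control resolution at fixed step counts.
+schedule = {0: 0, 200_000: 1, 500_000: 2, 1_000_000: 3}
+
+def resolution_at(step):
+    level = 0
+    for threshold, lvl in sorted(schedule.items()):
+        if step >= threshold:
+            level = lvl
+    return level
+
+for step in [0, 250_000, 750_000, 1_200_000]:
+    grid = action_grid(resolution_at(step))
+    print(step, grid.round(2))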
+
+
+
+
+ + ☆ Humanoid Robots at work: where are we ? + + +
+ With the launch of Elon Musk's Optimus, we are witnessing a new race in which +many companies have already engaged. The objective is to put a new +generation of humanoid robots to work in demanding industrial environments within 2 or +3 years. Is this objective realistic? The aim of this document and its main +contributions is to provide some hints by covering the following topics: first, +an analysis of 12 companies based on eight criteria that will help us +distinguish companies by their maturity and approach to the market; +second, as these humanoids are very complex systems, we provide an overview +of the technological challenges to be addressed; third, since Operation and +Maintenance become critical when humanoids are deployed at scale, we +explore what is new with these complex machines; finally, pilots are the last +step to test the feasibility of a new system before mass deployment. They are an +important step to test the maturity of a product and the strategy of a +humanoid supplier to address a market, and two pragmatic approaches will be +discussed. + 
+
+ comment: 30 pages 16 figures +
+
+
+
+
+ + ☆ Modeling Kinematic Uncertainty of Tendon-Driven Continuum Robots via + Mixture Density Networks + + +
+ Tendon-driven continuum robot kinematic models are frequently computationally +expensive, inaccurate due to unmodeled effects, or both. In particular, +unmodeled effects produce uncertainties that arise during the robot's operation +that lead to variability in the resulting geometry. We propose a novel solution +to these issues through the development of a Gaussian mixture kinematic model. +We train a mixture density network to output a Gaussian mixture model +representation of the robot geometry given the current tendon displacements. +This model computes a probability distribution that is more representative of +the true distribution of geometries at a given configuration than a model that +outputs a single geometry, while also reducing the computation time. We +demonstrate one use of this model through a trajectory optimization method that +explicitly reasons about the workspace uncertainty to minimize the probability +of collision. + +
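+ For readers unfamiliar with mixture density networks, the sketch below shows the
+basic construction in PyTorch with made-up dimensions (it is not the paper's model): a
+network maps tendon displacements to the weights, means, and variances of a Gaussian
+mixture over a flattened geometry descriptor, trained with the mixture negative
+log-likelihood:
+
+import math
+import torch
+import torch.nn as nn
+
+class MDN(nn.Module):
+    """Minimal mixture density network; all dimensions here are illustrative assumptions."""
+    def __init__(self, n_tendons=4, geom_dim=30, n_components=5, hidden=128):
+        super().__init__()
+        self.k, self.d = n_components, geom_dim
+        self.backbone = nn.Sequential(nn.Linear(n_tendons, hidden), nn.ReLU(),
+                                      nn.Linear(hidden, hidden), nn.ReLU())
+        self.pi = nn.Linear(hidden, n_components)               # mixture weights (logits)
+        self.mu = nn.Linear(hidden, n_components * geom_dim)    # component means
+        self.log_sigma = nn.Linear(hidden, n_components * geom_dim)
+
+    def forward(self, q):
+        h = self.backbone(q)
+        log_pi = torch.log_softmax(self.pi(h), dim=-1)
+        mu = self.mu(h).view(-1, self.k, self.d)
+        log_sigma = self.log_sigma(h).view(-1, self.k, self.d).clamp(-5.0, 2.0)
+        return log_pi, mu, log_sigma
+
+def mdn_nll(log_pi, mu, log_sigma, y):
+    """Negative log-likelihood of y under a diagonal-Gaussian mixture."""
+    y = y.unsqueeze(1)                                          # (B, 1, D) against (B, K, D)
+    comp = -0.5 * (((y - mu) / log_sigma.exp()) ** 2
+                   + 2.0 * log_sigma + math.log(2.0 * math.pi)).sum(-1)
+    return -torch.logsumexp(log_pi + comp, dim=-1).mean()
+
+model = MDN()
+q = torch.rand(8, 4)     # toy batch of tendon displacements
+y = torch.rand(8, 30)    # toy flattened geometry samples
+loss = mdn_nll(*model(q), y)
+loss.backward()
+print(float(loss))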
+
+
+
+
+ + ☆ Multi-modal perception for soft robotic interactions using generative + models + + +
+ Perception is essential for the active interaction of physical agents with +the external environment. The integration of multiple sensory modalities, such +as touch and vision, enhances this perceptual process, creating a more +comprehensive and robust understanding of the world. Such fusion is +particularly useful for highly deformable bodies such as soft robots. +Developing a compact, yet comprehensive state representation from multi-sensory +inputs can pave the way for the development of complex control strategies. This +paper introduces a perception model that harmonizes data from diverse +modalities to build a holistic state representation and assimilate essential +information. The model relies on the causality between sensory input and +robotic actions, employing a generative model to efficiently compress fused +information and predict the next observation. We present, for the first time, a +study on how touch can be predicted from vision and proprioception on soft +robots, the importance of the cross-modal generation and why this is essential +for soft robotic interactions in unstructured environments. + +
+
+ comment: Accepted for presentation at IEEE RoboSoft 2024 +
+
+
+
+
+ + ☆ Continual Policy Distillation of Reinforcement Learning-based + Controllers for Soft Robotic In-Hand Manipulation + + +
+ Dexterous manipulation, often facilitated by multi-fingered robotic hands, +holds strong promise for real-world applications. Soft robotic hands, due to +their compliant nature, offer flexibility and adaptability during object +grasping and manipulation. Yet, these benefits come with challenges, particularly in +the control development for finger coordination. Reinforcement Learning (RL) +can be employed to train object-specific in-hand manipulation policies, but +this limits adaptability and generalizability. We introduce a Continual Policy +Distillation (CPD) framework to acquire a versatile controller for in-hand +manipulation, able to rotate objects of different shapes and sizes within a +four-fingered soft gripper. The framework leverages Policy Distillation (PD) to +transfer knowledge from expert policies to a continually evolving student +policy network. Exemplar-based rehearsal methods are then integrated to +mitigate catastrophic forgetting and enhance generalization. The performance of +the CPD framework over various replay strategies demonstrates its effectiveness +in consolidating knowledge from multiple experts and achieving versatile and +adaptive behaviours for in-hand manipulation tasks. + 
+
+ comment: Accepted for presentation at IEEE RoboSoft 2024 +
+
+
+
+
+ + ☆ Convex MPC and Thrust Allocation with Deadband for Spacecraft Rendezvous + + +
+ This paper delves into a rendezvous scenario involving a chaser and a target +spacecraft, focusing on the application of Model Predictive Control (MPC) to +design a controller capable of guiding the chaser toward the target. The +operational principle of spacecraft thrusters, requiring a minimum activation +time that leads to the existence of a control deadband, introduces +mixed-integer constraints into the optimization, posing a considerable +computational challenge due to the exponential complexity in the number of +integer constraints. We address this complexity by presenting two solver +algorithms that efficiently approximate the optimal solution in significantly +less time than standard solvers, making them well-suited for real-time +applications. + 
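+ To illustrate the deadband that the mixed-integer constraints encode (this is not
+the paper's solver; the thresholds are assumed), each thruster's feasible command set
+is {0} together with [u_min, u_max], and a continuous command can be projected onto it:
+
+import numpy as np
+
+def project_to_deadband(u, u_min=0.2, u_max=1.0):
+    """Project a desired thrust magnitude onto the feasible set {0} U [u_min, u_max].
+    Below half the minimum-impulse level the thruster is switched off; this mimics
+    the effect of the binary on/off decision in a mixed-integer formulation."""
+    u = np.clip(np.abs(u), 0.0, u_max)
+    return np.where(u < 0.5 * u_min, 0.0, np.maximum(u, u_min))
+
+desired = np.array([0.05, 0.12, 0.35, 1.4])
+print(project_to_deadband(desired))   # -> [0.   0.2  0.35 1.  ]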
+
+
+
+
+ + ☆ ToolEENet: Tool Affordance 6D Pose Estimation + + +
+ The exploration of robotic dexterous hands utilizing tools has recently +attracted considerable attention. A significant challenge in this field is the +precise awareness of a tool's pose when grasped, as occlusion by the hand often +degrades the quality of the estimation. Additionally, the tool's overall pose +often fails to accurately represent the contact interaction, thereby limiting +the effectiveness of vision-guided, contact-dependent activities. To overcome +this limitation, we present the innovative TOOLEE dataset, which, to the best +of our knowledge, is the first to feature affordance segmentation of a tool's +end-effector (EE) along with its defined 6D pose based on its usage. +Furthermore, we propose the ToolEENet framework for accurate 6D pose estimation +of the tool's EE. This framework begins by segmenting the tool's EE from raw +RGBD data, then uses a diffusion model-based pose estimator for 6D pose +estimation at a category-specific level. Addressing the issue of symmetry in +pose estimation, we introduce a symmetry-aware pose representation that +enhances the consistency of pose estimation. Our approach excels in this field, +demonstrating high levels of precision and generalization. Furthermore, it +shows great promise for application in contact-based manipulation scenarios. +All data and codes are available on the project website: +https://yuyangtu.github.io/projectToolEENet.html + +
+
+
+
+
+ + ☆ Probabilistically Informed Robot Object Search with Multiple Regions + + +
+ The increasing use of autonomous robot systems in hazardous environments +underscores the need for efficient search and rescue operations. Despite +significant advancements, existing literature on object search often falls +short in overcoming the difficulty of long planning horizons and dealing with +sensor limitations, such as noise. This study introduces a novel approach that +formulates the search problem as a belief Markov decision process with +options (BMDP-O) to make Monte Carlo tree search (MCTS) a viable tool for +overcoming these challenges in large scale environments. The proposed +formulation incorporates sequences of actions (options) to move between regions +of interest, enabling the algorithm to efficiently scale to large environments. +This approach also enables customizable fields of view, for use with +multiple types of sensors. Experimental results demonstrate the superiority of +this approach in large environments when compared to the problem without +options and alternative tools such as receding horizon planners. Since the compute +time for the proposed formulation is relatively high, a further approximated +"lite" formulation is proposed. The lite formulation finds objects in a +comparable number of steps with faster computation. + 
+
+ comment: 6 pages, 7 figures. Submitted to the 2024 IEEE/RSJ International + Conference on Intelligent Robots and Systems in Abu Dhabi, UAE (Oct 14-18, + 2024) +
+
+
+
+
+ + ☆ Designing Robots to Help Women + + +
+ Robots are being designed to help people in an increasing variety of +settings--but seemingly little attention has been given so far to the specific +needs of women, who represent roughly half of the world's population but are +highly underrepresented in robotics. Here we used a speculative prototyping +approach to explore this expansive design space: First, we identified some +potential challenges of interest, including crimes and illnesses that +disproportionately affect women, as well as potential opportunities for +designers, which were visualized in five sketches. Then, one of the sketched +scenarios was further explored by developing a prototype, of a robotic helper +drone equipped with computer vision to detect hidden cameras that could be used +to spy on women. While object detection introduced some errors, hidden cameras +were identified with a reasonable accuracy of 80\% (Intersection over Union +(IoU) score: 0.40). Our aim is that the identified challenges and opportunities +could help spark discussion and inspire designers, toward realizing a safer, +more inclusive future through responsible use of technology. + +
+
+ comment: 10 pages, submitted 2024-4-5 to SCAI +
+
+
+
+
+ + ☆ Self-Sensing Feedback Control of an Electrohydraulic Robotic Shoulder + + +
+ The human shoulder, with its glenohumeral joint, tendons, ligaments, and +muscles, allows for the execution of complex tasks with precision and +efficiency. However, current robotic shoulder designs lack the compliance and +compactness inherent in their biological counterparts. A major limitation of +these designs is their reliance on external sensors like rotary encoders, which +restrict mechanical joint design and introduce bulk to the system. To address +this constraint, we present a bio-inspired antagonistic robotic shoulder with +two degrees of freedom powered by self-sensing hydraulically amplified +self-healing electrostatic actuators. Our artificial muscle design decouples +the high-voltage electrostatic actuation from the pair of low-voltage +self-sensing electrodes. This approach allows for proprioceptive feedback +control of trajectories in the task space while eliminating the necessity for +any additional sensors. We assess the platform's efficacy by comparing it to a +feedback control based on position data provided by a motion capture system. +The study demonstrates closed-loop controllable robotic manipulators based on +an inherent self-sensing capability of electrohydraulic actuators. The proposed +architecture can serve as a basis for complex musculoskeletal joint +arrangements. + +
+
+ comment: 7 pages, 8 figures +
+
+
+
+
+ + ☆ High-Frequency Capacitive Sensing for Electrohydraulic Soft Actuators + + +
+ The need for compliant and proprioceptive actuators has grown more evident in +pursuing more adaptable and versatile robotic systems. Hydraulically Amplified +Self-Healing Electrostatic (HASEL) actuators offer distinctive advantages with +their inherent softness and flexibility, making them promising candidates for +various robotic tasks, including delicate interactions with humans and animals, +biomimetic locomotion, prosthetics, and exoskeletons. This has resulted in a +growing interest in the capacitive self-sensing capabilities of HASEL actuators +to create miniature displacement estimation circuitry that does not require +external sensors. However, achieving HASEL self-sensing for actuation +frequencies above 1 Hz and with miniature high-voltage power supplies has +remained limited. In this paper, we introduce the F-HASEL actuator, which adds +an additional electrode pair used exclusively for capacitive sensing to a +Peano-HASEL actuator. We demonstrate displacement estimation of the F-HASEL +during high-frequency actuation up to 20 Hz and during external loading using +miniaturized circuitry comprised of low-cost off-the-shelf components and a +miniature high-voltage power supply. Finally, we propose a circuitry to +estimate the displacement of multiple F-HASELs and demonstrate it in a wearable +application to track joint rotations of a virtual reality user in real-time. + +
+
+ comment: 8 pages, 11 figures +
+
+
+
+
+ + ☆ Bidirectional Human Interactive AI Framework for Social Robot Navigation ICRA 2024 + + +
+ Trustworthiness is a crucial concept in the context of human-robot +interaction. Cooperative robots must be transparent regarding their +decision-making process, especially when operating in a human-oriented +environment. This paper presents a comprehensive end-to-end framework aimed at +fostering trustworthy bidirectional human-robot interaction in collaborative +environments for the social navigation of mobile robots. Our method enables a +mobile robot to predict the trajectory of people and adjust its route in a +socially-aware manner. In case of conflict between human and robot decisions, +detected through visual examination, the route is dynamically modified based on +human preference while verbal communication is maintained. We present our +pipeline, framework design, and preliminary experiments that form the +foundation of our proposition. + +
+
+ comment: Accepted by Robot Trust for Symbiotic Societies (RTSS) Workshop at + ICRA 2024 +
+
+
+
+
+ + ☆ VoicePilot: Harnessing LLMs as Speech Interfaces for Physically + Assistive Robots + + +
+ Physically assistive robots present an opportunity to significantly increase +the well-being and independence of individuals with motor impairments or other +forms of disability who are unable to complete activities of daily living. +Speech interfaces, especially ones that utilize Large Language Models (LLMs), +can enable individuals to effectively and naturally communicate high-level +commands and nuanced preferences to robots. Frameworks for integrating LLMs as +interfaces to robots for high level task planning and code generation have been +proposed, but fail to incorporate human-centric considerations which are +essential while developing assistive interfaces. In this work, we present a +framework for incorporating LLMs as speech interfaces for physically assistive +robots, constructed iteratively with 3 stages of testing involving a feeding +robot, culminating in an evaluation with 11 older adults at an independent +living facility. We use both quantitative and qualitative data from the final +study to validate our framework and additionally provide design guidelines for +using LLMs as speech interfaces for assistive robots. Videos and supporting +files are located on our project website: +https://sites.google.com/andrew.cmu.edu/voicepilot/ + +
+
+
+
+
+ + ☆ MM-Gaussian: 3D Gaussian-based Multi-modal Fusion for Localization and + Reconstruction in Unbounded Scenes + + +
+ Localization and mapping are critical tasks for various applications such as +autonomous vehicles and robotics. The challenges posed by outdoor environments +present particular complexities due to their unbounded characteristics. In this +work, we present MM-Gaussian, a LiDAR-camera multi-modal fusion system for +localization and mapping in unbounded scenes. Our approach is inspired by the +recently developed 3D Gaussians, which demonstrate remarkable capabilities in +achieving high rendering quality and fast rendering speed. Specifically, our +system fully utilizes the geometric structure information provided by +solid-state LiDAR to address the problem of inaccurate depth encountered when +relying solely on visual solutions in unbounded, outdoor scenarios. +Additionally, we utilize 3D Gaussian point clouds, with the assistance of +pixel-level gradient descent, to fully exploit the color information in photos, +thereby achieving realistic rendering effects. To further bolster the +robustness of our system, we designed a relocalization module, which assists in +returning to the correct trajectory in the event of a localization failure. +Experiments conducted in multiple scenarios demonstrate the effectiveness of +our method. + +
+
+ comment: 7 pages, 5 figures +
+
+
+
+
+ + ☆ Validation of critical maneuvers based on shared control + + +
+ This paper presents the validation of shared control strategies for critical +maneuvers in automated driving systems. Shared control involves collaboration +between the driver and automation, allowing both parties to actively engage and +cooperate at different levels of the driving task. The involvement of the +driver adds complexity to the control loop, necessitating comprehensive +validation methodologies. The proposed approach focuses on two critical +maneuvers: overtaking in low visibility scenarios and lateral evasive actions. +A modular architecture with an arbitration module and shared control algorithms +is implemented, primarily focusing on the lateral control of the vehicle. The +validation is conducted using a dynamic simulator, involving 8 real drivers +interacting with a virtual environment. The results demonstrate improved safety +and user acceptance, indicating the effectiveness of the shared control +strategies in comparison with no shared-control support. Future work involves +implementing shared control in drive-by-wire systems to enhance safety and +driver comfort during critical maneuvers. Overall, this research contributes to +the development and validation of shared control approaches in automated +driving systems. + +
+
+ comment: 8 pages, 19 figures. Published in IEEE 26th International Conference + on Intelligent Transportation Systems (ITSC) +
+
+
+
+
+ + ☆ Towards Safe Robot Use with Edged or Pointed Objects: A Surrogate Study + Assembling a Human Hand Injury Protection Database ICRA 2024 + + +
+ The use of pointed or edged tools or objects is one of the most challenging +aspects of today's application of physical human-robot interaction (pHRI). One +reason for this is that the severity of harm caused by such edged or pointed +impactors is less well studied than for blunt impactors. Consequently, the +standards specify well-reasoned force and pressure thresholds for blunt +impactors and advise avoiding any edges and corners in contacts. Nevertheless, +pointed or edged impactor geometries cannot be completely ruled out in real +pHRI applications. For example, to allow edged or pointed tools such as +screwdrivers near human operators, the knowledge of injury severity needs to be +extended so that robot integrators can perform well-reasoned, time-efficient +risk assessments. In this paper, we provide the initial datasets on injury +prevention for the human hand based on drop tests with surrogates for the human +hand, namely pig claws and chicken drumsticks. We then demonstrate the ease and +efficiency of robot use using the dataset for contact on two examples. Finally, +our experiments provide a set of injuries that may also be expected for human +subjects under certain robot mass-velocity constellations in collisions. To +extend this work, testing on human samples and a collaborative effort from +research institutes worldwide is needed to create a comprehensive human injury +avoidance database for any pHRI scenario and thus for safe pHRI applications +including edged and pointed geometries. + +
+
+ comment: accepted fo presentation at IEEE ICRA 2024 +
+
+
+
+
+ + ☆ POMDP-Guided Active Force-Based Search for Robotic Insertion + + +
+ In robotic insertion tasks where the uncertainty exceeds the allowable +tolerance, a good search strategy is essential for successful insertion and +significantly influences efficiency. The commonly used blind search method is +time-consuming and does not exploit the rich contact information. In this +paper, we propose a novel search strategy that actively utilizes the +information contained in the contact configuration and shows high efficiency. +In particular, we formulate this problem as a Partially Observable Markov +Decision Process (POMDP) with carefully designed primitives based on an +in-depth analysis of the contact configuration's static stability. From the +formulated POMDP, we can derive a novel search strategy. Thanks to its +simplicity, this search strategy can be incorporated into a +Finite-State-Machine (FSM) controller. The behaviors of the FSM controller are +realized through a low-level Cartesian Impedance Controller. Our method is +based purely on the robot's proprioceptive sensing and does not need visual or +tactile sensors. To evaluate the effectiveness of our proposed strategy and +control framework, we conduct extensive comparison experiments in simulation, +where we compare our method with the baseline approach. The results demonstrate +that our proposed method achieves a higher success rate with a shorter search +time and search trajectory length compared to the baseline method. +Additionally, we show that our method is robust to various initial displacement +errors. + +
+
+
+
+
+ + ☆ Towards introspective loop closure in 4D radar SLAM ICRA 2024 + + +
+ Imaging radar is an emerging sensor modality in the context of Simultaneous +Localization and Mapping (SLAM), especially suitable for vision-obstructed +environments. This article investigates the use of 4D imaging radars for SLAM and analyzes +the challenges in robust loop closure. Previous work indicates that 4D radars, +together with inertial measurements, offer ample information for accurate +odometry estimation. However, the low field of view, limited resolution, and +sparse and noisy measurements render loop closure a significantly more +challenging problem. Our work builds on the previous work - TBV SLAM - which +was proposed for robust loop closure with 360$^\circ$ spinning radars. This +article highlights and addresses challenges inherited from a directional 4D +radar, such as sparsity, noise, and reduced field of view, and discusses why +the common definition of a loop closure is unsuitable. By combining multiple +quality measures for accurate loop closure detection adapted to 4D radar data, +significant results in trajectory estimation are achieved; the absolute +trajectory error is as low as 0.46 m over a distance of 1.8 km, with consistent +operation over multiple environments. + 
+
+ comment: Submitted to the workshop "Radar in Robotics: Resilience from Signal + to Navigation" at ICRA 2024 +
+
+
+
+
+ + ☆ Under-Canopy Navigation using Aerial Lidar Maps + + +
+ Autonomous navigation in unstructured natural environments poses a +significant challenge. In goal navigation tasks without prior information, the +limited look-ahead of onboard sensors utilised by robots compromises path +efficiency. We propose a novel approach that leverages an above-the-canopy +aerial map for improved ground robot navigation. Our system utilises aerial +lidar scans to create a 3D probabilistic occupancy map, uniquely incorporating +the uncertainty in the aerial vehicle's trajectory for improved accuracy. Novel +path planning cost functions are introduced, combining path length with +obstruction risk estimated from the probabilistic map. The D-Star Lite +algorithm then calculates an optimal (minimum-cost) path to the goal. This +system also allows for dynamic replanning upon encountering unforeseen +obstacles on the ground. Extensive experiments and ablation studies in +simulated and real forests demonstrate the effectiveness of our system. + +
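+ A small sketch of a length-plus-risk planning cost on a toy occupancy grid, using
+plain Dijkstra instead of the D-Star Lite planner used in the paper (the risk weight
+lam and the grid values are assumptions):
+
+import heapq
+import numpy as np
+
+def plan(prob_map, start, goal, lam=5.0):
+    """Dijkstra on a 4-connected grid with step cost = 1 + lam * P(occupied)."""
+    h, w = prob_map.shape
+    dist = {start: 0.0}
+    parent = {}
+    pq = [(0.0, start)]
+    while pq:
+        d, cell = heapq.heappop(pq)
+        if cell == goal:
+            break
+        if d > dist.get(cell, np.inf):
+            continue
+        r, c = cell
+        for nr, nc in ((r + 1, c), (r - 1, c), (r, c + 1), (r, c - 1)):
+            if 0 <= nr < h and 0 <= nc < w:
+                nd = d + 1.0 + lam * prob_map[nr, nc]
+                if nd < dist.get((nr, nc), np.inf):
+                    dist[(nr, nc)] = nd
+                    parent[(nr, nc)] = cell
+                    heapq.heappush(pq, (nd, (nr, nc)))
+    path, cell = [], goal
+    while cell != start:
+        path.append(cell)
+        cell = parent[cell]
+    return [start] + path[::-1]
+
+grid = np.zeros((6, 6))
+grid[2, 1:5] = 0.9       # a band of cells that are likely obstructed under the canopy
+print(plan(grid, (0, 0), (5, 5)))   # the path detours around the high-risk band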
+
+
+
+
+ + ☆ Can only LLMs do Reasoning?: Potential of Small Language Models in Task + Planning + + +
+ In robotics, the use of Large Language Models (LLMs) is becoming prevalent, +especially for understanding human commands. In particular, LLMs are utilized +as domain-agnostic task planners for high-level human commands. LLMs are +capable of Chain-of-Thought (CoT) reasoning, and this allows LLMs to be task +planners. However, we need to consider that modern robots still struggle to +perform complex actions, and the domains where robots can be deployed are +limited in practice. This leads us to pose a question: If small LMs can be +trained to reason in chains within a single domain, would even small LMs be +good task planners for the robots? To train smaller LMs to reason in chains, we +build `COmmand-STeps datasets' (COST) consisting of high-level commands along +with corresponding actionable low-level steps, via LLMs. We release not only +our datasets but also the prompt templates used to generate them, to allow +anyone to build datasets for their domain. We compare GPT3.5 and GPT4 with the +finetuned GPT2 for task domains, in tabletop and kitchen environments, and the +result shows that GPT2-medium is comparable to GPT3.5 for task planning in a +specific domain. Our dataset, code, and more output samples can be found in +https://github.com/Gawon-Choi/small-LMs-Task-Planning + +
+
+ comment: 8 pages, 11 figures +
+
+
+
+
+ + ☆ Heterogeneous Multi-Agent Reinforcement Learning for Zero-Shot Scalable + Collaboration + + +
+ The rise of multi-agent systems, especially the success of multi-agent +reinforcement learning (MARL), is reshaping our future across diverse domains +like autonomous vehicle networks. However, MARL still faces significant +challenges, particularly in achieving zero-shot scalability, which allows +trained MARL models to be directly applied to unseen tasks with varying numbers +of agents. In addition, real-world multi-agent systems usually contain agents +with different functions and strategies, while the existing scalable MARL +methods only have limited heterogeneity. To address this, we propose a novel +MARL framework named Scalable and Heterogeneous Proximal Policy Optimization +(SHPPO), integrating heterogeneity into parameter-shared PPO-based MARL +networks. We first leverage a latent network to adaptively learn strategy +patterns for each agent. Second, we introduce a heterogeneous layer for +decision-making, whose parameters are specifically generated by the learned +latent variables. Our approach is scalable, as all the parameters are shared +except for the heterogeneous layer, and gains both inter-individual and +temporal heterogeneity at the same time. We implement our approach, SHPPO, on top +of a state-of-the-art PPO-based backbone, although the approach is +agnostic to the backbone and can be seamlessly plugged into any +parameter-shared MARL method. SHPPO exhibits superior performance over +baselines such as MAPPO and HAPPO in classic MARL environments like Starcraft +Multi-Agent Challenge (SMAC) and Google Research Football (GRF), showcasing +enhanced zero-shot scalability and offering insights into the learned latent +representation's impact on team performance through visualization. + 
+
+
+
+
+ + ☆ Scaling Motion Forecasting Models with Ensemble Distillation + + +
+ Motion forecasting has become an increasingly critical component of +autonomous robotic systems. Onboard compute budgets typically limit the +accuracy of real-time systems. In this work we propose methods of improving +motion forecasting systems subject to limited compute budgets by combining +model ensemble and distillation techniques. The use of ensembles of deep neural +networks has been shown to improve generalization accuracy in many application +domains. We first demonstrate significant performance gains by creating a large +ensemble of optimized single models. We then develop a generalized framework to +distill motion forecasting model ensembles into small student models which +retain high performance with a fraction of the computing cost. For this study +we focus on the task of motion forecasting using real world data from +autonomous driving systems. We develop ensemble models that are very +competitive on the Waymo Open Motion Dataset (WOMD) and Argoverse leaderboards. +From these ensembles, we train distilled student models which have high +performance at a fraction of the compute costs. These experiments demonstrate +distillation from ensembles as an effective method for improving accuracy of +predictive models for robotic systems with limited compute budgets. + +
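+ A generic sketch of one ensemble-distillation step (not the authors' pipeline; the
+toy models, feature size, and trajectory shape are placeholders): the student regresses
+the mean forecast of a frozen ensemble of teachers:
+
+import torch
+import torch.nn as nn
+
+# Toy stand-ins: each model maps an agent-history feature vector to a future trajectory
+# (here flattened to 2 * T coordinates). Real forecasting models are far larger.
+def make_model():
+    return nn.Sequential(nn.Linear(16, 64), nn.ReLU(), nn.Linear(64, 2 * 10))
+
+ensemble = [make_model() for _ in range(5)]
+for m in ensemble:
+    m.requires_grad_(False)            # teachers stay frozen during distillation
+
+student = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 2 * 10))
+opt = torch.optim.Adam(student.parameters(), lr=1e-3)
+
+x = torch.randn(32, 16)                # batch of history features (toy data)
+with torch.no_grad():
+    teacher = torch.stack([m(x) for m in ensemble]).mean(0)    # ensemble mean forecast
+
+opt.zero_grad()
+pred = student(x)
+loss = nn.functional.mse_loss(pred, teacher)   # distillation loss against the ensemble
+loss.backward()
+opt.step()
+print(float(loss))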
+
+ comment: 11 pages, 14 figures +
+
+
+
+
+ + ☆ Hybrid Force Motion Control with Estimated Surface Normal for + Manufacturing Applications + + +
+ This paper proposes a hybrid force-motion framework that utilizes real-time +surface normal updates. The surface normal is estimated via a novel method that +leverages force sensing measurements and velocity commands to compensate for the +friction bias. This approach is critical for robust execution of precision +force-controlled tasks in manufacturing, such as thermoplastic tape replacement +that traces surfaces or paths on a workpiece subject to uncertainties that deviate +from the model. We formulated the proposed method and implemented the framework +in a ROS2 environment. The approach was validated using kinematic simulations and +a hardware platform. Specifically, we demonstrated the approach on a 7-DoF +manipulator equipped with a force/torque sensor at the end-effector. + 
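+ A geometric sketch of the friction-compensation idea under a simple Coulomb
+assumption (not the paper's estimator; the friction coefficient and force values are
+made up): subtract the friction component acting along the sliding direction from the
+measured contact force and normalize what remains:
+
+import numpy as np
+
+def estimate_normal(f_measured, v_cmd, mu=0.3):
+    """Estimate the surface normal from a measured contact force by removing an
+    assumed Coulomb friction component acting opposite to the sliding direction."""
+    t = v_cmd / (np.linalg.norm(v_cmd) + 1e-9)                       # sliding direction
+    f_n_mag = np.linalg.norm(f_measured) / np.sqrt(1.0 + mu ** 2)    # |f| = fn * sqrt(1 + mu^2)
+    f_friction = -mu * f_n_mag * t                                   # friction opposes motion
+    f_normal = f_measured - f_friction                               # remove the friction bias
+    return f_normal / (np.linalg.norm(f_normal) + 1e-9)
+
+# Synthetic check: true normal along +z, sliding along +x, friction coefficient 0.3
+fn_true = np.array([0.0, 0.0, 10.0])
+v = np.array([0.02, 0.0, 0.0])
+f_meas = fn_true + (-0.3 * 10.0) * v / np.linalg.norm(v)   # normal force plus friction
+print(estimate_normal(f_meas, v))                          # approximately [0, 0, 1]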
+
+ comment: 8 pages, 21st International Conference on Ubiquitous Robots (UR + 2024), accepted +
+
+
+
+
+ + ☆ Admittance Control for Adaptive Remote Center of Motion in Robotic + Laparoscopic Surgery + + +
+ In laparoscopic robot-assisted minimally invasive surgery, the kinematic +control of the robot is subject to the remote center of motion (RCM) constraint +at the port of entry (e.g., trocar) into the patient's body. During surgery, +after the instrument is inserted through the trocar, intrinsic physiological +movements such as the patient's heartbeat, breathing process, and/or other +purposeful body repositioning may deviate the position of the port of entry. +This can cause a conflict between the registered RCM and the moved port of +entry. + To mitigate this conflict, we seek to utilize the interaction forces at the +RCM. We develop a novel framework that integrates admittance control into a +redundancy resolution method for the RCM kinematic constraint. Using the +force/torque sensory feedback at the base of the instrument driving mechanism +(IDM), the proposed framework estimates the forces at RCM, rejects forces +applied on other locations along the instrument, and uses them in the +admittance controller. In this paper, we report analysis from kinematic +simulations to validate the proposed framework. In addition, a hardware +platform has been completed, and future work is planned for experimental +validation. + +
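+ As background for the admittance controller, a one-dimensional textbook sketch (not
+the paper's RCM-constrained implementation; gains and the force profile are assumed):
+the estimated interaction force drives a virtual mass-damper-spring whose displacement
+can shift the registered RCM reference:
+
+import numpy as np
+
+def admittance_step(x, xd, f_ext, dt=0.001, M=1.0, D=20.0, K=50.0):
+    """One semi-implicit Euler step of M*xdd + D*xd + K*x = f_ext (single axis)."""
+    xdd = (f_ext - D * xd - K * x) / M
+    xd_new = xd + xdd * dt
+    x_new = x + xd_new * dt
+    return x_new, xd_new
+
+x, xd = 0.0, 0.0          # RCM reference offset and its velocity
+for step in range(2000):  # 2 s of simulation: constant lateral force, then release
+    f = 2.0 if step < 1000 else 0.0
+    x, xd = admittance_step(x, xd, f)
+print(round(x, 4))        # the offset decays back toward zero after the force is removed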
+
+ comment: 7 pages, 21st International Conference on Ubiquitous Robots (UR + 2024), accepted +
+
+
+
+
+ + ☆ A Ground Mobile Robot for Autonomous Terrestrial Laser Scanning-Based + Field Phenotyping + + +
+ Traditional field phenotyping methods are often manual, time-consuming, and +destructive, posing a challenge for breeding progress. To address this +bottleneck, robotics and automation technologies offer efficient sensing tools +to monitor field evolution and crop development throughout the season. This +study aimed to develop an autonomous ground robotic system for LiDAR-based +field phenotyping in plant breeding trials. A Husky platform was equipped with +a high-resolution three-dimensional (3D) laser scanner to collect in-field +terrestrial laser scanning (TLS) data without human intervention. To automate +the TLS process, a 3D ray casting analysis was implemented for optimal TLS site +planning, and a route optimization algorithm was utilized to minimize travel +distance during data collection. The platform was deployed in two cotton +breeding fields for evaluation, where it autonomously collected TLS data. The +system provided accurate pose information through RTK-GNSS positioning and +sensor fusion techniques, with average errors of less than 0.6 cm for location +and 0.38$^{\circ}$ for heading. The achieved localization accuracy allowed +point cloud registration with mean point errors of approximately 2 cm, +comparable to traditional TLS methods that rely on artificial targets and +manual sensor deployment. This work presents an autonomous phenotyping platform +that facilitates the quantitative assessment of plant traits under field +conditions of both large agricultural fields and small breeding trials to +contribute to the advancement of plant phenomics and breeding programs. + +
+
+ comment: Submitted to Journal of Field Robotics +
+
+
+
+
+ + ☆ LOSS-SLAM: Lightweight Open-Set Semantic Simultaneous Localization and + Mapping + + +
+ Enabling robots to understand the world in terms of objects is a critical +building block towards higher level autonomy. The success of foundation models +in vision has created the ability to segment and identify nearly all objects in +the world. However, utilizing such objects to localize the robot and build an +open-set semantic map of the world remains an open research question. In this +work, a system of identifying, localizing, and encoding objects is tightly +coupled with probabilistic graphical models for performing open-set semantic +simultaneous localization and mapping (SLAM). Results are presented +demonstrating that the proposed lightweight object encoding can be used to +perform more accurate object-based SLAM than existing open-set methods, +closed-set methods, and geometric methods while incurring a lower computational +overhead than existing open-set mapping methods. + +
+
+
+
+
+ + ♻ ☆ CenterGrasp: Object-Aware Implicit Representation Learning for + Simultaneous Shape Reconstruction and 6-DoF Grasp Estimation RA-L + + +
+ Reliable object grasping is a crucial capability for autonomous robots. +However, many existing grasping approaches focus on general clutter removal +without explicitly modeling objects and thus only relying on the visible local +geometry. We introduce CenterGrasp, a novel framework that combines object +awareness and holistic grasping. CenterGrasp learns a general object prior by +encoding shapes and valid grasps in a continuous latent space. It consists of +an RGB-D image encoder that leverages recent advances to detect objects and +infer their pose and latent code, and a decoder to predict shape and grasps for +each object in the scene. We perform extensive experiments on simulated as well +as real-world cluttered scenes and demonstrate strong scene reconstruction and +6-DoF grasp-pose estimation performance. Compared to the state of the art, +CenterGrasp achieves an improvement of 38.5 mm in shape reconstruction and 33 +percentage points on average in grasp success. We make the code and trained +models publicly available at http://centergrasp.cs.uni-freiburg.de. + +
+
+ comment: Accepted at RA-L. Video, code and models available at + http://centergrasp.cs.uni-freiburg.de +
+
+
+
+
+ + ♻ ☆ Advancements in Radar Odometry + + +
+ Radar odometry estimation has emerged as a critical technique in the field of +autonomous navigation, providing robust and reliable motion estimation under +various environmental conditions. Despite its potential, the complex nature of +radar signals and the inherent challenges associated with processing these +signals have limited the widespread adoption of this technology. This paper +aims to address these challenges by proposing novel improvements to an existing +method for radar odometry estimation, designed to enhance accuracy and +reliability in diverse scenarios. Our pipeline consists of filtering, motion +compensation, oriented surface points computation, smoothing, one-to-many radar +scan registration, and pose refinement. The developed method enforces local +understanding of the scene, by adding additional information through smoothing +techniques, and alignment of consecutive scans, as a refinement posterior to +the one-to-many registration. We present an in-depth investigation of the +contribution of each improvement to the localization accuracy, and we benchmark +our system on the sequences of the main datasets for radar understanding, i.e., +the Oxford Radar RobotCar, MulRan, and Boreas datasets. The proposed pipeline +is able to achieve superior results, on all scenarios considered and under +harsh environmental constraints. + +
+
+
+
+
+ + ♻ ☆ Controlling the Cascade: Kinematic Planning for N-ball Toss Juggling + + +
+ Dynamic movements are ubiquitous in human motor behavior as they tend to be +more efficient and can solve a broader range of skill domains than their +quasi-static counterparts. For decades, robotic juggling tasks have been among +the most frequently studied dynamic manipulation problems since the required +dynamic dexterity can be scaled to arbitrarily high difficulty. However, +successful approaches have been limited to basic juggling skills, indicating a +lack of understanding of the required constraints for dexterous toss juggling. +We present a detailed analysis of the toss juggling task, identifying the key +challenges and formalizing it as a trajectory optimization problem. Building on +our state-of-the-art, real-world toss juggling platform, we reach the +theoretical limits of toss juggling in simulation, evaluate a resulting +real-time controller in environments of varying difficulty and achieve robust +toss juggling of up to 17 balls on two anthropomorphic manipulators. + +
+
+
+
+
+ + ♻ ☆ A Methodology to Study the Impact of Spiking Neural Network Parameters + considering Event-Based Automotive Data + + +
+ Autonomous Driving (AD) systems are considered the future of human +mobility and transportation. Solving computer vision tasks such as image +classification and object detection/segmentation, with high accuracy and low +power/energy consumption, is highly needed to realize AD systems in real life. +These requirements can potentially be satisfied by Spiking Neural Networks +(SNNs). However, the state-of-the-art works in SNN-based AD systems still focus +on proposing network models that can achieve high accuracy, and they have not +systematically studied the roles of SNN parameters when used for learning +event-based automotive data. Therefore, we still lack understanding of how to +effectively develop SNN models for AD systems. Toward this, we propose a novel +methodology to systematically study and analyze the impact of SNN parameters +considering event-based automotive data, then leverage this analysis for +enhancing SNN developments. To do this, we first explore different settings of +SNN parameters that directly affect the learning mechanism (i.e., batch size, +learning rate, neuron threshold potential, and weight decay), then analyze the +accuracy results. Afterward, we propose techniques that jointly improve SNN +accuracy and reduce training time. Experimental results show that our +methodology can improve SNN models for AD systems beyond the +state-of-the-art, as it achieves higher accuracy (i.e., 86%) on the NCARS +dataset, and it can also achieve iso-accuracy (i.e., ~85% with standard +deviation less than 0.5%) while speeding up the training time by 1.9x. In this +manner, our research work provides a set of guidelines for SNN parameter +enhancements, thereby enabling the practical development of SNN-based AD +systems. + 
+
+ comment: 7 pages, 13 figures, 1 table +
+
+
+
+
+ + ♻ ☆ Evaluating Pedestrian Trajectory Prediction Methods with Respect to + Autonomous Driving + + +
+ In this paper, we assess the state of the art in pedestrian trajectory +prediction within the context of generating single trajectories, a critical +aspect aligning with the requirements in autonomous systems. The evaluation is +conducted on the widely-used ETH/UCY dataset where the Average Displacement +Error (ADE) and the Final Displacement Error (FDE) are reported. Alongside +this, we perform an ablation study to investigate the impact of the observed +motion history on prediction performance. To evaluate the scalability of each +approach when confronted with varying amounts of agents, the inference time of +each model is measured. Following a quantitative analysis, the resulting +predictions are compared in a qualitative manner, giving insight into the +strengths and weaknesses of current approaches. The results demonstrate that +although a constant velocity model (CVM) provides a good approximation of the +overall dynamics in the majority of cases, additional features need to be +incorporated to reflect common pedestrian behavior observed. Therefore, this +study presents a data-driven analysis with the intent to guide the future +development of pedestrian trajectory prediction algorithms. + +
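+ To make the reported metrics and the constant velocity baseline concrete, here is a
+minimal sketch on synthetic data (not from the paper; the toy track, observation
+length, and horizon are assumptions): the CVM extrapolates the last observed velocity,
+and ADE/FDE are the mean and final Euclidean errors over the prediction horizon:
+
+import numpy as np
+
+def cvm_predict(history, horizon=12):
+    """Constant velocity model: extrapolate the last observed velocity."""
+    v = history[-1] - history[-2]                  # last step velocity (x, y)
+    steps = np.arange(1, horizon + 1)[:, None]
+    return history[-1] + steps * v                 # (horizon, 2) future positions
+
+def ade_fde(pred, gt):
+    err = np.linalg.norm(pred - gt, axis=-1)
+    return err.mean(), err[-1]                     # Average / Final Displacement Error
+
+t = np.arange(20)[:, None]
+gt_full = np.hstack([0.5 * t, 0.1 * t ** 1.2])     # a mildly curving toy track
+obs, fut = gt_full[:8], gt_full[8:]
+pred = cvm_predict(obs, horizon=len(fut))
+print("ADE %.3f m, FDE %.3f m" % ade_fde(pred, fut))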
+
+ comment: Accepted in IEEE Transactions on Intelligent Transportation Systems + (T-ITS); 11 pages, 6 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Adaptive Line-Of-Sight guidance law based on vector fields path + following for underactuated unmanned surface vehicle + + +
+ The focus of this paper is to develop a methodology that enables an unmanned +surface vehicle (USV) to efficiently track a planned path. The introduction of +a vector field-based adaptive line-of-sight guidance law (VFALOS) for accurate +trajectory tracking and minimizing the overshoot response time during USV +tracking of curved paths improves the overall line-of-sight (LOS) guidance +method. These improvements contribute to faster convergence to the desired +path, reduce oscillations, and can mitigate the effects of persistent external +disturbances. It is shown that the proposed guidance law exhibits k-exponential +stability when converging to the desired path consisting of straight and curved +lines. The results in the paper show that the proposed method effectively +improves the accuracy of the USV tracking the desired path while ensuring the +safe operation of the USV. + 
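+ For readers unfamiliar with LOS guidance, a non-adaptive, straight-segment version
+is sketched below (the paper's contribution is the adaptive, vector-field variant; the
+constant lookahead distance and the positions here are assumptions):
+
+import numpy as np
+
+def los_heading(p, wp_a, wp_b, lookahead=5.0):
+    """Classic LOS guidance for a straight segment wp_a -> wp_b.
+    Returns the desired heading given the vehicle position p."""
+    d = wp_b - wp_a
+    path_angle = np.arctan2(d[1], d[0])
+    # Cross-track error: signed lateral distance from the path, in the path frame
+    e = -(p[0] - wp_a[0]) * np.sin(path_angle) + (p[1] - wp_a[1]) * np.cos(path_angle)
+    return path_angle + np.arctan2(-e, lookahead)
+
+p = np.array([2.0, 3.0])   # USV sitting 3 m to the left of a path along the x-axis
+psi_d = los_heading(p, np.array([0.0, 0.0]), np.array([50.0, 0.0]))
+print(np.degrees(psi_d))   # about -31 deg: steer back toward the path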
+
+
+
+
+ + ♻ ☆ A Survey of Optimization-based Task and Motion Planning: From Classical + To Learning Approaches + + +
+ Task and Motion Planning (TAMP) integrates high-level task planning and +low-level motion planning to equip robots with the autonomy to effectively +reason over long-horizon, dynamic tasks. Optimization-based TAMP focuses on +hybrid optimization approaches that define goal conditions via objective +functions and are capable of handling open-ended goals, robotic dynamics, and +physical interaction between the robot and the environment. Therefore, +optimization-based TAMP is particularly suited to solve highly complex, +contact-rich locomotion and manipulation problems. This survey provides a +comprehensive review on optimization-based TAMP, covering (i) planning domain +representations, including action description languages and temporal logic, +(ii) individual solution strategies for components of TAMP, including AI +planning and trajectory optimization (TO), and (iii) the dynamic interplay +between logic-based task planning and model-based TO. A particular focus of +this survey is to highlight the algorithm structures to efficiently solve TAMP, +especially hierarchical and distributed approaches. Additionally, the survey +emphasizes the synergy between classical methods and contemporary +learning-based innovations such as large language models. Furthermore, +future research directions for TAMP are discussed in this survey, highlighting +both algorithmic and application-specific challenges. + 
+
+ comment: 24 pages, 12 figures, submitted for review +
+
+
+
+
+ + ♻ ☆ DDM-Lag : A Diffusion-based Decision-making Model for Autonomous + Vehicles with Lagrangian Safety Enhancement + + +
+ Decision-making stands as a pivotal component in the realm of autonomous +vehicles (AVs), playing a crucial role in navigating the intricacies of +autonomous driving. Amidst the evolving landscape of data-driven methodologies, +enhancing decision-making performance in complex scenarios has emerged as a +prominent research focus. Despite considerable advancements, current +learning-based decision-making approaches exhibit potential for refinement, +particularly in aspects of policy articulation and safety assurance. To address +these challenges, we introduce DDM-Lag, a Diffusion Decision Model, augmented +with Lagrangian-based safety enhancements. This work conceptualizes the +sequential decision-making challenge inherent in autonomous driving as a +problem of generative modeling, adopting diffusion models as the medium for +assimilating patterns of decision-making. We introduce a hybrid policy update +strategy for diffusion models, amalgamating the principles of behavior cloning +and Q-learning, alongside the formulation of an Actor-Critic architecture for +the facilitation of updates. To augment the model's exploration process with a +layer of safety, we incorporate additional safety constraints, employing a +sophisticated policy optimization technique predicated on Lagrangian relaxation +to refine the policy learning endeavor comprehensively. Empirical evaluation of +our proposed decision-making methodology was conducted across a spectrum of +driving tasks, distinguished by their varying degrees of complexity and +environmental contexts. The comparative analysis with established baseline +methodologies elucidates our model's superior performance, particularly in +dimensions of safety and holistic efficacy. + +
+
+
+
+
+ + ♻ Improving Autonomous Driving Safety with POP: A Framework for Accurate + Partially Observed Trajectory Predictions + + +
+ Accurate trajectory prediction is crucial for safe and efficient autonomous
+driving, but handling partial observations presents significant challenges. To
+address this, we propose a novel trajectory prediction framework called Partial
+Observations Prediction (POP) for congested urban road scenarios. The framework
+consists of two key stages: self-supervised learning (SSL) and feature
+distillation. POP first employs SSL to help the model learn to reconstruct
+history representations, and then utilizes feature distillation as the
+fine-tuning task to transfer knowledge from the teacher model, which has been
+pre-trained with complete observations, to the student model, which has only a
+few observations. POP achieves comparable results to top-performing methods in
+open-loop experiments and outperforms the baseline method in closed-loop
+simulations, including safety metrics. Qualitative results illustrate the
+superiority of POP in providing reasonable and safe trajectory predictions.
+ &#13;
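+ The feature-distillation stage described above can be illustrated with a minimal
+ sketch in which a frozen teacher encodes the complete history and a student seeing
+ only a few frames is trained to match its features; the GRU encoder, feature size,
+ and MSE loss are assumptions, not the paper's implementation.
+import torch
+import torch.nn as nn
+
+class TrajEncoder(nn.Module):
+    def __init__(self, hidden=64):
+        super().__init__()
+        self.gru = nn.GRU(input_size=2, hidden_size=hidden, batch_first=True)
+    def forward(self, traj):               # traj: (B, T, 2) observed positions
+        _, h = self.gru(traj)
+        return h[-1]                       # (B, hidden) history feature
+
+teacher = TrajEncoder().eval()             # assumed pre-trained on full histories
+student = TrajEncoder()
+for p in teacher.parameters():
+    p.requires_grad_(False)
+
+optimizer = torch.optim.Adam(student.parameters(), lr=1e-3)
+full_history = torch.randn(8, 20, 2)       # complete observations (dummy data)
+partial_history = full_history[:, -5:, :]  # the student only sees a few frames
+
+distill_loss = nn.functional.mse_loss(student(partial_history),
+                                      teacher(full_history))
+distill_loss.backward()
+optimizer.step()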
+
+
+
+
+ + ♻ ☆ PROGrasp: Pragmatic Human-Robot Communication for Object Grasping ICRA 2024 + + +
+ Interactive Object Grasping (IOG) is the task of identifying and grasping the +desired object via human-robot natural language interaction. Current IOG +systems assume that a human user initially specifies the target object's +category (e.g., bottle). Inspired by pragmatics, where humans often convey +their intentions by relying on context to achieve goals, we introduce a new IOG +task, Pragmatic-IOG, and the corresponding dataset, Intention-oriented +Multi-modal Dialogue (IM-Dial). In our proposed task scenario, an +intention-oriented utterance (e.g., "I am thirsty") is initially given to the +robot. The robot should then identify the target object by interacting with a +human user. Based on the task setup, we propose a new robotic system that can +interpret the user's intention and pick up the target object, Pragmatic Object +Grasping (PROGrasp). PROGrasp performs Pragmatic-IOG by incorporating modules +for visual grounding, question asking, object grasping, and most importantly, +answer interpretation for pragmatic inference. Experimental results show that +PROGrasp is effective in offline (i.e., target object discovery) and online +(i.e., IOG with a physical robot arm) settings. Code and data are available at +https://github.com/gicheonkang/prograsp. + +
+
+ comment: ICRA 2024 +
+
+
+
+
+ + ♻ ☆ WaterVG: Waterway Visual Grounding based on Text-Guided Vision and + mmWave Radar + + +
+ The perception of waterways based on human intent is significant for
+autonomous navigation and operations of Unmanned Surface Vehicles (USVs) in
+water environments. Inspired by visual grounding, we introduce WaterVG, the
+first visual grounding dataset designed for USV-based waterway perception based
+on human prompts. WaterVG encompasses prompts describing multiple targets, with
+annotations at the instance level including bounding boxes and masks. Notably,
+WaterVG includes 11,568 samples with 34,987 referred targets, whose prompts
+integrate both visual and radar characteristics. This text-guided, two-sensor
+design pairs fine-grained text prompts with the visual and radar features of
+the referred targets. Moreover, we propose a low-power visual grounding
+model, Potamoi, which is a multi-task model with a well-designed Phased
+Heterogeneous Modality Fusion (PHMF) mode, including Adaptive Radar Weighting
+(ARW) and Multi-Head Slim Cross Attention (MHSCA). Specifically, ARW extracts
+the required radar features to fuse with vision for prompt alignment. MHSCA is an
+efficient fusion module with a remarkably small parameter count and FLOPs,
+elegantly fusing scenario context captured by the two sensors with linguistic
+features, and it performs strongly on visual grounding tasks. Comprehensive
+experiments and evaluations have been conducted on WaterVG, where our Potamoi
+achieves state-of-the-art performance compared with its counterparts.
+ &#13;
+
+ comment: 10 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ How Can Large Language Models Enable Better Socially Assistive + Human-Robot Interaction: A Brief Survey + + +
+ Socially assistive robots (SARs) have shown great success in providing
+personalized cognitive-affective support for user populations with special
+needs such as older adults, children with autism spectrum disorder (ASD), and
+individuals with mental health challenges. The large body of work on SAR
+demonstrates its potential to provide at-home support that complements
+clinic-based interventions delivered by mental health professionals, making
+these interventions more effective and accessible. However, there are still
+several major technical challenges that hinder SAR-mediated interactions and
+interventions from reaching human-level social intelligence and efficacy. With
+the recent advances in large language models (LLMs), there is an increased
+potential for novel applications within the field of SAR that can significantly
+expand the current capabilities of SARs. However, incorporating LLMs introduces
+new risks and ethical concerns that have not yet been encountered and must be
+carefully addressed to safely deploy these more advanced systems. In this
+work, we aim to conduct a brief survey on the use of LLMs in SAR technologies,
+and discuss the potentials and risks of applying LLMs to the following three
+major technical challenges of SAR: 1) natural language dialog; 2) multimodal
+understanding; 3) LLMs as robot policies.
+ &#13;
+
+ comment: 2 pages, accepted to the Proceedings of the AAAI Symposium Series, + 2024 +
+
+
+
+
+ + ♻ ☆ Caching-Augmented Lifelong Multi-Agent Path Finding + + +
+ Multi-Agent Path Finding (MAPF), which involves finding collision-free paths +for multiple robots, is crucial in various applications. Lifelong MAPF, where +targets are reassigned to agents as soon as they complete their initial +targets, offers a more accurate approximation of real-world warehouse planning. +In this paper, we present a novel mechanism named Caching-Augmented Lifelong +MAPF (CAL-MAPF), designed to improve the performance of Lifelong MAPF. We have +developed a new type of map grid called cache for temporary item storage and +replacement, and created a locking mechanism to improve the planning solution's +stability. A task assigner (TA) is designed for CAL-MAPF to allocate target +locations to agents and control agent status in different situations. CAL-MAPF +has been evaluated using various cache replacement policies and input task +distributions. We have identified three main factors significantly impacting +CAL-MAPF performance through experimentation: suitable input task distribution, +high cache hit rate, and smooth traffic. In general, CAL-MAPF has demonstrated +potential for performance improvements in certain task distributions, map and +agent configurations. + +
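+ One of the cache replacement policies such a mechanism could be evaluated with is
+ least-recently-used (LRU); the sketch below illustrates only that idea and omits the
+ map-grid and locking logic of CAL-MAPF. Class and method names are placeholders.
+from collections import OrderedDict
+
+class LRUCacheGrid:
+    def __init__(self, capacity):
+        self.capacity = capacity
+        self.slots = OrderedDict()            # item_id -> cache cell location
+
+    def lookup(self, item_id):
+        """Return the cache cell holding item_id, or None on a cache miss."""
+        if item_id not in self.slots:
+            return None
+        self.slots.move_to_end(item_id)       # mark as recently used
+        return self.slots[item_id]
+
+    def insert(self, item_id, cell):
+        """Place an item in the cache, evicting the least recently used one."""
+        if len(self.slots) >= self.capacity:
+            self.slots.popitem(last=False)    # evict the LRU item
+        self.slots[item_id] = cell
+
+cache = LRUCacheGrid(capacity=2)
+cache.insert("shelf_A", (3, 5)); cache.insert("shelf_B", (4, 5))
+print(cache.lookup("shelf_A"), cache.lookup("shelf_C"))   # (3, 5) None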
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 92 + +
+
+
+ + ☆ Sigma: Siamese Mamba Network for Multi-Modal Semantic Segmentation + + +
+ Multi-modal semantic segmentation significantly enhances AI agents'
+perception and scene understanding, especially under adverse conditions like
+low-light or overexposed environments. Leveraging additional modalities
+(X-modality) like thermal and depth alongside traditional RGB provides
+complementary information, enabling more robust and reliable segmentation. In
+this work, we introduce Sigma, a Siamese Mamba network for multi-modal semantic
+segmentation, utilizing the Selective Structured State Space Model, Mamba.
+Unlike conventional methods that rely on CNNs, with their limited local
+receptive fields, or Vision Transformers (ViTs), which offer global receptive
+fields at the cost of quadratic complexity, our model achieves global receptive
+field coverage with linear complexity. By employing a Siamese encoder and
+a novel Mamba fusion mechanism, we effectively select essential
+information from different modalities. A decoder is then developed to enhance
+the channel-wise modeling ability of the model. Our method, Sigma, is
+rigorously evaluated on both RGB-Thermal and RGB-Depth segmentation tasks,
+demonstrating its superiority and marking the first successful application of
+State Space Models (SSMs) in multi-modal perception tasks. Code is available at
+https://github.com/zifuwan/Sigma.
+ &#13;
+
+
+
+
+ + ☆ Watermark-based Detection and Attribution of AI-Generated Content + + +
+ Several companies--such as Google, Microsoft, and OpenAI--have deployed +techniques to watermark AI-generated content to enable proactive detection. +However, existing literature mainly focuses on user-agnostic detection. +Attribution aims to further trace back the user of a generative-AI service who +generated a given content detected as AI-generated. Despite its growing +importance, attribution is largely unexplored. In this work, we aim to bridge +this gap by providing the first systematic study on watermark-based, user-aware +detection and attribution of AI-generated content. Specifically, we +theoretically study the detection and attribution performance via rigorous +probabilistic analysis. Moreover, we develop an efficient algorithm to select +watermarks for the users to enhance attribution performance. Both our +theoretical and empirical results show that watermark-based detection and +attribution inherit the accuracy and (non-)robustness properties of the +watermarking method. + +
+
+
+
+
+ + ☆ Who Evaluates the Evaluations? Objectively Scoring Text-to-Image Prompt + Coherence Metrics with T2IScoreScore (TS2) + + +
+ With advances in the quality of text-to-image (T2I) models has come interest
+in benchmarking their prompt faithfulness: the semantic coherence of generated
+images to the prompts they were conditioned on. A variety of T2I faithfulness
+metrics have been proposed, leveraging advances in cross-modal embeddings and
+vision-language models (VLMs). However, these metrics are not rigorously
+compared and benchmarked, instead presented against a few weak baselines by
+correlation to human Likert scores over a set of easy-to-discriminate images.
+ We introduce T2IScoreScore (TS2), a curated set of semantic error graphs
+containing a prompt and a set of increasingly erroneous images. These allow us to
+rigorously judge whether a given prompt faithfulness metric can correctly order
+images with respect to their objective error count and significantly
+discriminate between different error nodes, using meta-metric scores derived
+from established statistical tests. Surprisingly, we find that the
+state-of-the-art VLM-based metrics (e.g., TIFA, DSG, LLMScore, VIEScore) we
+tested fail to significantly outperform simple feature-based metrics like
+CLIPScore, particularly on a hard subset of naturally-occurring T2I model
+errors. TS2 will enable the development of better T2I prompt faithfulness
+metrics through more rigorous comparison of their conformity to expected
+orderings and separations under objective criteria.
+ &#13;
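+ The ordering test described above can be illustrated with a rank correlation
+ between a metric's scores and the objective error counts; the scores below are
+ made up for illustration and are not results from the paper.
+from scipy.stats import spearmanr
+
+error_counts = [0, 1, 2, 3, 4]                    # images increasingly erroneous
+metric_scores = [0.91, 0.85, 0.80, 0.55, 0.40]    # hypothetical metric outputs
+
+# a faithful metric should decrease as the error count grows (rho close to -1)
+rho, p_value = spearmanr(error_counts, metric_scores)
+print(f"Spearman rho = {rho:.3f}, p = {p_value:.3f}")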
+
+ comment: 15 pages main, 9 pages appendices, 16 figures, 3 tables +
+
+
+
+
+ + ☆ Evaluating Adversarial Robustness: A Comparison Of FGSM, Carlini-Wagner + Attacks, And The Role of Distillation as Defense Mechanism + + +
+ This technical report delves into an in-depth exploration of adversarial +attacks specifically targeted at Deep Neural Networks (DNNs) utilized for image +classification. The study also investigates defense mechanisms aimed at +bolstering the robustness of machine learning models. The research focuses on +comprehending the ramifications of two prominent attack methodologies: the Fast +Gradient Sign Method (FGSM) and the Carlini-Wagner (CW) approach. These attacks +are examined concerning three pre-trained image classifiers: Resnext50_32x4d, +DenseNet-201, and VGG-19, utilizing the Tiny-ImageNet dataset. Furthermore, the +study proposes the robustness of defensive distillation as a defense mechanism +to counter FGSM and CW attacks. This defense mechanism is evaluated using the +CIFAR-10 dataset, where CNN models, specifically resnet101 and Resnext50_32x4d, +serve as the teacher and student models, respectively. The proposed defensive +distillation model exhibits effectiveness in thwarting attacks such as FGSM. +However, it is noted to remain susceptible to more sophisticated techniques +like the CW attack. The document presents a meticulous validation of the +proposed scheme. It provides detailed and comprehensive results, elucidating +the efficacy and limitations of the defense mechanisms employed. Through +rigorous experimentation and analysis, the study offers insights into the +dynamics of adversarial attacks on DNNs, as well as the effectiveness of +defensive strategies in mitigating their impact. + +
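+ For reference, the FGSM attack studied in the report amounts to a single
+ signed-gradient step; a minimal PyTorch sketch is shown below, with the epsilon
+ value, the [0, 1] pixel range, and the commented-out model choice as assumptions.
+import torch
+import torch.nn.functional as F
+
+def fgsm_attack(model, images, labels, epsilon=8 / 255):
+    """Return adversarial images perturbed by one epsilon-step along the
+    sign of the input gradient of the cross-entropy loss."""
+    images = images.clone().detach().requires_grad_(True)
+    loss = F.cross_entropy(model(images), labels)
+    loss.backward()
+    adv = images + epsilon * images.grad.sign()
+    return adv.clamp(0, 1).detach()         # assumes inputs scaled to [0, 1]
+
+# usage with any torchvision classifier, for example:
+# model = torchvision.models.resnext50_32x4d(weights="IMAGENET1K_V1").eval()
+# adv_batch = fgsm_attack(model, batch_images, batch_labels)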
+
+ comment: This report pertains to the Capstone Project done by Group 1 of the
+ Fall batch of 2023 students at Praxis Tech School, Kolkata, India. The
+ report consists of 35 pages and it includes 15 figures and 10 tables. This
+ is the preprint which will be submitted to an IEEE international
+ conference for review
+ &#13;
+
+
+
+
+ + ☆ DiffOp-net: A Differential Operator-based Fully Convolutional Network + for Unsupervised Deformable Image Registration + + +
+ Existing unsupervised deformable image registration methods usually rely on +metrics applied to the gradients of predicted displacement or velocity fields +as a regularization term to ensure transformation smoothness, which potentially +limits registration accuracy. In this study, we propose a novel approach to +enhance unsupervised deformable image registration by introducing a new +differential operator into the registration framework. This operator, acting on +the velocity field and mapping it to a dual space, ensures the smoothness of +the velocity field during optimization, facilitating accurate deformable +registration. In addition, to tackle the challenge of capturing large +deformations inside image pairs, we introduce a Cross-Coordinate Attention +module (CCA) and embed it into a proposed Fully Convolutional Networks +(FCNs)-based multi-resolution registration architecture. Evaluation experiments +are conducted on two magnetic resonance imaging (MRI) datasets. Compared to +various state-of-the-art registration approaches, including a traditional +algorithm and three representative unsupervised learning-based methods, our +method achieves superior accuracies, maintaining desirable diffeomorphic +properties, and exhibiting promising registration speed. + +
+
+
+
+
+ + ☆ Identity Decoupling for Multi-Subject Personalization of Text-to-Image + Models + + +
+ Text-to-image diffusion models have shown remarkable success in generating a +personalized subject based on a few reference images. However, current methods +struggle with handling multiple subjects simultaneously, often resulting in +mixed identities with combined attributes from different subjects. In this +work, we present MuDI, a novel framework that enables multi-subject +personalization by effectively decoupling identities from multiple subjects. +Our main idea is to utilize segmented subjects generated by the Segment +Anything Model for both training and inference, as a form of data augmentation +for training and initialization for the generation process. Our experiments +demonstrate that MuDI can produce high-quality personalized images without +identity mixing, even for highly similar subjects as shown in Figure 1. In +human evaluation, MuDI shows twice as many successes for personalizing multiple +subjects without identity mixing over existing baselines and is preferred over +70% compared to the strongest baseline. More results are available at +https://mudi-t2i.github.io/. + +
+
+ comment: Preprint. Project page: https://mudi-t2i.github.io/ +
+
+
+
+
+ + ☆ Physical Property Understanding from Language-Embedded Feature Fields CVPR 2024 + + +
+ Can computers perceive the physical properties of objects solely through +vision? Research in cognitive science and vision science has shown that humans +excel at identifying materials and estimating their physical properties based +purely on visual appearance. In this paper, we present a novel approach for +dense prediction of the physical properties of objects using a collection of +images. Inspired by how humans reason about physics through vision, we leverage +large language models to propose candidate materials for each object. We then +construct a language-embedded point cloud and estimate the physical properties +of each 3D point using a zero-shot kernel regression approach. Our method is +accurate, annotation-free, and applicable to any object in the open world. +Experiments demonstrate the effectiveness of the proposed approach in various +physical property reasoning tasks, such as estimating the mass of common +objects, as well as other properties like friction and hardness. + +
+
+ comment: CVPR 2024. Project page (with code): + https://ajzhai.github.io/NeRF2Physics/ +
+
+
+
+
+ + ☆ Image-Text Co-Decomposition for Text-Supervised Semantic Segmentation CVPR 2024 + + +
+ This paper addresses text-supervised semantic segmentation, aiming to learn a +model capable of segmenting arbitrary visual concepts within images by using +only image-text pairs without dense annotations. Existing methods have +demonstrated that contrastive learning on image-text pairs effectively aligns +visual segments with the meanings of texts. We notice that there is a +discrepancy between text alignment and semantic segmentation: A text often +consists of multiple semantic concepts, whereas semantic segmentation strives +to create semantically homogeneous segments. To address this issue, we propose +a novel framework, Image-Text Co-Decomposition (CoDe), where the paired image +and text are jointly decomposed into a set of image regions and a set of word +segments, respectively, and contrastive learning is developed to enforce +region-word alignment. To work with a vision-language model, we present a +prompt learning mechanism that derives an extra representation to highlight an +image segment or a word segment of interest, with which more effective features +can be extracted from that segment. Comprehensive experimental results +demonstrate that our method performs favorably against existing text-supervised +semantic segmentation methods on six benchmark datasets. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Robust Gaussian Splatting + + +
+ In this paper, we address common error sources for 3D Gaussian Splatting
+(3DGS) including blur, imperfect camera poses, and color inconsistencies, with
+the goal of improving its robustness for practical applications like
+reconstructions from handheld phone captures. Our main contribution involves
+modeling motion blur as a Gaussian distribution over camera poses, allowing us
+to address both camera pose refinement and motion blur correction in a unified
+way. Additionally, we propose mechanisms for defocus blur compensation and for
+addressing color inconsistencies caused by ambient light, shadows, or
+camera-related factors like varying white balance settings. Our proposed
+solutions integrate seamlessly with the 3DGS formulation while
+maintaining its benefits in terms of training efficiency and rendering speed.
+We experimentally validate our contributions on relevant benchmark datasets
+including Scannet++ and Deblur-NeRF, obtaining state-of-the-art results and
+thus consistent improvements over relevant baselines.
+ &#13;
+
+
+
+
+ + ☆ Deep-learning Segmentation of Small Volumes in CT images for + Radiotherapy Treatment Planning + + +
+ Our understanding of organs at risk is progressing to include physically small
+tissues such as coronary arteries, and the radiosensitivities of many small
+organs and tissues are high. Therefore, the accurate segmentation of small
+volumes in external radiotherapy is crucial to protect them from
+over-irradiation. Moreover, with the development of particle therapy and
+on-board imaging, the treatment becomes more accurate and precise. The purpose
+of this work is to optimize organ segmentation algorithms for small organs. We
+used 50 three-dimensional (3-D) computed tomography (CT) head and neck images
+from the StructSeg2019 challenge to develop a general-purpose V-Net model to
+segment 20 organs in the head and neck region. We applied specific strategies
+to improve the segmentation accuracy of the small volumes in this anatomical
+region, i.e., the lens of the eye. Then, we used 17 additional head images from
+OSF healthcare to validate the robustness of the V-Net model optimized for
+small-volume segmentation. With the study of the StructSeg2019 images, we found
+that the optimization of the image normalization range and classification
+threshold yielded a segmentation improvement of the lens of the eye of
+approximately 50%, compared to the use of the V-Net not optimized for small
+volumes. We used the optimized model to segment 17 images acquired using
+heterogeneous protocols. We obtained comparable Dice coefficient values for the
+clinical and StructSeg2019 images (0.61 ± 0.07 and 0.58 ± 0.10 for the left and
+right lens of the eye, respectively).
+ &#13;
+
+
+
+
+ + ☆ SCAResNet: A ResNet Variant Optimized for Tiny Object Detection in + Transmission and Distribution Towers + + +
+ Traditional deep learning-based object detection networks often resize images +during the data preprocessing stage to achieve a uniform size and scale in the +feature map. Resizing is done to facilitate model propagation and fully +connected classification. However, resizing inevitably leads to object +deformation and loss of valuable information in the images. This drawback +becomes particularly pronounced for tiny objects like distribution towers with +linear shapes and few pixels. To address this issue, we propose abandoning the +resizing operation. Instead, we introduce Positional-Encoding Multi-head +Criss-Cross Attention. This allows the model to capture contextual information +and learn from multiple representation subspaces, effectively enriching the +semantics of distribution towers. Additionally, we enhance Spatial Pyramid +Pooling by reshaping three pooled feature maps into a new unified one while +also reducing the computational burden. This approach allows images of +different sizes and scales to generate feature maps with uniform dimensions and +can be employed in feature map propagation. Our SCAResNet incorporates these +aforementioned improvements into the backbone network ResNet. We evaluated our +SCAResNet using the Electric Transmission and Distribution Infrastructure +Imagery dataset from Duke University. Without any additional tricks, we +employed various object detection models with Gaussian Receptive Field based +Label Assignment as the baseline. When incorporating the SCAResNet into the +baseline model, we achieved a 2.1% improvement in mAPs. This demonstrates the +advantages of our SCAResNet in detecting transmission and distribution towers +and its value in tiny object detection. The source code is available at +https://github.com/LisavilaLee/SCAResNet_mmdet. + +
+
+
+
+
+ + ☆ Noisy Label Processing for Classification: A Survey + + +
+ In recent years, deep neural networks (DNNs) have achieved remarkable
+success in computer vision tasks, and the success of DNNs often depends
+greatly on the richness of data. However, the acquisition process of data and
+high-quality ground truth requires a lot of manpower and money. In the long,
+tedious process of data annotation, annotators are prone to make mistakes,
+resulting in incorrect image labels, i.e., noisy labels. The emergence of
+noisy labels is inevitable. Moreover, since research shows that DNNs can easily
+fit noisy labels, the existence of noisy labels will cause significant damage
+to the model training process. Therefore, it is crucial to combat noisy labels
+for computer vision tasks, especially for classification tasks. In this survey,
+we first comprehensively review the evolution of different deep learning
+approaches for combating noisy labels in the image classification task. In
+addition, we also review different noise patterns that have been proposed to
+design robust algorithms. Furthermore, we explore the inner pattern of
+real-world label noise and propose an algorithm to generate a synthetic label
+noise pattern guided by real-world data. We test the algorithm on the
+well-known real-world dataset CIFAR-10N to form a new real-world data-guided
+synthetic benchmark and evaluate some typical noise-robust methods on the
+benchmark.
+ &#13;
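+ The simplest of the noise patterns such surveys study, symmetric (uniform) label
+ noise, can be injected as sketched below; the 20% noise rate is an arbitrary choice,
+ and this is not the real-world-guided noise generation algorithm proposed in the paper.
+import numpy as np
+
+def add_symmetric_noise(labels, num_classes, noise_rate=0.2, seed=0):
+    """Flip a fraction `noise_rate` of labels to a different, uniformly
+    chosen class. labels: 1-D integer array of clean class indices."""
+    rng = np.random.default_rng(seed)
+    labels = labels.copy()
+    flip = rng.random(len(labels)) < noise_rate
+    # shifting by 1..num_classes-1 guarantees the new label differs from the old
+    random_shift = rng.integers(1, num_classes, size=flip.sum())
+    labels[flip] = (labels[flip] + random_shift) % num_classes
+    return labels
+
+clean = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
+noisy = add_symmetric_noise(clean, num_classes=10)
+print(noisy, (noisy != clean).mean())   # corrupted labels and actual noise rate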
+
+
+
+
+ + ☆ MarsSeg: Mars Surface Semantic Segmentation with Multi-level Extractor + and Connector + + +
+ The segmentation and interpretation of the Martian surface play a pivotal
+role in Mars exploration, providing essential data for the trajectory planning
+and obstacle avoidance of rovers. However, the complex topography, similar
+surface features, and the lack of extensive annotated data pose significant
+challenges to the high-precision semantic segmentation of the Martian surface.
+To address these challenges, we propose a novel encoder-decoder based Mars
+segmentation network, termed MarsSeg. Specifically, we employ an
+encoder-decoder structure with a minimized number of down-sampling layers to
+preserve local details. To facilitate a high-level semantic understanding
+across the shallow multi-level feature maps, we introduce a feature enhancement
+connection layer situated between the encoder and decoder. This layer
+incorporates Mini Atrous Spatial Pyramid Pooling (Mini-ASPP), Polarized
+Self-Attention (PSA), and Strip Pyramid Pooling Module (SPPM). The Mini-ASPP
+and PSA are specifically designed for shallow feature enhancement, thereby
+enabling the expression of local details and small objects. Conversely, the
+SPPM is employed for deep feature enhancement, facilitating the extraction of
+high-level semantic category-related information. Experimental results derived
+from the Mars-Seg and AI4Mars datasets substantiate that the proposed MarsSeg
+outperforms other state-of-the-art methods in segmentation performance,
+validating the efficacy of each proposed component.
+ &#13;
+
+
+
+
+ + ☆ Improving Detection in Aerial Images by Capturing Inter-Object + Relationships + + +
+ In many image domains, the spatial distribution of objects in a scene +exhibits meaningful patterns governed by their semantic relationships. In most +modern detection pipelines, however, the detection proposals are processed +independently, overlooking the underlying relationships between objects. In +this work, we introduce a transformer-based approach to capture these +inter-object relationships to refine classification and regression outcomes for +detected objects. Building on two-stage detectors, we tokenize the region of +interest (RoI) proposals to be processed by a transformer encoder. Specific +spatial and geometric relations are incorporated into the attention weights and +adaptively modulated and regularized. Experimental results demonstrate that the +proposed method achieves consistent performance improvement on three benchmarks +including DOTA-v1.0, DOTA-v1.5, and HRSC 2016, especially ranking first on both +DOTA-v1.5 and HRSC 2016. Specifically, our new method has an increase of 1.59 +mAP on DOTA-v1.0, 4.88 mAP on DOTA-v1.5, and 2.1 mAP on HRSC 2016, +respectively, compared to the baselines. + +
+
+
+
+
+ + ☆ 3D Facial Expressions through Analysis-by-Neural-Synthesis + + +
+ While existing methods for 3D face reconstruction from in-the-wild images +excel at recovering the overall face shape, they commonly miss subtle, extreme, +asymmetric, or rarely observed expressions. We improve upon these methods with +SMIRK (Spatial Modeling for Image-based Reconstruction of Kinesics), which +faithfully reconstructs expressive 3D faces from images. We identify two key +limitations in existing methods: shortcomings in their self-supervised training +formulation, and a lack of expression diversity in the training images. For +training, most methods employ differentiable rendering to compare a predicted +face mesh with the input image, along with a plethora of additional loss +functions. This differentiable rendering loss not only has to provide +supervision to optimize for 3D face geometry, camera, albedo, and lighting, +which is an ill-posed optimization problem, but the domain gap between +rendering and input image further hinders the learning process. Instead, SMIRK +replaces the differentiable rendering with a neural rendering module that, +given the rendered predicted mesh geometry, and sparsely sampled pixels of the +input image, generates a face image. As the neural rendering gets color +information from sampled image pixels, supervising with neural rendering-based +reconstruction loss can focus solely on the geometry. Further, it enables us to +generate images of the input identity with varying expressions while training. +These are then utilized as input to the reconstruction model and used as +supervision with ground truth geometry. This effectively augments the training +data and enhances the generalization for diverse expressions. Our qualitative, +quantitative and particularly our perceptual evaluations demonstrate that SMIRK +achieves the new state-of-the art performance on accurate expression +reconstruction. Project webpage: https://georgeretsi.github.io/smirk/. + +
+
+
+
+
+ + ☆ Dynamic Prompt Optimizing for Text-to-Image Generation CVPR 2024 + + +
+ Text-to-image generative models, specifically those based on diffusion models +like Imagen and Stable Diffusion, have made substantial advancements. Recently, +there has been a surge of interest in the delicate refinement of text prompts. +Users assign weights or alter the injection time steps of certain words in the +text prompts to improve the quality of generated images. However, the success +of fine-control prompts depends on the accuracy of the text prompts and the +careful selection of weights and time steps, which requires significant manual +intervention. To address this, we introduce the \textbf{P}rompt +\textbf{A}uto-\textbf{E}diting (PAE) method. Besides refining the original +prompts for image generation, we further employ an online reinforcement +learning strategy to explore the weights and injection time steps of each word, +leading to the dynamic fine-control prompts. The reward function during +training encourages the model to consider aesthetic score, semantic +consistency, and user preferences. Experimental results demonstrate that our +proposed method effectively improves the original prompts, generating visually +more appealing images while maintaining semantic alignment. Code is available +at https://github.com/Mowenyii/PAE. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Label Propagation for Zero-shot Classification with Vision-Language + Models CVPR 2024 + + +
+ Vision-Language Models (VLMs) have demonstrated impressive performance on +zero-shot classification, i.e. classification when provided merely with a list +of class names. In this paper, we tackle the case of zero-shot classification +in the presence of unlabeled data. We leverage the graph structure of the +unlabeled data and introduce ZLaP, a method based on label propagation (LP) +that utilizes geodesic distances for classification. We tailor LP to graphs +containing both text and image features and further propose an efficient method +for performing inductive inference based on a dual solution and a +sparsification step. We perform extensive experiments to evaluate the +effectiveness of our method on 14 common datasets and show that ZLaP +outperforms the latest related works. Code: +https://github.com/vladan-stojnic/ZLaP + +
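+ For background, the textbook label propagation iteration that ZLaP builds on can
+ be sketched as below; ZLaP itself additionally uses geodesic distances, a dual
+ solution for inductive inference, and a sparsification step, none of which are
+ shown here.
+import numpy as np
+
+def label_propagation(W, Y0, alpha=0.99, iters=50):
+    """W: (N, N) non-negative affinity matrix, Y0: (N, C) initial label matrix
+    (zero rows for unlabeled points). Returns propagated class scores."""
+    d = W.sum(axis=1)
+    D_inv_sqrt = np.diag(1.0 / np.sqrt(np.maximum(d, 1e-12)))
+    S = D_inv_sqrt @ W @ D_inv_sqrt            # symmetrically normalized graph
+    F = Y0.copy()
+    for _ in range(iters):
+        F = alpha * S @ F + (1 - alpha) * Y0   # diffuse, keep the seed labels
+    return F
+
+# 3 points, 2 classes: point 2 is unlabeled and inherits from its nearest neighbor
+W = np.array([[0.0, 1.0, 0.9], [1.0, 0.0, 0.1], [0.9, 0.1, 0.0]])
+Y0 = np.array([[1.0, 0.0], [0.0, 1.0], [0.0, 0.0]])
+print(label_propagation(W, Y0).argmax(axis=1))   # point 2 is assigned class 0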
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Score identity Distillation: Exponentially Fast Distillation of + Pretrained Diffusion Models for One-Step Generation + + +
+ We introduce Score identity Distillation (SiD), an innovative data-free +method that distills the generative capabilities of pretrained diffusion models +into a single-step generator. SiD not only facilitates an exponentially fast +reduction in Fr\'echet inception distance (FID) during distillation but also +approaches or even exceeds the FID performance of the original teacher +diffusion models. By reformulating forward diffusion processes as semi-implicit +distributions, we leverage three score-related identities to create an +innovative loss mechanism. This mechanism achieves rapid FID reduction by +training the generator using its own synthesized images, eliminating the need +for real data or reverse-diffusion-based generation, all accomplished within +significantly shortened generation time. Upon evaluation across four benchmark +datasets, the SiD algorithm demonstrates high iteration efficiency during +distillation and surpasses competing distillation approaches, whether they are +one-step or few-step, data-free, or dependent on training data, in terms of +generation quality. This achievement not only redefines the benchmarks for +efficiency and effectiveness in diffusion distillation but also in the broader +field of diffusion-based generation. Our PyTorch implementation will be +publicly accessible on GitHub. + +
+
+
+
+
+ + No Time to Train: Empowering Non-Parametric Networks for Few-shot 3D + Scene Segmentation CVPR + + +
+ To reduce the reliance on large-scale datasets, recent works in 3D
+segmentation resort to few-shot learning. Current 3D few-shot segmentation
+methods first pre-train models on 'seen' classes, and then evaluate their
+generalization performance on 'unseen' classes. However, the prior pre-training
+stage not only introduces excessive time overhead but also incurs a significant
+domain gap on 'unseen' classes. To tackle these issues, we propose a
+Non-parametric Network for few-shot 3D Segmentation, Seg-NN, and its Parametric
+variant, Seg-PN. Without training, Seg-NN extracts dense representations by
+hand-crafted filters and achieves comparable performance to existing parametric
+models. Due to the elimination of pre-training, Seg-NN can alleviate the domain
+gap issue and save a substantial amount of time. Based on Seg-NN, Seg-PN only
+requires training a lightweight QUEry-Support Transferring (QUEST) module,
+which enhances the interaction between the support set and query set.
+Experiments suggest that Seg-PN outperforms the previous state-of-the-art method by
++4.19% and +7.71% mIoU on the S3DIS and ScanNet datasets, respectively, while
+reducing training time by -90%, indicating its effectiveness and efficiency.
+ &#13;
+
+ comment: CVPR Highlight. Code is available at + https://github.com/yangyangyang127/Seg-NN. arXiv admin note: text overlap + with arXiv:2308.12961 +
+
+
+
+
+ + ☆ Dynamic Risk Assessment Methodology with an LDM-based System for Parking + Scenarios + + +
+ This paper describes the methodology for building a dynamic risk assessment +for ADAS (Advanced Driving Assistance Systems) algorithms in parking scenarios, +fusing exterior and interior perception for a better understanding of the scene +and a more comprehensive risk estimation. This includes the definition of a +dynamic risk methodology that depends on the situation from inside and outside +the vehicle, the creation of a multi-sensor dataset of risk assessment for ADAS +benchmarking purposes, and a Local Dynamic Map (LDM) that fuses data from the +exterior and interior of the car to build an LDM-based Dynamic Risk Assessment +System (DRAS). + +
+
+
+
+
+ + ☆ InstructHumans: Editing Animated 3D Human Textures with Instructions + + +
+ We present InstructHumans, a novel framework for instruction-driven 3D human
+texture editing. Existing text-based editing methods use Score Distillation
+Sampling (SDS) to distill guidance from generative models. This work shows that
+naively using such scores is harmful to editing as they destroy consistency
+with the source avatar. Instead, we propose an alternate SDS for Editing
+(SDS-E) that selectively incorporates subterms of SDS across diffusion
+timesteps. We further enhance SDS-E with spatial smoothness regularization and
+gradient-based viewpoint sampling to achieve high-quality edits with sharp and
+high-fidelity detailing. InstructHumans significantly outperforms existing 3D
+editing methods, remaining consistent with the initial avatar while staying
+faithful to the textual instructions. Project page: https://jyzhu.top/instruct-humans .
+ &#13;
+
+ comment: Project Page: https://jyzhu.top/instruct-humans +
+
+
+
+
+ + ☆ MM-Gaussian: 3D Gaussian-based Multi-modal Fusion for Localization and + Reconstruction in Unbounded Scenes + + +
+ Localization and mapping are critical tasks for various applications such as +autonomous vehicles and robotics. The challenges posed by outdoor environments +present particular complexities due to their unbounded characteristics. In this +work, we present MM-Gaussian, a LiDAR-camera multi-modal fusion system for +localization and mapping in unbounded scenes. Our approach is inspired by the +recently developed 3D Gaussians, which demonstrate remarkable capabilities in +achieving high rendering quality and fast rendering speed. Specifically, our +system fully utilizes the geometric structure information provided by +solid-state LiDAR to address the problem of inaccurate depth encountered when +relying solely on visual solutions in unbounded, outdoor scenarios. +Additionally, we utilize 3D Gaussian point clouds, with the assistance of +pixel-level gradient descent, to fully exploit the color information in photos, +thereby achieving realistic rendering effects. To further bolster the +robustness of our system, we designed a relocalization module, which assists in +returning to the correct trajectory in the event of a localization failure. +Experiments conducted in multiple scenarios demonstrate the effectiveness of +our method. + +
+
+ comment: 7 pages, 5 figures +
+
+
+
+
+ + ☆ Framework to generate perfusion map from CT and CTA images in patients + with acute ischemic stroke: A longitudinal and cross-sectional study + + +
+ Stroke is a leading cause of disability and death. Effective treatment +decisions require early and informative vascular imaging. 4D perfusion imaging +is ideal but rarely available within the first hour after stroke, whereas plain +CT and CTA usually are. Hence, we propose a framework to extract a predicted +perfusion map (PPM) derived from CT and CTA images. In all eighteen patients, +we found significantly high spatial similarity (with average Spearman's +correlation = 0.7893) between our predicted perfusion map (PPM) and the T-max +map derived from 4D-CTP. Voxelwise correlations between the PPM and National +Institutes of Health Stroke Scale (NIHSS) subscores for L/R hand motor, gaze, +and language on a large cohort of 2,110 subjects reliably mapped symptoms to +expected infarct locations. Therefore our PPM could serve as an alternative for +4D perfusion imaging, if the latter is unavailable, to investigate blood +perfusion in the first hours after hospital admission. + +
+
+ comment: Accepted and presented in SWITCH2023: Stroke Workshop on Imaging and + Treatment CHallenges (MICCAI 2023, Vancouver Canada) +
+
+
+
+
+ + ☆ Neural-Symbolic VideoQA: Learning Compositional Spatio-Temporal + Reasoning for Real-world Video Question Answering + + +
+ Compositional spatio-temporal reasoning poses a significant challenge in the
+field of video question answering (VideoQA). Existing approaches struggle to
+establish effective symbolic reasoning structures, which are crucial for
+answering compositional spatio-temporal questions. To address this challenge,
+we propose a neural-symbolic framework called Neural-Symbolic VideoQA
+(NS-VideoQA), specifically designed for real-world VideoQA tasks. The
+uniqueness and superiority of NS-VideoQA are two-fold: 1) It proposes a Scene
+Parser Network (SPN) to transform static-dynamic video scenes into Symbolic
+Representation (SR), structuralizing persons, objects, relations, and action
+chronologies. 2) A Symbolic Reasoning Machine (SRM) is designed for top-down
+question decompositions and bottom-up compositional reasoning. Specifically, a
+polymorphic program executor is constructed for internally consistent reasoning
+from SR to the final answer. As a result, our NS-VideoQA not only improves
+compositional spatio-temporal reasoning in real-world VideoQA tasks, but also
+enables step-by-step error analysis by tracing the intermediate results.
+Experimental evaluations on the AGQA Decomp benchmark demonstrate the
+effectiveness of the proposed NS-VideoQA framework. Empirical studies further
+confirm that NS-VideoQA exhibits internal consistency in answering
+compositional questions and significantly improves the capability of
+spatio-temporal and logical inference for VideoQA tasks.
+ &#13;
+
+
+
+
+ + ☆ Finsler-Laplace-Beltrami Operators with Application to Shape Analysis + + +
+ The Laplace-Beltrami operator (LBO) emerges from studying manifolds equipped
+with a Riemannian metric. It is often called the Swiss army knife of geometry
+processing as it allows capturing intrinsic shape information and gives rise
+to heat diffusion, geodesic distances, and a multitude of shape descriptors. It
+also plays a central role in geometric deep learning. In this work, we explore
+Finsler manifolds as a generalization of Riemannian manifolds. We revisit the
+Finsler heat equation and derive a Finsler heat kernel and a
+Finsler-Laplace-Beltrami Operator (FLBO): a novel theoretically justified
+anisotropic Laplace-Beltrami operator (ALBO). In experimental evaluations we
+demonstrate that the proposed FLBO is a valuable alternative to the traditional
+Riemannian-based LBO and ALBOs for spatial filtering and shape correspondence
+estimation. We hope that the proposed Finsler heat kernel and the FLBO will
+inspire further exploration of Finsler geometry in the computer vision
+community.
+ &#13;
+
+
+
+
+ + ☆ Physics-Inspired Synthesized Underwater Image Dataset + + +
+ This paper introduces the physics-inspired synthesized underwater image +dataset (PHISWID), a dataset tailored for enhancing underwater image processing +through physics-inspired image synthesis. Deep learning approaches to +underwater image enhancement typically demand extensive datasets, yet acquiring +paired clean and degraded underwater ones poses significant challenges. While +several underwater image datasets have been proposed using physics-based +synthesis, a publicly accessible collection has been lacking. Additionally, +most underwater image synthesis approaches do not intend to reproduce +atmospheric scenes, resulting in incomplete enhancement. PHISWID addresses this +gap by offering a set of paired ground-truth (atmospheric) and synthetically +degraded underwater images, showcasing not only color degradation but also the +often-neglected effects of marine snow, a composite of organic matter and sand +particles that considerably impairs underwater image clarity. The dataset +applies these degradations to atmospheric RGB-D images, enhancing the dataset's +realism and applicability. PHISWID is particularly valuable for training deep +neural networks in a supervised learning setting and for objectively assessing +image quality in benchmark analyses. Our results reveal that even a basic U-Net +architecture, when trained with PHISWID, substantially outperforms existing +methods in underwater image enhancement. We intend to release PHISWID publicly, +contributing a significant resource to the advancement of underwater imaging +technology. + +
+
+
+
+
+ + ☆ Rolling the dice for better deep learning performance: A study of + randomness techniques in deep neural networks + + +
+ This paper investigates how various randomization techniques impact Deep
+Neural Networks (DNNs). Randomization techniques, like weight noise and dropout,
+aid in reducing overfitting and enhancing generalization, but their interactions
+are poorly understood. The study categorizes randomness techniques into four types
+and proposes new methods: adding noise to the loss function and random masking
+of gradient updates. Using Particle Swarm Optimizer (PSO) for hyperparameter
+optimization, it explores optimal configurations across MNIST, FASHION-MNIST,
+CIFAR10, and CIFAR100 datasets. Over 30,000 configurations are evaluated,
+revealing data augmentation and weight initialization randomness as main
+performance contributors. Correlation analysis shows different optimizers
+prefer distinct randomization types. The complete implementation and dataset
+are available on GitHub.
+ &#13;
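+ The two newly proposed techniques, perturbing the loss function and randomly
+ masking gradient updates, might look roughly as follows; the multiplicative form
+ of the loss perturbation, the masking probability, and the toy model are
+ assumptions rather than the paper's configuration.
+import torch
+import torch.nn as nn
+
+model = nn.Linear(10, 2)                       # toy model (placeholder)
+optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
+x, y = torch.randn(32, 10), torch.randint(0, 2, (32,))
+
+loss = nn.functional.cross_entropy(model(x), y)
+# (1) perturb the loss: a purely additive constant would not change the
+# gradients, so a multiplicative perturbation is used here as one reading.
+loss = loss * (1.0 + 0.01 * torch.randn(()))
+loss.backward()
+
+# (2) randomly mask a fraction of the gradient entries before the update step.
+with torch.no_grad():
+    for p in model.parameters():
+        if p.grad is not None:
+            p.grad *= torch.bernoulli(torch.full_like(p.grad, 0.9))
+optimizer.step()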
+
+
+
+
+ + ☆ Towards Efficient and Accurate CT Segmentation via Edge-Preserving + Probabilistic Downsampling + + +
+ Downsampling images and labels, often necessitated by limited resources or to
+expedite network training, leads to the loss of small objects and thin
+boundaries. This undermines the segmentation network's capacity to interpret
+images accurately and predict detailed labels, resulting in diminished
+performance compared to processing at original resolutions. This situation
+exemplifies the trade-off between efficiency and accuracy, with higher
+downsampling factors further impairing segmentation outcomes. Preserving
+information during downsampling is especially critical for medical image
+segmentation tasks. To tackle this challenge, we introduce a novel method named
+Edge-preserving Probabilistic Downsampling (EPD). It utilizes class uncertainty
+within a local window to produce soft labels, with the window size dictating
+the downsampling factor. This enables a network to produce quality predictions
+at low resolutions. Beyond preserving edge details more effectively than
+conventional nearest-neighbor downsampling, a similar algorithm applied to the
+images surpasses bilinear interpolation in image downsampling, enhancing
+overall performance. Our method significantly improved Intersection over Union
+(IoU) by 2.85%, 8.65%, and 11.89% when downsampling data to 1/2, 1/4, and 1/8,
+respectively, compared to conventional interpolation methods.
+ &#13;
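+ The general idea of window-based soft labels can be sketched by average-pooling
+ one-hot labels with the downsampling factor as the window size; this illustrates
+ the principle only and is not the exact EPD algorithm from the paper.
+import torch
+import torch.nn.functional as F
+
+def soft_label_downsample(labels, num_classes, factor):
+    """labels: (H, W) integer label map -> (C, H/factor, W/factor) soft labels
+    holding per-class frequencies inside each factor x factor window."""
+    one_hot = F.one_hot(labels, num_classes).permute(2, 0, 1).float()
+    return F.avg_pool2d(one_hot.unsqueeze(0), kernel_size=factor).squeeze(0)
+
+labels = torch.tensor([[0, 0, 1, 1],
+                       [0, 0, 1, 1],
+                       [0, 2, 2, 2],
+                       [2, 2, 2, 2]])
+soft = soft_label_downsample(labels, num_classes=3, factor=2)
+print(soft)   # e.g. the bottom-left window mixes classes 0 and 2 -> [0.25, 0, 0.75]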
+
+ comment: 5 pages (4 figures, 1 table); This work has been submitted to the + IEEE Signal Processing Letters. Copyright may be transferred without notice, + after which this version may no longer be accessible +
+
+
+
+
+ + ☆ RaSim: A Range-aware High-fidelity RGB-D Data Simulation Pipeline for + Real-world Applications ICRA'24 + + +
+ In robotic vision, a de-facto paradigm is to learn in simulated environments +and then transfer to real-world applications, which poses an essential +challenge in bridging the sim-to-real domain gap. While mainstream works tackle +this problem in the RGB domain, we focus on depth data synthesis and develop a +range-aware RGB-D data simulation pipeline (RaSim). In particular, +high-fidelity depth data is generated by imitating the imaging principle of +real-world sensors. A range-aware rendering strategy is further introduced to +enrich data diversity. Extensive experiments show that models trained with +RaSim can be directly applied to real-world scenarios without any finetuning +and excel at downstream RGB-D perception tasks. + +
+
+ comment: accepted by ICRA'24 +
+
+
+
+
+ + ☆ Deep Learning for Satellite Image Time Series Analysis: A Review + + +
+ Earth observation (EO) satellite missions have been providing detailed images +about the state of the Earth and its land cover for over 50 years. Long term +missions, such as NASA's Landsat, Terra, and Aqua satellites, and more +recently, the ESA's Sentinel missions, record images of the entire world every +few days. Although single images provide point-in-time data, repeated images of +the same area, or satellite image time series (SITS) provide information about +the changing state of vegetation and land use. These SITS are useful for +modeling dynamic processes and seasonal changes such as plant phenology. They +have potential benefits for many aspects of land and natural resource +management, including applications in agricultural, forest, water, and disaster +management, urban planning, and mining. However, the resulting satellite image +time series (SITS) are complex, incorporating information from the temporal, +spatial, and spectral dimensions. Therefore, deep learning methods are often +deployed as they can analyze these complex relationships. This review presents +a summary of the state-of-the-art methods of modelling environmental, +agricultural, and other Earth observation variables from SITS data using deep +learning methods. We aim to provide a resource for remote sensing experts +interested in using deep learning techniques to enhance Earth observation +models with temporal information. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ Real-GDSR: Real-World Guided DSM Super-Resolution via Edge-Enhancing + Residual Network + + +
+ A low-resolution digital surface model (DSM) features distinctive attributes
+impacted by noise, sensor limitations and data acquisition conditions, which
+cannot be replicated using simple interpolation methods like bicubic
+interpolation. As a result, super-resolution models trained on synthetic data
+do not perform effectively on real ones. Training a model on pairs of real
+low- and high-resolution DSMs is also a challenge because of the lack of
+information. On the other hand, the existence of other imaging modalities of
+the same scene can be used to enrich the information needed for large-scale
+super-resolution. In this work, we introduce a novel methodology to address
+the intricacies of real-world DSM super-resolution, named REAL-GDSR, breaking
+down this ill-posed problem into two steps. The first step involves the
+utilization of a residual local refinement network. This strategic approach
+departs from conventional methods that are trained to directly predict height
+values rather than the differences (residuals) and that utilize large receptive
+fields in their networks. The second step introduces a diffusion-based technique
+that enhances the results on a global scale, with a primary focus on smoothing
+and edge preservation. Our experiments underscore the effectiveness of the
+proposed method. We conduct a comprehensive evaluation, comparing it to recent
+state-of-the-art techniques in the domain of real-world DSM super-resolution
+(SR). Our approach consistently outperforms these existing methods, as evidenced
+through qualitative and quantitative assessments.
+ &#13;
+
+ comment: Accepted for publication in the ISPRS Annals of Photogrammetry, + Remote Sensing, and Spatial Information Sciences +
+
+
+
+
+ + ☆ LightOctree: Lightweight 3D Spatially-Coherent Indoor Lighting + Estimation + + +
+ We present a lightweight solution for estimating spatially-coherent indoor +lighting from a single RGB image. Previous methods for estimating illumination +using volumetric representations have overlooked the sparse distribution of +light sources in space, necessitating substantial memory and computational +resources for achieving high-quality results. We introduce a unified, voxel +octree-based illumination estimation framework to produce 3D spatially-coherent +lighting. Additionally, a differentiable voxel octree cone tracing rendering +layer is proposed to eliminate regular volumetric representation throughout the +entire process and ensure the retention of features across different frequency +domains. This reduction significantly decreases spatial usage and required +floating-point operations without substantially compromising precision. +Experimental results demonstrate that our approach achieves high-quality +coherent estimation with minimal cost compared to previous methods. + +
+
+
+
+
+ + ☆ Learning Correlation Structures for Vision Transformers CVPR 2024 + + +
+ We introduce a new attention mechanism, dubbed structural self-attention +(StructSA), that leverages rich correlation patterns naturally emerging in +key-query interactions of attention. StructSA generates attention maps by +recognizing space-time structures of key-query correlations via convolution and +uses them to dynamically aggregate local contexts of value features. This +effectively leverages rich structural patterns in images and videos such as +scene layouts, object motion, and inter-object relations. Using StructSA as a +main building block, we develop the structural vision transformer (StructViT) +and evaluate its effectiveness on both image and video classification tasks, +achieving state-of-the-art results on ImageNet-1K, Kinetics-400, +Something-Something V1 & V2, Diving-48, and FineGym. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Concept Weaver: Enabling Multi-Concept Fusion in Text-to-Image Models CVPR 2024 + + +
+ While there has been significant progress in customizing text-to-image +generation models, generating images that combine multiple personalized +concepts remains challenging. In this work, we introduce Concept Weaver, a +method for composing customized text-to-image diffusion models at inference +time. Specifically, the method breaks the process into two steps: creating a +template image aligned with the semantics of input prompts, and then +personalizing the template using a concept fusion strategy. The fusion strategy +incorporates the appearance of the target concepts into the template image +while retaining its structural details. The results indicate that our method +can generate multiple custom concepts with higher identity fidelity compared to +alternative approaches. Furthermore, the method is shown to seamlessly handle +more than two concepts and closely follow the semantic meaning of the input +prompt without blending appearances across different subjects. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Deep Phase Coded Image Prior + + +
+ Phase-coded imaging is a computational imaging method designed to tackle +tasks such as passive depth estimation and extended depth of field (EDOF) using +depth cues inserted during image capture. Most of the current deep +learning-based methods for depth estimation or all-in-focus imaging require a +training dataset with high-quality depth maps and an optimal focus point at +infinity for all-in-focus images. Such datasets are difficult to create, +usually synthetic, and require external graphic programs. We propose a new +method named "Deep Phase Coded Image Prior" (DPCIP) for jointly recovering the +depth map and all-in-focus image from a coded-phase image using solely the +captured image and the optical information of the imaging system. Our approach +does not depend on any specific dataset and surpasses prior supervised +techniques utilizing the same imaging system. This improvement is achieved +through the utilization of a problem formulation based on implicit neural +representation (INR) and deep image prior (DIP). Due to our zero-shot method, +we overcome the barrier of acquiring accurate ground-truth data of depth maps +and all-in-focus images for each new phase-coded system introduced. This allows +focusing mainly on developing the imaging system, and not on ground-truth data +collection. + +
+
+
+
+
+ + ☆ VoltaVision: A Transfer Learning model for electronic component + classification ICLR 2024 + + +
+ In this paper, we analyze the effectiveness of transfer learning on +classifying electronic components. Transfer learning reuses pre-trained models +to save time and resources in building a robust classifier rather than learning +from scratch. Our work introduces a lightweight CNN, coined as VoltaVision, and +compares its performance against more complex models. We test the hypothesis +that transferring knowledge from a similar task to our target domain yields +better results than state-of-the-art models trained on general datasets. Our +dataset and code for this work are available at +https://github.com/AnasIshfaque/VoltaVision. + +
+
+ comment: Tiny Paper at ICLR 2024 +
+
+
+
+
+ + ☆ Enhancing Breast Cancer Diagnosis in Mammography: Evaluation and + Integration of Convolutional Neural Networks and Explainable AI + + +
+ The study introduces an integrated framework combining Convolutional Neural
+Networks (CNNs) and Explainable Artificial Intelligence (XAI) for the enhanced
+diagnosis of breast cancer using the CBIS-DDSM dataset. Utilizing a fine-tuned
+ResNet50 architecture, our investigation not only provides effective
+differentiation of mammographic images into benign and malignant categories but
+also addresses the opaque "black-box" nature of deep learning models by
+employing XAI methodologies, namely Grad-CAM, LIME, and SHAP, to interpret CNN
+decision-making processes for healthcare professionals. Our methodology
+encompasses an elaborate data preprocessing pipeline and advanced data
+augmentation techniques to counteract dataset limitations; transfer learning
+with pre-trained networks such as VGG-16, DenseNet, and ResNet was also
+employed. A focal point of our study is the evaluation of XAI's effectiveness
+in interpreting model predictions, highlighted by utilising the Hausdorff
+measure to quantitatively assess the alignment between AI-generated
+explanations and expert annotations. This approach plays a critical role in
+promoting the trustworthiness and ethical fairness of XAI in AI-assisted
+diagnostics. The findings from our research illustrate the effective
+collaboration between CNNs and XAI in advancing diagnostic methods for breast
+cancer, thereby facilitating a more seamless integration of advanced AI
+technologies within clinical settings. By enhancing the interpretability of
+AI-driven decisions, this work lays the groundwork for improved collaboration
+between AI systems and medical practitioners, ultimately enriching patient
+care. Furthermore, the implications of our research extend well beyond the
+current methodologies, advocating for subsequent inquiries into the integration
+of multimodal data and the refinement of AI explanations to satisfy the needs
+of clinical practice.
+
+
+
+
+
+ + ☆ LiDAR-Guided Cross-Attention Fusion for Hyperspectral Band Selection and + Image Classification + + +
+ The fusion of hyperspectral and LiDAR data has been an active research topic.
+Existing fusion methods have ignored the high-dimensionality and redundancy
+challenges in hyperspectral images, even though band selection methods have
+been intensively studied for hyperspectral image (HSI) processing. This paper
+addresses this significant gap by introducing a cross-attention mechanism from
+the transformer architecture for the selection of HSI bands guided by LiDAR
+data. LiDAR provides high-resolution vertical structural information, which can
+be useful in distinguishing different types of land cover that may have similar
+spectral signatures but different structural profiles. In our approach, the
+LiDAR data are used as the "query" to search and identify the "key" from the
+HSI to choose the most pertinent bands for LiDAR. This method ensures that the
+selected HSI bands drastically reduce redundancy and computational requirements
+while working optimally with the LiDAR data. Extensive experiments have been
+undertaken on three paired HSI and LiDAR data sets: Houston 2013, Trento and
+MUUFL. The results highlight the superiority of the cross-attention mechanism,
+underlining the enhanced classification accuracy of the identified HSI bands
+when fused with the LiDAR features. The results also show that the use of fewer
+bands combined with LiDAR surpasses the performance of state-of-the-art fusion
+models.
+
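+ A minimal sketch of the query/key idea above (not the paper's pipeline): the
+LiDAR features form the query, each HSI band contributes one key, and the
+bands with the highest attention scores are kept. The untrained random
+projections, feature dimensions, and top-k rule are illustrative assumptions.
+
+    import torch
+    import torch.nn as nn
+
+    def select_bands(hsi, lidar_feat, k=30, dim=64):
+        """hsi: (B, bands, H, W); lidar_feat: (B, C, H, W)."""
+        B, bands, H, W = hsi.shape
+        key_proj = nn.Linear(H * W, dim)                     # one key per band
+        query_proj = nn.Linear(lidar_feat.shape[1] * H * W, dim)
+        keys = key_proj(hsi.reshape(B, bands, H * W))        # (B, bands, dim)
+        query = query_proj(lidar_feat.reshape(B, -1)).unsqueeze(1)  # (B, 1, dim)
+        attn = (query @ keys.transpose(1, 2)) / dim ** 0.5   # (B, 1, bands)
+        scores = attn.softmax(dim=-1).mean(dim=(0, 1))       # average over batch
+        topk = scores.topk(k).indices                        # selected band ids
+        return hsi[:, topk], topk
+
+    hsi = torch.rand(2, 144, 32, 32)
+    lidar = torch.rand(2, 1, 32, 32)
+    selected, idx = select_bands(hsi, lidar, k=30)
+    print(selected.shape, idx.shape)          # (2, 30, 32, 32) and (30,)
+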
+
+ comment: 15 pages, 13 figures +
+
+
+
+
+ + ☆ Increasing Fairness in Classification of Out of Distribution Data for + Facial Recognition + + +
+ Standard classification theory assumes that the distribution of images in the
+test and training sets is identical. Unfortunately, real-life scenarios
+typically feature unseen data ("out-of-distribution data") which is different
+from data in the training distribution ("in-distribution"). This issue is most
+prevalent in social justice problems where data from under-represented groups
+may appear in the test data without representing an equal proportion of the
+training data. This may result in a model returning confidently wrong decisions
+and predictions. We are interested in the following question: Can the
+performance of a neural network improve on facial images of out-of-distribution
+data when it is trained simultaneously on multiple datasets of in-distribution
+data? We approach this problem by incorporating the Outlier Exposure model and
+investigating how the model's performance changes when other datasets of facial
+images are incorporated. We observe that the accuracy and other metrics of the
+model can be increased by applying Outlier Exposure, incorporating a trainable
+weight parameter to increase the machine's emphasis on outlier images, and by
+re-weighting the importance of different class labels. We also experimented
+with whether sorting the images and determining outliers via image features
+would have more of an effect on the metrics than sorting by average pixel
+value. Our goal was to make models not only more accurate but also more fair by
+scanning a more expanded range of images. We also tested the datasets in
+reverse order to see whether a more fair dataset with balanced features has an
+effect on the model's accuracy.
+
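+ A hedged sketch of the weighted Outlier Exposure step described above:
+standard cross-entropy on in-distribution faces plus a learnable weight on a
+term that pushes outlier predictions toward the uniform distribution. The tiny
+classifier, the exp parameterization of the weight, and the random tensors
+standing in for data are assumptions.
+
+    import torch
+    import torch.nn as nn
+    import torch.nn.functional as F
+
+    model = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 10))
+    log_w = nn.Parameter(torch.tensor(0.0))   # trainable outlier-term weight
+    opt = torch.optim.SGD(list(model.parameters()) + [log_w], lr=1e-2)
+
+    def train_step(x_in, y_in, x_out):
+        ce = F.cross_entropy(model(x_in), y_in)
+        # Outlier Exposure: push outlier predictions toward the uniform
+        # distribution (cross-entropy against a uniform target, up to a constant).
+        oe = -F.log_softmax(model(x_out), dim=1).mean()
+        loss = ce + torch.exp(log_w) * oe     # exp keeps the learned weight positive
+        opt.zero_grad(); loss.backward(); opt.step()
+        return loss.item()
+
+    x_in, y_in = torch.rand(8, 3, 32, 32), torch.randint(0, 10, (8,))
+    x_out = torch.rand(8, 3, 32, 32)          # batch from the outlier dataset
+    print(train_step(x_in, y_in, x_out))
+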
+
+ comment: 18 pages, 6 tables, 6 figures +
+
+
+
+
+ + ☆ Mitigating Heterogeneity in Federated Multimodal Learning with + Biomedical Vision-Language Pre-training + + +
+ Vision-language pre-training (VLP) has arisen as an efficient scheme for
+multimodal representation learning, but it requires large-scale multimodal data
+for pre-training, which poses an obstacle especially for biomedical
+applications. To overcome the data limitation, federated learning (FL) can be a
+promising strategy to scale up the dataset for biomedical VLP while protecting
+data privacy. However, client data are often heterogeneous in real-world
+scenarios, and we observe that local training on heterogeneous client data
+would distort the multimodal representation learning and lead to biased
+cross-modal alignment. To address this challenge, we propose the Federated
+distributional Robust Guidance-Based (FedRGB) learning framework for federated
+VLP with robustness to data heterogeneity. Specifically, we utilize a
+guidance-based local training scheme to reduce feature distortions, and employ
+a distribution-based min-max optimization to learn unbiased cross-modal
+alignment. The experiments on real-world datasets show our method successfully
+promotes efficient federated multimodal learning for biomedical VLP with data
+heterogeneity.
+
+
+
+
+
+ + ♻ ☆ DVIS-DAQ: Improving Video Segmentation via Dynamic Anchor Queries + + +
+ Modern video segmentation methods adopt object queries to perform inter-frame +association and demonstrate satisfactory performance in tracking continuously +appearing objects despite large-scale motion and transient occlusion. However, +they all underperform on newly emerging and disappearing objects that are +common in the real world because they attempt to model object emergence and +disappearance through feature transitions between background and foreground +queries that have significant feature gaps. We introduce Dynamic Anchor Queries +(DAQ) to shorten the transition gap between the anchor and target queries by +dynamically generating anchor queries based on the features of potential +candidates. Furthermore, we introduce a query-level object Emergence and +Disappearance Simulation (EDS) strategy, which unleashes DAQ's potential +without any additional cost. Finally, we combine our proposed DAQ and EDS with +DVIS to obtain DVIS-DAQ. Extensive experiments demonstrate that DVIS-DAQ +achieves a new state-of-the-art (SOTA) performance on five mainstream video +segmentation benchmarks. Code and models are available at +\url{https://github.com/SkyworkAI/DAQ-VS}. + +
+
+
+
+
+ + ♻ ☆ CenterGrasp: Object-Aware Implicit Representation Learning for + Simultaneous Shape Reconstruction and 6-DoF Grasp Estimation RA-L + + +
+ Reliable object grasping is a crucial capability for autonomous robots. +However, many existing grasping approaches focus on general clutter removal +without explicitly modeling objects and thus only relying on the visible local +geometry. We introduce CenterGrasp, a novel framework that combines object +awareness and holistic grasping. CenterGrasp learns a general object prior by +encoding shapes and valid grasps in a continuous latent space. It consists of +an RGB-D image encoder that leverages recent advances to detect objects and +infer their pose and latent code, and a decoder to predict shape and grasps for +each object in the scene. We perform extensive experiments on simulated as well +as real-world cluttered scenes and demonstrate strong scene reconstruction and +6-DoF grasp-pose estimation performance. Compared to the state of the art, +CenterGrasp achieves an improvement of 38.5 mm in shape reconstruction and 33 +percentage points on average in grasp success. We make the code and trained +models publicly available at http://centergrasp.cs.uni-freiburg.de. + +
+
+ comment: Accepted at RA-L. Video, code and models available at + http://centergrasp.cs.uni-freiburg.de +
+
+
+
+
+ + ♻ ☆ Modeling 3D Surface Manifolds with a Locally Conditioned Atlas + + +
+ Recently proposed 3D object reconstruction methods represent a mesh with an +atlas - a set of planar patches approximating the surface. However, their +application in a real-world scenario is limited since the surfaces of +reconstructed objects contain discontinuities, which degrades the quality of +the final mesh. This is mainly caused by independent processing of individual +patches, and in this work, we postulate to mitigate this limitation by +preserving local consistency around patch vertices. To that end, we introduce a +Locally Conditioned Atlas (LoCondA), a framework for representing a 3D object +hierarchically in a generative model. Firstly, the model maps a point cloud of +an object into a sphere. Secondly, by leveraging a spherical prior, we enforce +the mapping to be locally consistent on the sphere and on the target object. +This way, we can sample a mesh quad on that sphere and project it back onto the +object's manifold. With LoCondA, we can produce topologically diverse objects +while maintaining quads to be stitched together. We show that the proposed +approach provides structurally coherent reconstructions while producing meshes +of quality comparable to the competitors. + +
+
+
+
+
+ + ♻ ☆ Finding AI-Generated Faces in the Wild CVPR + + +
+ AI-based image generation has continued to rapidly improve, producing +increasingly more realistic images with fewer obvious visual flaws. +AI-generated images are being used to create fake online profiles which in turn +are being used for spam, fraud, and disinformation campaigns. As the general +problem of detecting any type of manipulated or synthesized content is +receiving increasing attention, here we focus on a more narrow task of +distinguishing a real face from an AI-generated face. This is particularly +applicable when tackling inauthentic online accounts with a fake user profile +photo. We show that by focusing on only faces, a more resilient and +general-purpose artifact can be detected that allows for the detection of +AI-generated faces from a variety of GAN- and diffusion-based synthesis +engines, and across image resolutions (as low as 128 x 128 pixels) and +qualities. + +
+
+ comment: to be published as: G.J.A. Porcile, J. Gindi, S. Mundra, J.R. Verbus, + and H. Farid, Finding AI-Generated Faces in the Wild, Workshop on Media + Forensics at CVPR, 2024 +
+
+
+
+
+ + ♻ ☆ WorDepth: Variational Language Prior for Monocular Depth Estimation + + +
+ Three-dimensional (3D) reconstruction from a single image is an ill-posed +problem with inherent ambiguities, i.e. scale. Predicting a 3D scene from text +description(s) is similarly ill-posed, i.e. spatial arrangements of objects +described. We investigate the question of whether two inherently ambiguous +modalities can be used in conjunction to produce metric-scaled reconstructions. +To test this, we focus on monocular depth estimation, the problem of predicting +a dense depth map from a single image, but with an additional text caption +describing the scene. To this end, we begin by encoding the text caption as a +mean and standard deviation; using a variational framework, we learn the +distribution of the plausible metric reconstructions of 3D scenes corresponding +to the text captions as a prior. To "select" a specific reconstruction or depth +map, we encode the given image through a conditional sampler that samples from +the latent space of the variational text encoder, which is then decoded to the +output depth map. Our approach is trained alternatingly between the text and +image branches: in one optimization step, we predict the mean and standard +deviation from the text description and sample from a standard Gaussian, and in +the other, we sample using a (image) conditional sampler. Once trained, we +directly predict depth from the encoded text using the conditional sampler. We +demonstrate our approach on indoor (NYUv2) and outdoor (KITTI) scenarios, where +we show that language can consistently improve performance in both. + +
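+ A loose sketch of the alternating scheme described above (assumptions: toy
+encoders, an L1 loss, and no KL or other regularization term): the text branch
+predicts a mean and standard deviation and samples a latent by
+reparameterization, while the image branch predicts the latent directly with a
+conditional sampler; both feed the same depth decoder.
+
+    import torch
+    import torch.nn as nn
+
+    text_enc = nn.Sequential(nn.Linear(300, 128), nn.ReLU(), nn.Linear(128, 64))
+    img_sampler = nn.Sequential(nn.Flatten(), nn.Linear(3 * 64 * 64, 32))
+    decoder = nn.Linear(32, 64 * 64)                 # latent -> dense depth map
+
+    def training_step(text_emb, image, gt_depth, step):
+        mu, logvar = text_enc(text_emb).chunk(2, dim=-1)
+        if step % 2 == 0:    # text branch: reparameterized sample from the prior
+            z = mu + torch.randn_like(mu) * torch.exp(0.5 * logvar)
+        else:                # image branch: conditional sampler picks the latent
+            z = img_sampler(image)
+        pred = decoder(z).reshape(-1, 1, 64, 64)
+        return (pred - gt_depth).abs().mean()
+
+    loss = training_step(torch.rand(2, 300), torch.rand(2, 3, 64, 64),
+                         torch.rand(2, 1, 64, 64), step=0)
+    print(loss.item())
+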
+
+
+
+
+ + ♻ ☆ SnAG: Scalable and Accurate Video Grounding CVPR 2024 + + +
+ Temporal grounding of text descriptions in videos is a central problem in +vision-language learning and video understanding. Existing methods often +prioritize accuracy over scalability -- they have been optimized for grounding +only a few text queries within short videos, and fail to scale up to long +videos with hundreds of queries. In this paper, we study the effect of +cross-modal fusion on the scalability of video grounding models. Our analysis +establishes late fusion as a more cost-effective fusion scheme for long-form +videos with many text queries. Moreover, it leads us to a novel, video-centric +sampling scheme for efficient training. Based on these findings, we present +SnAG, a simple baseline for scalable and accurate video grounding. Without +bells and whistles, SnAG is 43% more accurate and 1.5x faster than CONE, a +state of the art for long-form video grounding on the challenging MAD dataset, +while achieving highly competitive results on short videos. + +
+
+ comment: Accepted to CVPR 2024. Code available at + https://github.com/fmu2/snag_release +
+
+
+
+
+ + ♻ ☆ State Space Models for Event Cameras CVPR 2024 + + +
+ Today, state-of-the-art deep neural networks that process event-camera data +first convert a temporal window of events into dense, grid-like input +representations. As such, they exhibit poor generalizability when deployed at +higher inference frequencies (i.e., smaller temporal windows) than the ones +they were trained on. We address this challenge by introducing state-space +models (SSMs) with learnable timescale parameters to event-based vision. This +design adapts to varying frequencies without the need to retrain the network at +different frequencies. Additionally, we investigate two strategies to +counteract aliasing effects when deploying the model at higher frequencies. We +comprehensively evaluate our approach against existing methods based on RNN and +Transformer architectures across various benchmarks, including Gen1 and 1 Mpx +event camera datasets. Our results demonstrate that SSM-based models train 33% +faster and also exhibit minimal performance degradation when tested at higher +frequencies than the training input. Traditional RNN and Transformer models +exhibit performance drops of more than 20 mAP, with SSMs having a drop of 3.31 +mAP, highlighting the effectiveness of SSMs in event-based vision tasks. + +
+
+ comment: 18 pages, 5 figures, 6 tables, CVPR 2024 Camera Ready paper +
+
+
+
+
+ + ♻ ☆ Opti-CAM: Optimizing saliency maps for interpretability + + +
+ Methods based on class activation maps (CAM) provide a simple mechanism to +interpret predictions of convolutional neural networks by using linear +combinations of feature maps as saliency maps. By contrast, masking-based +methods optimize a saliency map directly in the image space or learn it by +training another network on additional data. + In this work we introduce Opti-CAM, combining ideas from CAM-based and +masking-based approaches. Our saliency map is a linear combination of feature +maps, where weights are optimized per image such that the logit of the masked +image for a given class is maximized. We also fix a fundamental flaw in two of +the most common evaluation metrics of attribution methods. On several datasets, +Opti-CAM largely outperforms other CAM-based approaches according to the most +relevant classification metrics. We provide empirical evidence supporting that +localization and classifier interpretability are not necessarily aligned. + +
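+ The optimization described above has a compact form that is easy to sketch:
+learn per-image weights over the feature maps so that the image masked by the
+resulting saliency map maximizes the target-class logit. The placeholder
+backbone and the omitted normalization and upsampling details below are
+assumptions, not the paper's exact procedure.
+
+    import torch
+    import torch.nn as nn
+    import torch.nn.functional as F
+
+    backbone = nn.Sequential(nn.Conv2d(3, 16, 3, padding=1), nn.ReLU())
+    head = nn.Sequential(nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(16, 10))
+
+    def opti_cam(image, target_class, steps=100, lr=0.1):
+        feats = backbone(image).detach()              # (1, K, h, w) feature maps
+        w = torch.zeros(1, feats.shape[1], 1, 1, requires_grad=True)
+        opt = torch.optim.Adam([w], lr=lr)
+        for _ in range(steps):
+            sal = (F.softmax(w, dim=1) * feats).sum(1, keepdim=True)
+            sal = (sal - sal.min()) / (sal.max() - sal.min() + 1e-8)
+            sal = F.interpolate(sal, size=image.shape[-2:], mode="bilinear")
+            logit = head(backbone(image * sal))[0, target_class]
+            opt.zero_grad()
+            (-logit).backward()                       # maximize the masked logit
+            opt.step()
+        return sal.detach()
+
+    img = torch.rand(1, 3, 32, 32)
+    print(opti_cam(img, target_class=3).shape)        # torch.Size([1, 1, 32, 32])
+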
+
+ comment: This work is under consideration at "Computer Vision and Image + Understanding" +
+
+
+
+
+ + ♻ ☆ EAGLE: Eigen Aggregation Learning for Object-Centric Unsupervised + Semantic Segmentation + + +
+ Semantic segmentation has innately relied on extensive pixel-level annotated +data, leading to the emergence of unsupervised methodologies. Among them, +leveraging self-supervised Vision Transformers for unsupervised semantic +segmentation (USS) has been making steady progress with expressive deep +features. Yet, for semantically segmenting images with complex objects, a +predominant challenge remains: the lack of explicit object-level semantic +encoding in patch-level features. This technical limitation often leads to +inadequate segmentation of complex objects with diverse structures. To address +this gap, we present a novel approach, EAGLE, which emphasizes object-centric +representation learning for unsupervised semantic segmentation. Specifically, +we introduce EiCue, a spectral technique providing semantic and structural cues +through an eigenbasis derived from the semantic similarity matrix of deep image +features and color affinity from an image. Further, by incorporating our +object-centric contrastive loss with EiCue, we guide our model to learn +object-level representations with intra- and inter-image object-feature +consistency, thereby enhancing semantic accuracy. Extensive experiments on +COCO-Stuff, Cityscapes, and Potsdam-3 datasets demonstrate the state-of-the-art +USS results of EAGLE with accurate and consistent semantic segmentation across +complex scenes. + +
+
+
+
+
+ + ♻ ☆ On Inherent Adversarial Robustness of Active Vision Systems + + +
+ Current Deep Neural Networks are vulnerable to adversarial examples, which
+alter their predictions by adding carefully crafted noise. Since human eyes are
+robust to such inputs, it is possible that the vulnerability stems from the
+standard way of processing inputs in one shot, treating every pixel with the
+same importance. In contrast, neuroscience suggests that the human vision
+system can differentiate salient features by (1) switching between multiple
+fixation points (saccades) and (2) processing the surrounding with a
+non-uniform external resolution (foveation). In this work, we advocate that the
+integration of such active vision mechanisms into current deep learning systems
+can offer robustness benefits. Specifically, we empirically demonstrate the
+inherent robustness of two active vision methods - GFNet and FALcon - under a
+black box threat model. By learning and performing inference on downsampled
+glimpses obtained from multiple distinct fixation points within an input, we
+show that these active methods achieve 2-3 times greater robustness compared
+to a standard passive convolutional network under state-of-the-art adversarial
+attacks. More importantly, we provide illustrative and interpretable
+visualization analysis that demonstrates how performing inference from distinct
+fixation points makes active vision methods less vulnerable to malicious
+inputs.
+
+
+
+
+
+ + ♻ ☆ SWAG: Splatting in the Wild images with Appearance-conditioned Gaussians + + +
+ Implicit neural representation methods have shown impressive advancements in
+learning 3D scenes from unstructured in-the-wild photo collections but are
+still limited by the large computational cost of volumetric rendering. More
+recently, 3D Gaussian Splatting emerged as a much faster alternative with
+superior rendering quality and training efficiency, especially for small-scale
+and object-centric scenarios. Nevertheless, this technique suffers from poor
+performance on unstructured in-the-wild data. To tackle this, we extend 3D
+Gaussian Splatting to handle unstructured image collections. We achieve this by
+modeling appearance to capture photometric variations in the rendered images.
+Additionally, we introduce a new mechanism to train transient Gaussians to
+handle the presence of scene occluders in an unsupervised manner. Experiments
+on diverse photo collection scenes and multi-pass acquisition of outdoor
+landmarks show the effectiveness of our method over prior works, achieving
+state-of-the-art results with improved efficiency.
+
+
+
+
+
+ + ♻ ☆ Embedded Heterogeneous Attention Transformer for Cross-lingual Image + Captioning + + +
+ Cross-lingual image captioning is a challenging task that requires addressing
+both cross-lingual and cross-modal obstacles in multimedia analysis. The
+crucial issue in this task is to model the global and the local matching
+between the image and different languages. Existing cross-modal embedding
+methods based on the transformer architecture overlook the local matching
+between the image region and monolingual words, especially when dealing with
+diverse languages. To overcome these limitations, we propose an Embedded
+Heterogeneous Attention Transformer (EHAT) to establish cross-domain
+relationships and local correspondences between images and different languages
+by using a heterogeneous network. EHAT comprises Masked Heterogeneous
+Cross-attention (MHCA), Heterogeneous Attention Reasoning Network (HARN), and
+Heterogeneous Co-attention (HCA). The HARN serves as the core network and it
+captures cross-domain relationships by leveraging visual bounding box
+representation features to connect word features from two languages and to
+learn heterogeneous maps. MHCA and HCA facilitate cross-domain integration in
+the encoder through specialized heterogeneous attention mechanisms, enabling a
+single model to generate captions in two languages. We evaluate our approach on
+the MSCOCO dataset to generate captions in English and Chinese, two languages
+that exhibit significant differences in their language families. The
+experimental results demonstrate the superior performance of our method
+compared to existing advanced monolingual methods. Our proposed EHAT framework
+effectively addresses the challenges of cross-lingual image captioning, paving
+the way for improved multilingual image analysis and understanding.
+
+
+
+
+
+ + ♻ ☆ Self-Correcting Self-Consuming Loops for Generative Model Training + + +
+ As synthetic data becomes higher quality and proliferates on the internet,
+machine learning models are increasingly trained on a mix of human- and
+machine-generated data. Despite the success stories of using synthetic data
+for representation learning, using synthetic data for generative model training
+creates "self-consuming loops" which may lead to training instability or even
+collapse, unless certain conditions are met. Our paper aims to stabilize
+self-consuming generative model training. Our theoretical results demonstrate
+that by introducing an idealized correction function, which maps a data point
+to be more likely under the true data distribution, self-consuming loops can be
+made exponentially more stable. We then propose self-correction functions,
+which rely on expert knowledge (e.g. the laws of physics programmed in a
+simulator), and aim to approximate the idealized corrector automatically and at
+scale. We empirically validate the effectiveness of self-correcting
+self-consuming loops on the challenging human motion synthesis task, and
+observe that it successfully avoids model collapse, even when the ratio of
+synthetic data to real data is as high as 100%.
+
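+ A toy illustration of the loop described above (a 1-D Gaussian stands in for
+the generative model, and the "corrector" simply nudges samples toward known
+true statistics): each generation samples synthetic data, corrects it, mixes
+it with real data, and refits the model.
+
+    import numpy as np
+
+    rng = np.random.default_rng(0)
+    real = rng.normal(0.0, 1.0, size=5000)          # "true" data distribution
+
+    def correct(samples, strength=0.5):
+        # Stand-in for an expert/simulator-based corrector: nudge samples
+        # toward the true distribution (here, toward zero mean / unit variance).
+        return (1 - strength) * samples + strength * (
+            (samples - samples.mean()) / (samples.std() + 1e-8))
+
+    mu, sigma = 0.5, 2.0                            # toy "generative model"
+    for generation in range(10):
+        synthetic = rng.normal(mu, sigma, size=5000)
+        synthetic = correct(synthetic)              # self-correction step
+        mix = np.concatenate([real, synthetic])     # synthetic-to-real ratio 100%
+        mu, sigma = mix.mean(), mix.std()           # "retrain" on the mixture
+        print(f"gen {generation:02d}: mu={mu:+.3f} sigma={sigma:.3f}")
+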
+
+ comment: This new version contains updated mathematical results (c.f. Remark + 4.4), as well as experiments for an additional generative modeling task. + Paper under submission; code is available at + https://nategillman.com/sc-sc.html +
+
+
+
+
+ + ♻ ☆ Chat-UniVi: Unified Visual Representation Empowers Large Language Models + with Image and Video Understanding CVPR 2024 + + +
+ Large language models have demonstrated impressive universal capabilities +across a wide range of open-ended tasks and have extended their utility to +encompass multimodal conversations. However, existing methods encounter +challenges in effectively handling both image and video understanding, +particularly with limited visual tokens. In this work, we introduce Chat-UniVi, +a Unified Vision-language model capable of comprehending and engaging in +conversations involving images and videos through a unified visual +representation. Specifically, we employ a set of dynamic visual tokens to +uniformly represent images and videos. This representation framework empowers +the model to efficiently utilize a limited number of visual tokens to +simultaneously capture the spatial details necessary for images and the +comprehensive temporal relationship required for videos. Moreover, we leverage +a multi-scale representation, enabling the model to perceive both high-level +semantic concepts and low-level visual details. Notably, Chat-UniVi is trained +on a mixed dataset containing both images and videos, allowing direct +application to tasks involving both mediums without requiring any +modifications. Extensive experimental results demonstrate that Chat-UniVi +consistently outperforms even existing methods exclusively designed for either +images or videos. Code is available at +https://github.com/PKU-YuanGroup/Chat-UniVi. + +
+
+ comment: Accepted by CVPR 2024 (Highlight) +
+
+
+
+
+ + ♻ ☆ PlatoNeRF: 3D Reconstruction in Plato's Cave via Single-View Two-Bounce + Lidar CVPR 2024 + + +
+ 3D reconstruction from a single-view is challenging because of the ambiguity +from monocular cues and lack of information about occluded regions. Neural +radiance fields (NeRF), while popular for view synthesis and 3D reconstruction, +are typically reliant on multi-view images. Existing methods for single-view 3D +reconstruction with NeRF rely on either data priors to hallucinate views of +occluded regions, which may not be physically accurate, or shadows observed by +RGB cameras, which are difficult to detect in ambient light and low albedo +backgrounds. We propose using time-of-flight data captured by a single-photon +avalanche diode to overcome these limitations. Our method models two-bounce +optical paths with NeRF, using lidar transient data for supervision. By +leveraging the advantages of both NeRF and two-bounce light measured by lidar, +we demonstrate that we can reconstruct visible and occluded geometry without +data priors or reliance on controlled ambient lighting or scene albedo. In +addition, we demonstrate improved generalization under practical constraints on +sensor spatial- and temporal-resolution. We believe our method is a promising +direction as single-photon lidars become ubiquitous on consumer devices, such +as phones, tablets, and headsets. + +
+
+ comment: CVPR 2024. Project Page: https://platonerf.github.io/ +
+
+
+
+
+ + ♻ ☆ Plug-and-Play image restoration with Stochastic deNOising REgularization + + +
+ Plug-and-Play (PnP) algorithms are a class of iterative algorithms that +address image inverse problems by combining a physical model and a deep neural +network for regularization. Even if they produce impressive image restoration +results, these algorithms rely on a non-standard use of a denoiser on images +that are less and less noisy along the iterations, which contrasts with recent +algorithms based on Diffusion Models (DM), where the denoiser is applied only +on re-noised images. We propose a new PnP framework, called Stochastic +deNOising REgularization (SNORE), which applies the denoiser only on images +with noise of the adequate level. It is based on an explicit stochastic +regularization, which leads to a stochastic gradient descent algorithm to solve +ill-posed inverse problems. A convergence analysis of this algorithm and its +annealing extension is provided. Experimentally, we prove that SNORE is +competitive with respect to state-of-the-art methods on deblurring and +inpainting tasks, both quantitatively and qualitatively. + +
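+ A hedged sketch of the iteration implied above: at each step the denoiser is
+applied to a re-noised copy of the current estimate, and its residual serves
+as the regularization gradient next to the data-fidelity gradient. The
+averaging "denoiser", step sizes, and the missing annealing schedule are
+placeholders, not the paper's algorithm.
+
+    import torch
+    import torch.nn.functional as F
+
+    def denoiser(x):
+        # Placeholder for a learned Gaussian denoiser at noise level sigma.
+        return F.avg_pool2d(x, 3, stride=1, padding=1)
+
+    def snore_restore(y, degrade, sigma=0.1, lam=0.5, lr=0.1, iters=200):
+        x = y.clone()
+        for _ in range(iters):
+            x = x.detach().requires_grad_(True)
+            data_fit = 0.5 * ((degrade(x) - y) ** 2).sum()
+            grad_f, = torch.autograd.grad(data_fit, x)
+            x_noisy = x + sigma * torch.randn_like(x)   # re-noise, then denoise
+            grad_reg = (x - denoiser(x_noisy)) / sigma ** 2
+            x = x - lr * (grad_f + lam * grad_reg)      # stochastic gradient step
+        return x.detach()
+
+    blur = lambda z: F.avg_pool2d(z, 5, stride=1, padding=2)
+    y = blur(torch.rand(1, 1, 32, 32))                  # degraded observation
+    print(snore_restore(y, blur).shape)                 # torch.Size([1, 1, 32, 32])
+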
+
+
+
+
+ + ♻ ☆ EGTR: Extracting Graph from Transformer for Scene Graph Generation CVPR 2024 + + +
+ Scene Graph Generation (SGG) is a challenging task of detecting objects and +predicting relationships between objects. After DETR was developed, one-stage +SGG models based on a one-stage object detector have been actively studied. +However, complex modeling is used to predict the relationship between objects, +and the inherent relationship between object queries learned in the multi-head +self-attention of the object detector has been neglected. We propose a +lightweight one-stage SGG model that extracts the relation graph from the +various relationships learned in the multi-head self-attention layers of the +DETR decoder. By fully utilizing the self-attention by-products, the relation +graph can be extracted effectively with a shallow relation extraction head. +Considering the dependency of the relation extraction task on the object +detection task, we propose a novel relation smoothing technique that adjusts +the relation label adaptively according to the quality of the detected objects. +By the relation smoothing, the model is trained according to the continuous +curriculum that focuses on object detection task at the beginning of training +and performs multi-task learning as the object detection performance gradually +improves. Furthermore, we propose a connectivity prediction task that predicts +whether a relation exists between object pairs as an auxiliary task of the +relation extraction. We demonstrate the effectiveness and efficiency of our +method for the Visual Genome and Open Image V6 datasets. Our code is publicly +available at https://github.com/naver-ai/egtr. + +
+
+ comment: CVPR 2024 (Oral) +
+
+
+
+
+ + ♻ ☆ Open-vocabulary object 6D pose estimation CVPR 2024 + + +
+ We introduce the new setting of open-vocabulary object 6D pose estimation, in +which a textual prompt is used to specify the object of interest. In contrast +to existing approaches, in our setting (i) the object of interest is specified +solely through the textual prompt, (ii) no object model (e.g., CAD or video +sequence) is required at inference, and (iii) the object is imaged from two +RGBD viewpoints of different scenes. To operate in this setting, we introduce a +novel approach that leverages a Vision-Language Model to segment the object of +interest from the scenes and to estimate its relative 6D pose. The key of our +approach is a carefully devised strategy to fuse object-level information +provided by the prompt with local image features, resulting in a feature space +that can generalize to novel concepts. We validate our approach on a new +benchmark based on two popular datasets, REAL275 and Toyota-Light, which +collectively encompass 34 object instances appearing in four thousand image +pairs. The results demonstrate that our approach outperforms both a +well-established hand-crafted method and a recent deep learning-based baseline +in estimating the relative 6D pose of objects in different scenes. Code and +dataset are available at https://jcorsetti.github.io/oryon. + +
+
+ comment: Camera ready version (CVPR 2024, poster highlight). 21 pages, 15 + figures, 6 tables +
+
+
+
+
+ + ♻ ☆ The Missing U for Efficient Diffusion Models + + +
+ Diffusion Probabilistic Models stand as a critical tool in generative +modelling, enabling the generation of complex data distributions. This family +of generative models yields record-breaking performance in tasks such as image +synthesis, video generation, and molecule design. Despite their capabilities, +their efficiency, especially in the reverse process, remains a challenge due to +slow convergence rates and high computational costs. In this paper, we +introduce an approach that leverages continuous dynamical systems to design a +novel denoising network for diffusion models that is more parameter-efficient, +exhibits faster convergence, and demonstrates increased noise robustness. +Experimenting with Denoising Diffusion Probabilistic Models (DDPMs), our +framework operates with approximately a quarter of the parameters, and $\sim$ +30\% of the Floating Point Operations (FLOPs) compared to standard U-Nets in +DDPMs. Furthermore, our model is notably faster in inference than the baseline +when measured in fair and equal conditions. We also provide a mathematical +intuition as to why our proposed reverse process is faster as well as a +mathematical discussion of the empirical tradeoffs in the denoising downstream +task. Finally, we argue that our method is compatible with existing performance +enhancement techniques, enabling further improvements in efficiency, quality, +and speed. + +
+
+ comment: 23 pages, 14 figures, Accepted at Transactions of Machine Learning + Research (04/2024) +
+
+
+
+
+ + ♻ ☆ DualRefine: Self-Supervised Depth and Pose Estimation Through Iterative + Epipolar Sampling and Refinement Toward Equilibrium CVPR 2023 + + +
+ Self-supervised multi-frame depth estimation achieves high accuracy by +computing matching costs of pixel correspondences between adjacent frames, +injecting geometric information into the network. These pixel-correspondence +candidates are computed based on the relative pose estimates between the +frames. Accurate pose predictions are essential for precise matching cost +computation as they influence the epipolar geometry. Furthermore, improved +depth estimates can, in turn, be used to align pose estimates. + Inspired by traditional structure-from-motion (SfM) principles, we propose +the DualRefine model, which tightly couples depth and pose estimation through a +feedback loop. Our novel update pipeline uses a deep equilibrium model +framework to iteratively refine depth estimates and a hidden state of feature +maps by computing local matching costs based on epipolar geometry. Importantly, +we used the refined depth estimates and feature maps to compute pose updates at +each step. This update in the pose estimates slowly alters the epipolar +geometry during the refinement process. Experimental results on the KITTI +dataset demonstrate competitive depth prediction and odometry prediction +performance surpassing published self-supervised baselines. + +
+
+ comment: CVPR 2023. Project page: + https://antabangun.github.io/projects/DualRefine/ Code: + https://github.com/antabangun/DualRefine +
+
+
+
+
+ + ♻ ☆ Neural Sign Actors: A diffusion model for 3D sign language production + from text CVPR 2024 + + +
+ Sign Languages (SL) serve as the primary mode of communication for the Deaf +and Hard of Hearing communities. Deep learning methods for SL recognition and +translation have achieved promising results. However, Sign Language Production +(SLP) poses a challenge as the generated motions must be realistic and have +precise semantic meaning. Most SLP methods rely on 2D data, which hinders their +realism. In this work, a diffusion-based SLP model is trained on a curated +large-scale dataset of 4D signing avatars and their corresponding text +transcripts. The proposed method can generate dynamic sequences of 3D avatars +from an unconstrained domain of discourse using a diffusion process formed on a +novel and anatomically informed graph neural network defined on the SMPL-X body +skeleton. Through quantitative and qualitative experiments, we show that the +proposed method considerably outperforms previous methods of SLP. This work +makes an important step towards realistic neural sign avatars, bridging the +communication gap between Deaf and hearing communities. + +
+
+ comment: Accepted at CVPR 2024, Project page: + https://baltatzisv.github.io/neural-sign-actors/ +
+
+
+
+
+ + ♻ ☆ Localization Is All You Evaluate: Data Leakage in Online Mapping + Datasets and How to Fix It + + +
+ The task of online mapping is to predict a local map using current sensor +observations, e.g. from lidar and camera, without relying on a pre-built map. +State-of-the-art methods are based on supervised learning and are trained +predominantly using two datasets: nuScenes and Argoverse 2. However, these +datasets revisit the same geographic locations across training, validation, and +test sets. Specifically, over $80$% of nuScenes and $40$% of Argoverse 2 +validation and test samples are less than $5$ m from a training sample. At test +time, the methods are thus evaluated more on how well they localize within a +memorized implicit map built from the training data than on extrapolating to +unseen locations. Naturally, this data leakage causes inflated performance +numbers and we propose geographically disjoint data splits to reveal the true +performance in unseen environments. Experimental results show that methods +perform considerably worse, some dropping more than $45$ mAP, when trained and +evaluated on proper data splits. Additionally, a reassessment of prior design +choices reveals diverging conclusions from those based on the original split. +Notably, the impact of lifting methods and the support from auxiliary tasks +(e.g., depth supervision) on performance appears less substantial or follows a +different trajectory than previously perceived. Splits can be found at +https://github.com/LiljaAdam/geographical-splits + +
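+ The leakage statistic quoted above is straightforward to compute when sample
+locations are available in a metric frame; the sketch below uses synthetic
+coordinates and also shows the tile-based idea behind a geographically
+disjoint split. The coordinates, tile size, and split fraction are assumptions.
+
+    import numpy as np
+    from scipy.spatial import cKDTree
+
+    rng = np.random.default_rng(0)
+    train_xy = rng.uniform(0, 1000, size=(20000, 2))    # sample locations (m)
+    val_xy = train_xy[:5000] + rng.normal(0, 2, size=(5000, 2))  # revisited places
+
+    dist, _ = cKDTree(train_xy).query(val_xy, k=1)
+    print(f"{100 * (dist < 5.0).mean():.1f}% of val samples within 5 m of training data")
+
+    # A geographically disjoint split instead assigns whole regions to one split,
+    # e.g. by binning locations into coarse tiles and splitting at the tile level.
+    tile = np.floor(train_xy / 250.0).astype(int)
+    tile_id = tile[:, 0] * 10_000 + tile[:, 1]
+    held_out = np.unique(tile_id)[: len(np.unique(tile_id)) // 5]
+    is_val = np.isin(tile_id, held_out)
+    print("geo-split sizes  train:", int((~is_val).sum()), " val:", int(is_val.sum()))
+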
+
+
+
+
+ + ♻ ☆ Contextual Encoder-Decoder Network for Visual Saliency Prediction + + +
+ Predicting salient regions in natural images requires the detection of +objects that are present in a scene. To develop robust representations for this +challenging task, high-level visual features at multiple spatial scales must be +extracted and augmented with contextual information. However, existing models +aimed at explaining human fixation maps do not incorporate such a mechanism +explicitly. Here we propose an approach based on a convolutional neural network +pre-trained on a large-scale image classification task. The architecture forms +an encoder-decoder structure and includes a module with multiple convolutional +layers at different dilation rates to capture multi-scale features in parallel. +Moreover, we combine the resulting representations with global scene +information for accurately predicting visual saliency. Our model achieves +competitive and consistent results across multiple evaluation metrics on two +public saliency benchmarks and we demonstrate the effectiveness of the +suggested approach on five datasets and selected examples. Compared to state of +the art approaches, the network is based on a lightweight image classification +backbone and hence presents a suitable choice for applications with limited +computational resources, such as (virtual) robotic systems, to estimate human +fixations across complex natural scenes. + +
+
+ comment: Updated contact information +
+
+
+
+
+ + ♻ ☆ Single Domain Generalization for Crowd Counting CVPR2024 + + +
+ Due to its promising results, density map regression has been widely employed +for image-based crowd counting. The approach, however, often suffers from +severe performance degradation when tested on data from unseen scenarios, the +so-called "domain shift" problem. To address the problem, we investigate in +this work single domain generalization (SDG) for crowd counting. The existing +SDG approaches are mainly for image classification and segmentation, and can +hardly be extended to our case due to its regression nature and label ambiguity +(i.e., ambiguous pixel-level ground truths). We propose MPCount, a novel +effective SDG approach even for narrow source distribution. MPCount stores +diverse density values for density map regression and reconstructs +domain-invariant features by means of only one memory bank, a content error +mask and attention consistency loss. By partitioning the image into grids, it +employs patch-wise classification as an auxiliary task to mitigate label +ambiguity. Through extensive experiments on different datasets, MPCount is +shown to significantly improve counting accuracy compared to the state of the +art under diverse scenarios unobserved in the training data characterized by +narrow source distribution. Code is available at +https://github.com/Shimmer93/MPCount. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ One model to use them all: Training a segmentation model with + complementary datasets + + +
+ Understanding a surgical scene is crucial for computer-assisted surgery
+systems to provide any intelligent assistance functionality. One way of
+achieving this scene understanding is via scene segmentation, where every pixel
+of a frame is classified and therefore identifies the visible structures and
+tissues. Progress on fully segmenting surgical scenes has been made using
+machine learning. However, such models require large amounts of annotated
+training data, containing examples of all relevant object classes. Such fully
+annotated datasets are hard to create, as every pixel in a frame needs to be
+annotated by medical experts and, therefore, are rarely available. In this
+work, we propose a method to combine multiple partially annotated datasets,
+which provide complementary annotations, into one model, enabling better scene
+segmentation and the use of multiple readily available datasets. Our method
+aims to combine available data with complementary labels by leveraging mutually
+exclusive properties to maximize information. Specifically, we propose to use
+positive annotations of other classes as negative samples and to exclude
+background pixels of binary annotations, as we cannot tell if they contain a
+class not annotated but predicted by the model. We evaluate our method by
+training a DeepLabV3 on the publicly available Dresden Surgical Anatomy
+Dataset, which provides multiple subsets of binary segmented anatomical
+structures. Our approach successfully combines 6 classes into one model,
+increasing the overall Dice Score by 4.4% compared to an ensemble of models
+trained on the classes individually. By including information on multiple
+classes, we were able to reduce confusion between stomach and colon by 24%. Our
+results demonstrate the feasibility of training a model on multiple datasets.
+This paves the way for future work further alleviating the need for one large,
+fully segmented dataset.
+
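+ The combination rule described above boils down to a masked per-pixel loss:
+positives of the annotated class are supervised, positives of other classes
+can be reused as negatives by supervising them with their own class index, and
+unlabeled background pixels are ignored because their true class is unknown.
+A sketch with assumed class indices and an ignore-index convention:
+
+    import torch
+    import torch.nn.functional as F
+
+    def partial_ce(logits, binary_mask, annotated_class, ignore_index=255):
+        """logits: (B, C, H, W); binary_mask: (B, H, W) for one annotated class."""
+        target = torch.full_like(binary_mask, ignore_index)
+        target[binary_mask == 1] = annotated_class    # positives of this class
+        # Background pixels stay 'ignore': they may contain classes that simply
+        # were not annotated in this dataset, so they cannot serve as negatives.
+        # Positives of *other* classes, however, are safe negatives and can be
+        # supervised by calling partial_ce with that other class index.
+        return F.cross_entropy(logits, target, ignore_index=ignore_index)
+
+    logits = torch.randn(2, 7, 64, 64, requires_grad=True)  # 7 anatomical classes
+    stomach_mask = torch.randint(0, 2, (2, 64, 64))
+    loss = partial_ce(logits, stomach_mask, annotated_class=1)
+    loss.backward()
+    print(loss.item())
+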
+
+ comment: Accepted at IPCAI 2024; submitted to IJCARS (under revision) +
+
+
+
+
+ + ♻ ☆ Part-Attention Based Model Make Occluded Person Re-Identification + Stronger + + +
+ The goal of occluded person re-identification (ReID) is to retrieve specific
+pedestrians in occluded situations. However, occluded person ReID still suffers
+from background clutter and low-quality local feature representations, which
+limits model performance. In our research, we introduce a new framework called
+PAB-ReID, which is a novel ReID model incorporating part-attention mechanisms
+to tackle the aforementioned issues effectively. Firstly, we introduce the
+human parsing label to guide the generation of more accurate human part
+attention maps. In addition, we propose a fine-grained feature focuser for
+generating fine-grained human local feature representations while suppressing
+background interference. Moreover, we design a part triplet loss to supervise
+the learning of human local features, which optimizes intra/inter-class
+distance. We conducted extensive experiments on specialized occlusion and
+regular ReID datasets, showcasing that our approach outperforms the existing
+state-of-the-art methods.
+
+
+ comment: Accepted By International Joint Conference on Neural Networks 2024 +
+
+
+
+
+ + ♻ ☆ OMH: Structured Sparsity via Optimally Matched Hierarchy for + Unsupervised Semantic Segmentation + + +
+ Unsupervised Semantic Segmentation (USS) involves segmenting images without +relying on predefined labels, aiming to alleviate the burden of extensive human +labeling. Existing methods utilize features generated by self-supervised models +and specific priors for clustering. However, their clustering objectives are +not involved in the optimization of the features during training. Additionally, +due to the lack of clear class definitions in USS, the resulting segments may +not align well with the clustering objective. In this paper, we introduce a +novel approach called Optimally Matched Hierarchy (OMH) to simultaneously +address the above issues. The core of our method lies in imposing structured +sparsity on the feature space, which allows the features to encode information +with different levels of granularity. The structure of this sparsity stems from +our hierarchy (OMH). To achieve this, we learn a soft but sparse hierarchy +among parallel clusters through Optimal Transport. Our OMH yields better +unsupervised segmentation performance compared to existing USS methods. Our +extensive experiments demonstrate the benefits of OMH when utilizing our +differentiable paradigm. We will make our code publicly available. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ♻ ☆ SADA: Semantic adversarial unsupervised domain adaptation for Temporal + Action Localization + + +
+ Temporal Action Localization (TAL) is a complex task that poses relevant
+challenges, particularly when attempting to generalize on new -- unseen --
+domains in real-world applications. These scenarios, despite being realistic,
+are often neglected in the literature, exposing these solutions to significant
+performance degradation. In this work, we tackle this issue by introducing, for
+the first time, an approach for Unsupervised Domain Adaptation (UDA) in sparse
+TAL, which we refer to as Semantic Adversarial unsupervised Domain Adaptation
+(SADA). Our contributions are threefold: (1) we pioneer the development of a
+domain adaptation model that operates on realistic sparse action detection
+benchmarks; (2) we tackle the limitations of global-distribution alignment
+techniques by introducing a novel adversarial loss that is sensitive to local
+class distributions, ensuring finer-grained adaptation; and (3) we present a
+novel set of benchmarks based on EpicKitchens100 and CharadesEgo, that evaluate
+multiple domain shifts in a comprehensive manner. Our experiments indicate that
+SADA improves the adaptation across domains when compared to fully supervised
+state-of-the-art and alternative UDA methods, attaining a performance boost of
+up to 6.14% mAP.
+
+
+
+
+
+ + ♻ ☆ SCILLA: SurfaCe Implicit Learning for Large Urban Area, a volumetric + hybrid solution + + +
+ Neural implicit surface representation methods have recently shown impressive +3D reconstruction results. However, existing solutions struggle to reconstruct +urban outdoor scenes due to their large, unbounded, and highly detailed nature. +Hence, to achieve accurate reconstructions, additional supervision data such as +LiDAR, strong geometric priors, and long training times are required. To tackle +such issues, we present SCILLA, a new hybrid implicit surface learning method +to reconstruct large driving scenes from 2D images. SCILLA's hybrid +architecture models two separate implicit fields: one for the volumetric +density and another for the signed distance to the surface. To accurately +represent urban outdoor scenarios, we introduce a novel volume-rendering +strategy that relies on self-supervised probabilistic density estimation to +sample points near the surface and transition progressively from volumetric to +surface representation. Our solution permits a proper and fast initialization +of the signed distance field without relying on any geometric prior on the +scene, compared to concurrent methods. By conducting extensive experiments on +four outdoor driving datasets, we show that SCILLA can learn an accurate and +detailed 3D surface scene representation in various urban scenarios while being +two times faster to train compared to previous state-of-the-art solutions. + +
+
+
+
+
+ + ♻ ☆ QuickQuakeBuildings: Post-earthquake SAR-Optical Dataset for Quick + Damaged-building Detection + + +
+ Quick and automated earthquake-damaged building detection from post-event +satellite imagery is crucial, yet it is challenging due to the scarcity of +training data required to develop robust algorithms. This letter presents the +first dataset dedicated to detecting earthquake-damaged buildings from +post-event very high resolution (VHR) Synthetic Aperture Radar (SAR) and +optical imagery. Utilizing open satellite imagery and annotations acquired +after the 2023 Turkey-Syria earthquakes, we deliver a dataset of coregistered +building footprints and satellite image patches of both SAR and optical data, +encompassing more than four thousand buildings. The task of damaged building +detection is formulated as a binary image classification problem, that can also +be treated as an anomaly detection problem due to extreme class imbalance. We +provide baseline methods and results to serve as references for comparison. +Researchers can utilize this dataset to expedite algorithm development, +facilitating the rapid detection of damaged buildings in response to future +events. The dataset and codes together with detailed explanations and +visualization are made publicly available at +\url{https://github.com/ya0-sun/PostEQ-SARopt-BuildingDamage}. + +
+
+
+
+
+ + ♻ ☆ SPOT: Self-Training with Patch-Order Permutation for Object-Centric + Learning with Autoregressive Transformers CVPR 2024 + + +
+ Unsupervised object-centric learning aims to decompose scenes into +interpretable object entities, termed slots. Slot-based auto-encoders stand out +as a prominent method for this task. Within them, crucial aspects include +guiding the encoder to generate object-specific slots and ensuring the decoder +utilizes them during reconstruction. This work introduces two novel techniques, +(i) an attention-based self-training approach, which distills superior +slot-based attention masks from the decoder to the encoder, enhancing object +segmentation, and (ii) an innovative patch-order permutation strategy for +autoregressive transformers that strengthens the role of slot vectors in +reconstruction. The effectiveness of these strategies is showcased +experimentally. The combined approach significantly surpasses prior slot-based +autoencoder methods in unsupervised object segmentation, especially with +complex real-world images. We provide the implementation code at +https://github.com/gkakogeorgiou/spot . + +
+
+ comment: CVPR 2024 (Highlight). Code: https://github.com/gkakogeorgiou/spot +
+
+
+
+
+ + ♻ ☆ Learning Enriched Features via Selective State Spaces Model for + Efficient Image Deblurring + + +
+ Image deblurring aims to restore a high-quality image from its corresponding
+blurred observation. The emergence of CNNs and Transformers has enabled
+significant progress. However, these methods often face the dilemma between
+eliminating long-range degradation perturbations and maintaining computational
+efficiency. While the selective state space model (SSM) shows promise in
+modeling long-range dependencies with linear complexity, it also encounters
+challenges such as local pixel forgetting and channel redundancy. To address
+these issues, we propose an efficient image deblurring network that leverages a
+selective state space model to aggregate enriched and accurate features.
+Specifically, we introduce an aggregate local and global information block
+(ALGBlock) designed to effectively capture and integrate both local invariant
+properties and non-local information. The ALGBlock comprises two primary
+modules: a module for capturing local and global features (CLGF), and a feature
+aggregation module (FA). The CLGF module is composed of two branches: the
+global branch captures long-range dependency features via a selective state
+space model, while the local branch employs simplified channel attention to
+model local connectivity, thereby reducing local pixel forgetting and channel
+redundancy. In addition, we design an FA module to accentuate the local part by
+recalibrating the weight during the aggregation of the two branches for
+restoration. Experimental results demonstrate that the proposed method
+outperforms state-of-the-art approaches on widely used benchmarks.
+
+
+
+
+
+ + ♻ ☆ Sculpting Holistic 3D Representation in Contrastive Language-Image-3D + Pre-training CVPR 2024 + + +
+ Contrastive learning has emerged as a promising paradigm for 3D open-world +understanding, i.e., aligning point cloud representation to image and text +embedding space individually. In this paper, we introduce MixCon3D, a simple +yet effective method aiming to sculpt holistic 3D representation in contrastive +language-image-3D pre-training. In contrast to point cloud only, we develop the +3D object-level representation from complementary perspectives, e.g., +multi-view rendered images with the point cloud. Then, MixCon3D performs +language-3D contrastive learning, comprehensively depicting real-world 3D +objects and bolstering text alignment. Additionally, we pioneer the first +thorough investigation of various training recipes for the 3D contrastive +learning paradigm, building a solid baseline with improved performance. +Extensive experiments conducted on three representative benchmarks reveal that +our method significantly improves over the baseline, surpassing the previous +state-of-the-art performance on the challenging 1,156-category Objaverse-LVIS +dataset by 5.7%. The versatility of MixCon3D is showcased in applications such +as text-to-3D retrieval and point cloud captioning, further evidencing its +efficacy in diverse scenarios. The code is available at +https://github.com/UCSC-VLAA/MixCon3D. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ MO-YOLO: End-to-End Multiple-Object Tracking Method with YOLO and + Decoder + + +
+ In the field of multi-object tracking (MOT), recent Transformer-based
+end-to-end models like MOTR have demonstrated exceptional performance on
+datasets such as DanceTrack. However, the computational demands of these
+models present challenges in training and deployment. Drawing inspiration from
+successful models like GPT, we present MO-YOLO, an efficient and
+computationally frugal end-to-end MOT model. MO-YOLO integrates principles from
+You Only Look Once (YOLO) and RT-DETR, adopting a decoder-only approach. By
+leveraging the decoder from RT-DETR and architectural components from YOLOv8,
+MO-YOLO achieves high speed, shorter training times, and proficient MOT
+performance. On DanceTrack, MO-YOLO not only matches MOTR's performance but
+also surpasses it, achieving over twice the frames per second (MOTR 9.5 FPS,
+MO-YOLO 19.6 FPS). Furthermore, MO-YOLO demonstrates significantly reduced
+training times and lower hardware requirements compared to MOTR. This research
+introduces a promising paradigm for efficient end-to-end MOT, emphasizing
+enhanced performance and resource efficiency.
+
+
+
+
+
+ + ♻ ☆ GaussianCube: Structuring Gaussian Splatting using Optimal Transport for + 3D Generative Modeling + + +
+ 3D Gaussian Splatting (GS) has achieved considerable improvements over Neural
+Radiance Fields in terms of 3D fitting fidelity and rendering speed. However,
+this unstructured representation with scattered Gaussians poses a significant
+challenge for generative modeling. To address the problem, we introduce
+GaussianCube, a structured GS representation that is both powerful and
+efficient for generative modeling. We achieve this by first proposing a
+modified densification-constrained GS fitting algorithm which can yield
+high-quality fitting results using a fixed number of free Gaussians, and then
+re-arranging the Gaussians into a predefined voxel grid via Optimal Transport.
+The structured grid representation allows us to use a standard 3D U-Net as our
+backbone in diffusion generative modeling without elaborate designs. Extensive
+experiments conducted on ShapeNet and OmniObject3D show that our model achieves
+state-of-the-art generation results both qualitatively and quantitatively,
+underscoring the potential of GaussianCube as a powerful and versatile 3D
+representation.
+
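+ One way to realize the re-arrangement step mentioned above is a balanced
+assignment between fitted Gaussian centers and voxel-grid cells; the sketch
+below uses the Hungarian algorithm on squared distances as a stand-in for the
+paper's Optimal Transport formulation, with a toy number of Gaussians equal to
+the number of voxels.
+
+    import numpy as np
+    from scipy.optimize import linear_sum_assignment
+
+    rng = np.random.default_rng(0)
+    res = 8                                      # predefined 8 x 8 x 8 voxel grid
+    G = res ** 3                                 # fixed number of fitted Gaussians
+    gauss_xyz = rng.uniform(-1, 1, size=(G, 3))  # fitted Gaussian centers (toy)
+
+    axis = (np.arange(res) + 0.5) / res * 2 - 1
+    grid = np.stack(np.meshgrid(axis, axis, axis, indexing="ij"), -1).reshape(-1, 3)
+
+    # Balanced assignment: each Gaussian goes to a unique voxel so that the total
+    # squared displacement is minimal (Hungarian algorithm as an OT stand-in).
+    cost = ((gauss_xyz[:, None, :] - grid[None, :, :]) ** 2).sum(-1)
+    rows, cols = linear_sum_assignment(cost)
+    structured = np.zeros((res, res, res, 3))
+    structured.reshape(-1, 3)[cols] = gauss_xyz[rows]
+    print(structured.shape)                      # (8, 8, 8, 3): grid-ordered centers
+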
+
+ comment: Fix typo in Eq.2; Project Page: https://gaussiancube.github.io/ +
+
+
+
+
+ + ♻ ☆ Mind the Exit Pupil Gap: Revisiting the Intrinsics of a Standard + Plenoptic Camera + + +
+ Among the common applications of plenoptic cameras are depth reconstruction +and post-shot refocusing. These require a calibration relating the camera-side +light field to that of the scene. Numerous methods with this goal have been +developed based on thin lens models for the plenoptic camera's main lens and +microlenses. Our work addresses the often-overlooked role of the main lens exit +pupil in these models and specifically in the decoding process of standard +plenoptic camera (SPC) images. We formally deduce the connection between the +refocusing distance and the resampling parameter for the decoded light field +and provide an analysis of the errors that arise when the exit pupil is not +considered. In addition, previous work is revisited with respect to the exit +pupil's role and all theoretical results are validated through a +ray-tracing-based simulation. With the public release of the evaluated SPC +designs alongside our simulation and experimental data we aim to contribute to +a more accurate and nuanced understanding of plenoptic camera optics. + +
+
+ comment: 29 pages, 16 figures, Accepted for publication in MDPI Sensors, + Special Issue 'Short-Range Optical 3D Scanning and 3D Data Processing ' +
+
+
+
+
+ + ♻ ☆ Theoretical and Empirical Analysis of a Fast Algorithm for Extracting + Polygons from Signed Distance Bounds + + +
+ Recently, there has been renewed interest in signed distance bound
+representations due to their unique properties for 3D shape modelling. This is
+especially the case for deep learning-based bounds. However, it is beneficial
+to work with polygons in most computer-graphics applications. Thus, in this
+paper we introduce and investigate an asymptotically fast method for
+transforming signed distance bounds into polygon meshes. This is achieved by
+combining the principles of sphere tracing (or ray marching) with traditional
+polygonization techniques, such as Marching Cubes. We provide theoretical and
+experimental evidence that this approach has $O(N^2\log N)$ computational
+complexity for a polygonization grid with $N^3$ cells. The algorithm is tested
+on both a set of primitive shapes and signed distance bounds generated from
+point clouds by machine learning (and represented as neural networks). Given
+its speed, implementation simplicity and portability, we argue that it could
+prove useful during the modelling stage as well as in shape compression for
+storage.
+ The code is available here: https://github.com/nenadmarkus/gridhopping
+
+
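+ The core idea can be illustrated with a short sketch (an assumed simplification, not the
+gridhopping implementation): cast one ray per (x, y) grid column and let the signed distance
+bound skip empty space, so only cells near the surface are handed to Marching Cubes:
+
+import numpy as np
+
+def flag_surface_cells(sdf, n=64, lo=-1.0, hi=1.0):
+    h = (hi - lo) / n                          # cell size
+    occupied = np.zeros((n, n, n), dtype=bool)
+    for i in range(n):
+        for j in range(n):
+            x, y = lo + (i + 0.5) * h, lo + (j + 0.5) * h
+            z = lo
+            while z < hi:
+                d = sdf(np.array([x, y, z]))
+                if abs(d) < h:                 # the ray is inside a near-surface cell
+                    k = min(int((z - lo) / h), n - 1)
+                    occupied[i, j, k] = True
+                z += max(abs(d), h)            # sphere-tracing hop over empty space
+    return occupied                            # run Marching Cubes only on flagged cells
+
+# Example with a sphere of radius 0.5 as the signed distance bound:
+# cells = flag_surface_cells(lambda p: np.linalg.norm(p) - 0.5)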
+
+
+
+
+ + ♻ ☆ InstantAvatar: Efficient 3D Head Reconstruction via Surface Rendering + + +
+ Recent advances in full-head reconstruction have been obtained by optimizing
+a neural field through differentiable surface or volume rendering to represent
+a single scene. While these techniques achieve unprecedented accuracy, they
+take several minutes, or even hours, due to the expensive optimization process
+required. In this work, we introduce InstantAvatar, a method that recovers
+full-head avatars from a few images (down to just one) in a few seconds on
+commodity hardware. In order to speed up the reconstruction process, we propose
+a system that combines, for the first time, a voxel-grid neural field
+representation with a surface renderer. Notably, a naive combination of these
+two techniques leads to unstable optimizations that do not converge to valid
+solutions. In order to overcome this limitation, we present a novel statistical
+model that learns a prior distribution over 3D head signed distance functions
+using a voxel-grid based architecture. The use of this prior model, in
+combination with other design choices, results in a system that achieves 3D
+head reconstructions with accuracy comparable to the state-of-the-art at a
+100x speed-up.
+
+
+
+
+
+
+ + ♻ ☆ EVREAL: Towards a Comprehensive Benchmark and Analysis Suite for + Event-based Video Reconstruction CVPR + + +
+ Event cameras are a new type of vision sensor that incorporates asynchronous +and independent pixels, offering advantages over traditional frame-based +cameras such as high dynamic range and minimal motion blur. However, their +output is not easily understandable by humans, making the reconstruction of +intensity images from event streams a fundamental task in event-based vision. +While recent deep learning-based methods have shown promise in video +reconstruction from events, this problem is not completely solved yet. To +facilitate comparison between different approaches, standardized evaluation +protocols and diverse test datasets are essential. This paper proposes a +unified evaluation methodology and introduces an open-source framework called +EVREAL to comprehensively benchmark and analyze various event-based video +reconstruction methods from the literature. Using EVREAL, we give a detailed +analysis of the state-of-the-art methods for event-based video reconstruction, +and provide valuable insights into the performance of these methods under +varying settings, challenging scenarios, and downstream tasks. + +
+
+ comment: 19 pages, 9 figures. Has been accepted for publication at the IEEE + Conference on Computer Vision and Pattern Recognition Workshops (CVPRW), + Vancouver, 2023. The project page can be found at + https://ercanburak.github.io/evreal.html +
+
+
+
+
+ + ♻ ☆ Causal Mode Multiplexer: A Novel Framework for Unbiased Multispectral + Pedestrian Detection CVPR2024 + + +
+ RGBT multispectral pedestrian detection has emerged as a promising solution
+for safety-critical applications that require day/night operations. However,
+the modality bias problem remains unsolved, as multispectral pedestrian
+detectors learn the statistical bias in datasets. Specifically, datasets in
+multispectral pedestrian detection are mainly distributed between ROTO (day)
+and RXTO (night) data; the majority of the pedestrian labels statistically
+co-occur with their thermal features. As a result, multispectral pedestrian
+detectors show poor generalization ability on examples beyond this statistical
+correlation, such as ROTX data. To address this problem, we propose a novel
+Causal Mode Multiplexer (CMM) framework that effectively learns the causalities
+between multispectral inputs and predictions. Moreover, we construct a new
+dataset (ROTX-MP) to evaluate modality bias in multispectral pedestrian
+detection. ROTX-MP mainly includes ROTX examples not present in previous
+datasets. Extensive experiments demonstrate that our proposed CMM framework
+generalizes well on existing datasets (KAIST, CVC-14, FLIR) and the new
+ROTX-MP. We will release our new dataset to the public for future research.
+
+
+
+ comment: CVPR2024 +
+
+
+
+
+ + ♻ ☆ Learning Prompt with Distribution-Based Feature Replay for Few-Shot + Class-Incremental Learning + + +
+ Few-shot Class-Incremental Learning (FSCIL) aims to continuously learn new
+classes based on very limited training data without forgetting the old ones
+encountered. Existing studies have relied solely on pure visual networks, while
+in this paper we solve FSCIL by leveraging a Vision-Language model (e.g., CLIP)
+and propose a simple yet effective framework, named Learning Prompt with
+Distribution-based Feature Replay (LP-DiF). We observe that simply using CLIP
+for zero-shot evaluation can substantially outperform the most influential
+methods. A prompt tuning technique is then employed to further improve the
+model's adaptation ability, allowing it to continually capture specific
+knowledge from each session. To prevent the learnable prompt from forgetting
+old knowledge in the new session, we propose a pseudo-feature replay approach.
+Specifically, we preserve the old knowledge of each class by maintaining a
+feature-level Gaussian distribution with a diagonal covariance matrix, which is
+estimated from the image features of training images and synthesized features
+generated from a VAE. When progressing to a new session, pseudo-features are
+sampled from old-class distributions and combined with training images of the
+current session to optimize the prompt, thus enabling the model to learn new
+knowledge while retaining old knowledge. Experiments on three prevalent
+benchmarks, i.e., CIFAR100, mini-ImageNet, and CUB-200, and two more challenging
+benchmarks, i.e., SUN-397 and CUB-200$^*$ proposed in this paper, showcase the
+superiority of LP-DiF, achieving new state-of-the-art (SOTA) results in FSCIL.
+Code is publicly available at https://github.com/1170300714/LP-DiF.
+
+
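+ A minimal sketch of the pseudo-feature replay idea (an assumption for illustration, not the
+LP-DiF code; feature extraction and prompt optimization are omitted):
+
+import torch
+
+class FeatureReplayBuffer:
+    def __init__(self):
+        self.stats = {}                                  # class id -> (mean, std)
+
+    def register(self, class_id, feats):                 # feats: (N, D) image features
+        self.stats[class_id] = (feats.mean(0), feats.std(0) + 1e-6)
+
+    def sample(self, n_per_class):
+        # draw pseudo-features from each old class's diagonal Gaussian
+        feats, labels = [], []
+        for cid, (mu, sigma) in self.stats.items():
+            feats.append(mu + sigma * torch.randn(n_per_class, mu.numel()))
+            labels += [cid] * n_per_class
+        return torch.cat(feats), torch.tensor(labels)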
+
+
+
+
+ + ♻ ☆ Your Student is Better Than Expected: Adaptive Teacher-Student + Collaboration for Text-Conditional Diffusion Models CVPR2024 + + +
+ Knowledge distillation methods have recently been shown to be a promising
+direction for speeding up the synthesis of large-scale diffusion models by
+requiring only a few inference steps. While several powerful distillation
+methods were recently proposed, the overall quality of student samples is
+typically lower compared to the teacher ones, which hinders their practical
+usage. In this work, we investigate the relative quality of samples produced by
+the teacher text-to-image diffusion model and its distilled student version. As
+our main empirical finding, we discover that a noticeable portion of student
+samples exhibit superior fidelity compared to the teacher ones, despite the
+"approximate" nature of the student. Based on this finding, we propose an
+adaptive collaboration between student and teacher diffusion models for
+effective text-to-image synthesis. Specifically, the distilled model produces
+the initial sample, and then an oracle decides whether it needs further
+improvements with a slow teacher model. Extensive experiments demonstrate that
+the designed pipeline surpasses state-of-the-art text-to-image alternatives for
+various inference budgets in terms of human preference. Furthermore, the
+proposed approach can be naturally used in popular applications such as
+text-guided image editing and controllable generation.
+
+
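+ The routing logic can be summarized in a few lines (a hedged sketch; student, teacher and
+oracle are hypothetical callables, and the threshold is an assumed tuning knob rather than
+the paper's exact criterion):
+
+def adaptive_generate(prompt, student, teacher, oracle, threshold=0.5):
+    # student(prompt) -> image from the few-step distilled model (cheap);
+    # teacher(prompt, init=image) -> refined image from the full diffusion model (slow);
+    # oracle(prompt, image) -> scalar quality estimate, e.g. an image-text score.
+    image = student(prompt)
+    if oracle(prompt, image) >= threshold:     # good enough: keep the student sample
+        return image
+    return teacher(prompt, init=image)         # otherwise hand it to the slow teacher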
+
+ comment: CVPR2024 camera ready v2 +
+
+
+
+
+ + ♻ ☆ Predicting Traffic Flow with Federated Learning and Graph Neural with + Asynchronous Computations Network + + +
+ Real-time traffic flow prediction holds significant importance within the +domain of Intelligent Transportation Systems (ITS). The task of achieving a +balance between prediction precision and computational efficiency presents a +significant challenge. In this article, we present a novel deep-learning method +called Federated Learning and Asynchronous Graph Convolutional Network +(FLAGCN). Our framework incorporates the principles of asynchronous graph +convolutional networks with federated learning to enhance the accuracy and +efficiency of real-time traffic flow prediction. The FLAGCN model employs a +spatial-temporal graph convolution technique to asynchronously address +spatio-temporal dependencies within traffic data effectively. To efficiently +handle the computational requirements associated with this deep learning model, +this study used a graph federated learning technique known as GraphFL. This +approach is designed to facilitate the training process. The experimental +results obtained from conducting tests on two distinct traffic datasets +demonstrate that the utilization of FLAGCN leads to the optimization of both +training and inference durations while maintaining a high level of prediction +accuracy. FLAGCN outperforms existing models with significant improvements by +achieving up to approximately 6.85% reduction in RMSE, 20.45% reduction in +MAPE, compared to the best-performing existing models. + +
+
+ comment: I request to withdraw my paper from arXiv due to significant updates + and improvements identified post-submission. These enhancements will + substantially elevate the work's quality and impact. I plan to resubmit the + revised paper upon completion of these updates. Thank you for accommodating + this request +
+
+
+
+
+ + ♻ ☆ Generalizable Whole Slide Image Classification with Fine-Grained + Visual-Semantic Interaction CVPR 2024 + + +
+ Whole Slide Image (WSI) classification is often formulated as a Multiple +Instance Learning (MIL) problem. Recently, Vision-Language Models (VLMs) have +demonstrated remarkable performance in WSI classification. However, existing +methods leverage coarse-grained pathogenetic descriptions for visual +representation supervision, which are insufficient to capture the complex +visual appearance of pathogenetic images, hindering the generalizability of +models on diverse downstream tasks. Additionally, processing high-resolution +WSIs can be computationally expensive. In this paper, we propose a novel +"Fine-grained Visual-Semantic Interaction" (FiVE) framework for WSI +classification. It is designed to enhance the model's generalizability by +leveraging the interaction between localized visual patterns and fine-grained +pathological semantics. Specifically, with meticulously designed queries, we +start by utilizing a large language model to extract fine-grained pathological +descriptions from various non-standardized raw reports. The output descriptions +are then reconstructed into fine-grained labels used for training. By +introducing a Task-specific Fine-grained Semantics (TFS) module, we enable +prompts to capture crucial visual information in WSIs, which enhances +representation learning and augments generalization capabilities significantly. +Furthermore, given that pathological visual patterns are redundantly +distributed across tissue slices, we sample a subset of visual instances during +training. Our method demonstrates robust generalizability and strong +transferability, dominantly outperforming the counterparts on the TCGA Lung +Cancer dataset with at least 9.19% higher accuracy in few-shot experiments. The +code is available at: https://github.com/ls1rius/WSI_FiVE. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ Dynamic Adapter Meets Prompt Tuning: Parameter-Efficient Transfer + Learning for Point Cloud Analysis CVPR 2024 + + +
+ Point cloud analysis has achieved outstanding performance by transferring +point cloud pre-trained models. However, existing methods for model adaptation +usually update all model parameters, i.e., full fine-tuning paradigm, which is +inefficient as it relies on high computational costs (e.g., training GPU +memory) and massive storage space. In this paper, we aim to study +parameter-efficient transfer learning for point cloud analysis with an ideal +trade-off between task performance and parameter efficiency. To achieve this +goal, we freeze the parameters of the default pre-trained models and then +propose the Dynamic Adapter, which generates a dynamic scale for each token, +considering the token significance to the downstream task. We further +seamlessly integrate Dynamic Adapter with Prompt Tuning (DAPT) by constructing +Internal Prompts, capturing the instance-specific features for interaction. +Extensive experiments conducted on five challenging datasets demonstrate that +the proposed DAPT achieves superior performance compared to the full +fine-tuning counterparts while significantly reducing the trainable parameters +and training GPU memory by 95% and 35%, respectively. Code is available at +https://github.com/LMD0311/DAPT. + +
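+ A minimal sketch of what a dynamic, per-token adapter scale could look like (an assumption
+for illustration only; the actual DAPT module and its prompt interaction are in the linked
+repository):
+
+import torch
+import torch.nn as nn
+
+class DynamicAdapter(nn.Module):
+    def __init__(self, dim, bottleneck=16):
+        super().__init__()
+        self.down = nn.Linear(dim, bottleneck)
+        self.up = nn.Linear(bottleneck, dim)
+        self.scale = nn.Sequential(nn.Linear(dim, 1), nn.Sigmoid())  # token-dependent scale
+
+    def forward(self, tokens):                 # tokens: (B, N, dim) from a frozen backbone
+        delta = self.up(torch.relu(self.down(tokens)))
+        return tokens + self.scale(tokens) * delta   # larger updates for significant tokens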
+
+ comment: Accepted to CVPR 2024. Code is available at + https://github.com/LMD0311/DAPT +
+
+
+
+
+ + ♻ ☆ CapsFusion: Rethinking Image-Text Data at Scale CVPR 2024 + + +
+ Large multimodal models demonstrate remarkable generalist ability to perform +diverse multimodal tasks in a zero-shot manner. Large-scale web-based +image-text pairs contribute fundamentally to this success, but suffer from +excessive noise. Recent studies use alternative captions synthesized by +captioning models and have achieved notable benchmark performance. However, our +experiments reveal significant Scalability Deficiency and World Knowledge Loss +issues in models trained with synthetic captions, which have been largely +obscured by their initial benchmark success. Upon closer examination, we +identify the root cause as the overly-simplified language structure and lack of +knowledge details in existing synthetic captions. To provide higher-quality and +more scalable multimodal pretraining data, we propose CapsFusion, an advanced +framework that leverages large language models to consolidate and refine +information from both web-based image-text pairs and synthetic captions. +Extensive experiments show that CapsFusion captions exhibit remarkable +all-round superiority over existing captions in terms of model performance +(e.g., 18.8 and 18.3 improvements in CIDEr score on COCO and NoCaps), sample +efficiency (requiring 11-16 times less computation than baselines), world +knowledge depth, and scalability. These effectiveness, efficiency and +scalability advantages position CapsFusion as a promising candidate for future +scaling of LMM training. + +
+
+ comment: CVPR 2024. Code & Dataset: https://github.com/baaivision/CapsFusion +
+
+
+
+
+ + ♻ ☆ Visual Program Distillation: Distilling Tools and Programmatic Reasoning + into Vision-Language Models CVPR 2024 + + +
+ Solving complex visual tasks such as "Who invented the musical instrument on +the right?" involves a composition of skills: understanding space, recognizing +instruments, and also retrieving prior knowledge. Recent work shows promise by +decomposing such tasks using a large language model (LLM) into an executable +program that invokes specialized vision models. However, generated programs are +error-prone: they omit necessary steps, include spurious ones, and are unable +to recover when the specialized models give incorrect outputs. Moreover, they +require loading multiple models, incurring high latency and computation costs. +We propose Visual Program Distillation (VPD), an instruction tuning framework +that produces a vision-language model (VLM) capable of solving complex visual +tasks with a single forward pass. VPD distills the reasoning ability of LLMs by +using them to sample multiple candidate programs, which are then executed and +verified to identify a correct one. It translates each correct program into a +language description of the reasoning steps, which are then distilled into a +VLM. Extensive experiments show that VPD improves the VLM's ability to count, +understand spatial relations, and reason compositionally. Our VPD-trained +PaLI-X outperforms all prior VLMs, achieving state-of-the-art performance +across complex vision tasks, including MMBench, OK-VQA, A-OKVQA, TallyQA, POPE, +and Hateful Memes. An evaluation with human annotators also confirms that VPD +improves model response factuality and consistency. Finally, experiments on +content moderation demonstrate that VPD is also helpful for adaptation to +real-world applications with limited data. + +
+
+ comment: CVPR 2024 Oral +
+
+
+
+
+ + ♻ ☆ Detecting Heart Disease from Multi-View Ultrasound Images via Supervised + Attention Multiple Instance Learning + + +
+ Aortic stenosis (AS) is a degenerative valve condition that causes +substantial morbidity and mortality. This condition is under-diagnosed and +under-treated. In clinical practice, AS is diagnosed with expert review of +transthoracic echocardiography, which produces dozens of ultrasound images of +the heart. Only some of these views show the aortic valve. To automate +screening for AS, deep networks must learn to mimic a human expert's ability to +identify views of the aortic valve then aggregate across these relevant images +to produce a study-level diagnosis. We find previous approaches to AS detection +yield insufficient accuracy due to relying on inflexible averages across +images. We further find that off-the-shelf attention-based multiple instance +learning (MIL) performs poorly. We contribute a new end-to-end MIL approach +with two key methodological innovations. First, a supervised attention +technique guides the learned attention mechanism to favor relevant views. +Second, a novel self-supervised pretraining strategy applies contrastive +learning on the representation of the whole study instead of individual images +as commonly done in prior literature. Experiments on an open-access dataset and +an external validation set show that our approach yields higher accuracy while +reducing model size. + +
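+ A compact sketch of attention-based MIL pooling with a supervised-attention term (an
+illustrative assumption, not the authors' implementation; view features and 0/1 relevance
+labels are placeholders):
+
+import torch
+import torch.nn as nn
+
+class SupervisedAttentionMIL(nn.Module):
+    def __init__(self, feat_dim, n_classes):
+        super().__init__()
+        self.attn = nn.Linear(feat_dim, 1)
+        self.head = nn.Linear(feat_dim, n_classes)
+
+    def forward(self, views, relevant=None):
+        # views: (N, feat_dim) features of one study's images; relevant: (N,) 0/1 labels
+        a = torch.softmax(self.attn(views).squeeze(-1), dim=0)    # attention over views
+        logits = self.head((a.unsqueeze(-1) * views).sum(0))      # study-level prediction
+        attn_loss = None
+        if relevant is not None:
+            # guide attention mass onto the views labeled as showing the aortic valve
+            target = relevant.float() / relevant.float().sum().clamp(min=1.0)
+            attn_loss = -(target * torch.log(a + 1e-8)).sum()
+        return logits, attn_loss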
+
+ comment: Echocardiogram; multiple-instance learning; self-supervised learning; + semi-supervised learning; medical imaging +
+
+
+
+
+ + ♻ ☆ FashionEngine: Interactive Generation and Editing of 3D Clothed Humans + + +
+ We present FashionEngine, an interactive 3D human generation and editing +system that allows us to design 3D digital humans in a way that aligns with how +humans interact with the world, such as natural languages, visual perceptions, +and hand-drawing. FashionEngine automates the 3D human production with three +key components: 1) A pre-trained 3D human diffusion model that learns to model +3D humans in a semantic UV latent space from 2D image training data, which +provides strong priors for diverse generation and editing tasks. 2) +Multimodality-UV Space encoding the texture appearance, shape topology, and +textual semantics of human clothing in a canonical UV-aligned space, which +faithfully aligns the user multimodal inputs with the implicit UV latent space +for controllable 3D human editing. The multimodality-UV space is shared across +different user inputs, such as texts, images, and sketches, which enables +various joint multimodal editing tasks. 3) Multimodality-UV Aligned Sampler +learns to sample high-quality and diverse 3D humans from the diffusion prior +for multimodal user inputs. Extensive experiments validate FashionEngine's +state-of-the-art performance for conditional generation/editing tasks. In +addition, we present an interactive user interface for our FashionEngine that +enables both conditional and unconditional generation tasks, and editing tasks +including pose/view/shape control, text-, image-, and sketch-driven 3D human +editing and 3D virtual try-on, in a unified framework. Our project page is at: +https://taohuumd.github.io/projects/FashionEngine. + +
+
+ comment: Project Page: https://taohuumd.github.io/projects/FashionEngine +
+
+
+
+
+ + ♻ ☆ WaterVG: Waterway Visual Grounding based on Text-Guided Vision and + mmWave Radar + + +
+ The perception of waterways based on human intent is significant for
+autonomous navigation and operations of Unmanned Surface Vehicles (USVs) in
+water environments. Inspired by visual grounding, we introduce WaterVG, the
+first visual grounding dataset designed for USV-based waterway perception based
+on human prompts. WaterVG encompasses prompts describing multiple targets, with
+annotations at the instance level including bounding boxes and masks. Notably,
+WaterVG includes 11,568 samples with 34,987 referred targets, whose prompts
+integrate both visual and radar characteristics. This text-guided, two-sensor
+design pairs fine-grained text prompts with the visual and radar features of
+the referred targets. Moreover, we propose a low-power visual grounding model,
+Potamoi, which is a multi-task model with a well-designed Phased Heterogeneous
+Modality Fusion (PHMF) mode, including Adaptive Radar Weighting (ARW) and
+Multi-Head Slim Cross Attention (MHSCA). Specifically, ARW extracts the
+required radar features to fuse with vision for prompt alignment. MHSCA is an
+efficient fusion module with a remarkably small parameter count and FLOPs,
+elegantly fusing scenario context captured by the two sensors with linguistic
+features, which performs impressively on visual grounding tasks. Comprehensive
+experiments and evaluations have been conducted on WaterVG, where our Potamoi
+achieves state-of-the-art performance compared with its counterparts.
+
+
+
+ comment: 10 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ RadarDistill: Boosting Radar-based Object Detection Performance via + Knowledge Distillation from LiDAR Features CVPR + + +
+ The inherent noisy and sparse characteristics of radar data pose challenges +in finding effective representations for 3D object detection. In this paper, we +propose RadarDistill, a novel knowledge distillation (KD) method, which can +improve the representation of radar data by leveraging LiDAR data. RadarDistill +successfully transfers desirable characteristics of LiDAR features into radar +features using three key components: Cross-Modality Alignment (CMA), +Activation-based Feature Distillation (AFD), and Proposal-based Feature +Distillation (PFD). CMA enhances the density of radar features by employing +multiple layers of dilation operations, effectively addressing the challenge of +inefficient knowledge transfer from LiDAR to radar. AFD selectively transfers +knowledge based on regions of the LiDAR features, with a specific focus on +areas where activation intensity exceeds a predefined threshold. PFD similarly +guides the radar network to selectively mimic features from the LiDAR network +within the object proposals. Our comparative analyses conducted on the nuScenes +datasets demonstrate that RadarDistill achieves state-of-the-art (SOTA) +performance for radar-only object detection task, recording 20.5% in mAP and +43.7% in NDS. Also, RadarDistill significantly improves the performance of the +camera-radar fusion model. + +
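+ The activation-based selection in AFD can be pictured with a short sketch (an assumed
+simplification, not the RadarDistill code; the thresholding rule and feature shapes are
+illustrative):
+
+import torch
+import torch.nn.functional as F
+
+def activation_based_distill(radar_feat, lidar_feat, threshold=0.5):
+    # radar_feat, lidar_feat: (B, C, H, W) BEV feature maps on the same grid.
+    activation = lidar_feat.abs().mean(dim=1, keepdim=True)              # (B, 1, H, W)
+    mask = (activation > threshold * activation.amax(dim=(2, 3), keepdim=True)).float()
+    per_pixel = F.mse_loss(radar_feat, lidar_feat, reduction="none").mean(dim=1, keepdim=True)
+    return (per_pixel * mask).sum() / mask.sum().clamp(min=1.0)   # distill where LiDAR is active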
+
+ comment: Accepted to IEEE/CVF Conference on Computer Vision and Pattern + Recognition (CVPR) 2024, 10 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ 94% on CIFAR-10 in 3.29 Seconds on a Single GPU + + +
+ CIFAR-10 is among the most widely used datasets in machine learning, +facilitating thousands of research projects per year. To accelerate research +and reduce the cost of experiments, we introduce training methods for CIFAR-10 +which reach 94% accuracy in 3.29 seconds, 95% in 10.4 seconds, and 96% in 46.3 +seconds, when run on a single NVIDIA A100 GPU. As one factor contributing to +these training speeds, we propose a derandomized variant of horizontal flipping +augmentation, which we show improves over the standard method in every case +where flipping is beneficial over no flipping at all. Our code is released at +https://github.com/KellerJordan/cifar10-airbench. + +
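+ One way to read "derandomized" flipping is to alternate each image's flip state across
+epochs instead of tossing an i.i.d. coin, so every image is seen flipped and unflipped
+equally often (a hedged sketch of that reading; the released airbench code may differ):
+
+import torch
+
+def maybe_flip(batch, indices, epoch):
+    # batch: (B, C, H, W) images; indices: (B,) dataset indices of those images.
+    flip = (indices + epoch) % 2 == 1              # deterministic, balanced over epochs
+    out = batch.clone()
+    out[flip] = torch.flip(out[flip], dims=[-1])   # horizontal flip along the width axis
+    return out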
+
+
+
+
+
+
+
+
+ +
+
+
+ + Robotics 46 + +
+
+
+ + ☆ PreAfford: Universal Affordance-Based Pre-Grasping for Diverse Objects + and Environments + + +
+ Robotic manipulation of ungraspable objects with two-finger grippers presents +significant challenges due to the paucity of graspable features, while +traditional pre-grasping techniques, which rely on repositioning objects and +leveraging external aids like table edges, lack the adaptability across object +categories and scenes. Addressing this, we introduce PreAfford, a novel +pre-grasping planning framework that utilizes a point-level affordance +representation and a relay training approach to enhance adaptability across a +broad range of environments and object types, including those previously +unseen. Demonstrated on the ShapeNet-v2 dataset, PreAfford significantly +improves grasping success rates by 69% and validates its practicality through +real-world experiments. This work offers a robust and adaptable solution for +manipulating ungraspable objects. + +
+
+ comment: Project Page: https://air-discover.github.io/PreAfford/ +
+
+
+
+
+ + ☆ ROBUST: 221 Bugs in the Robot Operating System + + +
+ As robotic systems such as autonomous cars and delivery drones assume greater
+roles and responsibilities within society, the likelihood and impact of
+catastrophic software failure within those systems are increased. To aid
+researchers in the development of new methods to measure and assure the safety
+and quality of robotics software, we systematically curated a dataset of 221
+bugs across 7 popular and diverse software systems implemented via the Robot
+Operating System (ROS). We produce historically accurate recreations of each of
+the 221 defective software versions in the form of Docker images, and use a
+grounded theory approach to examine and categorize their corresponding faults,
+failures, and fixes. Finally, we reflect on the implications of our findings
+and outline future research directions for the community.
+
+
+
+
+
+
+ + ☆ Anticipate & Collab: Data-driven Task Anticipation and Knowledge-driven + Planning for Human-robot Collaboration + + +
+ An agent assisting humans in daily living activities can collaborate more +effectively by anticipating upcoming tasks. Data-driven methods represent the +state of the art in task anticipation, planning, and related problems, but +these methods are resource-hungry and opaque. Our prior work introduced a proof +of concept framework that used an LLM to anticipate 3 high-level tasks that +served as goals for a classical planning system that computed a sequence of +low-level actions for the agent to achieve these goals. This paper describes +DaTAPlan, our framework that significantly extends our prior work toward +human-robot collaboration. Specifically, DaTAPlan planner computes actions for +an agent and a human to collaboratively and jointly achieve the tasks +anticipated by the LLM, and the agent automatically adapts to unexpected +changes in human action outcomes and preferences. We evaluate DaTAPlan +capabilities in a realistic simulation environment, demonstrating accurate task +anticipation, effective human-robot collaboration, and the ability to adapt to +unexpected changes. Project website: https://dataplan-hrc.github.io + +
+
+
+
+
+ + ☆ Embodied AI with Two Arms: Zero-shot Learning, Safety and Modularity + + +
+ We present an embodied AI system which receives open-ended natural language
+instructions from a human, and controls two arms to collaboratively accomplish
+potentially long-horizon tasks over a large workspace. Our system is modular:
+it deploys state-of-the-art Large Language Models for task planning,
+Vision-Language models for semantic perception, and Point Cloud transformers
+for grasping. With semantic and physical safety in mind, these modules are
+interfaced with a real-time trajectory optimizer and a compliant tracking
+controller to enable human-robot proximity. We demonstrate performance for the
+following tasks: bi-arm sorting, bottle opening, and trash disposal. These are
+done zero-shot, where the models used have not been trained with any real-world
+data from this bi-arm robot, scenes or workspace. Composing both learning- and
+non-learning-based components in a modular fashion with interpretable inputs
+and outputs allows the user to easily debug points of failure and fragility.
+One may also in-place swap modules to improve the robustness of the overall
+platform, for instance with imitation-learned policies.
+
+
+
+
+
+
+ + ☆ Factored Task and Motion Planning with Combined Optimization, Sampling + and Learning + + +
+ In this thesis, we aim to improve the performance of TAMP algorithms from +three complementary perspectives. First, we investigate the integration of +discrete task planning with continuous trajectory optimization. Our main +contribution is a conflict-based solver that automatically discovers why a task +plan might fail when considering the constraints of the physical world. This +information is then fed back into the task planner, resulting in an efficient, +bidirectional, and intuitive interface between task and motion, capable of +solving TAMP problems with multiple objects, robots, and tight physical +constraints. In the second part, we first illustrate that, given the wide range +of tasks and environments within TAMP, neither sampling nor optimization is +superior in all settings. To combine the strengths of both approaches, we have +designed meta-solvers for TAMP, adaptive solvers that automatically select +which algorithms and computations to use and how to best decompose each problem +to find a solution faster. In the third part, we combine deep learning +architectures with model-based reasoning to accelerate computations within our +TAMP solver. Specifically, we target infeasibility detection and nonlinear +optimization, focusing on generalization, accuracy, compute time, and data +efficiency. At the core of our contributions is a refined, factored +representation of the trajectory optimization problems inside TAMP. This +structure not only facilitates more efficient planning, encoding of geometric +infeasibility, and meta-reasoning but also provides better generalization in +neural architectures. + +
+
+ comment: PhD Thesis, TU Berlin +
+
+
+
+
+ + ☆ Robot Safety Monitoring using Programmable Light Curtains IROS '24 + + +
+ As factories continue to evolve into collaborative spaces with multiple +robots working together with human supervisors in the loop, ensuring safety for +all actors involved becomes critical. Currently, laser-based light curtain +sensors are widely used in factories for safety monitoring. While these +conventional safety sensors meet high accuracy standards, they are difficult to +reconfigure and can only monitor a fixed user-defined region of space. +Furthermore, they are typically expensive. Instead, we leverage a controllable +depth sensor, programmable light curtains (PLC), to develop an inexpensive and +flexible real-time safety monitoring system for collaborative robot workspaces. +Our system projects virtual dynamic safety envelopes that tightly envelop the +moving robot at all times and detect any objects that intrude the envelope. +Furthermore, we develop an instrumentation algorithm that optimally places +(multiple) PLCs in a workspace to maximize the visibility coverage of robots. +Our work enables fence-less human-robot collaboration, while scaling to monitor +multiple robots with few sensors. We analyze our system in a real manufacturing +testbed with four robot arms and demonstrate its capabilities as a fast, +accurate, and inexpensive safety monitoring solution. + +
+
+ comment: Under review for IROS '24. Webpage + http://cmu-mfi.github.io/plc-safety +
+
+
+
+
+ + ☆ Integrating Large Language Models with Multimodal Virtual Reality + Interfaces to Support Collaborative Human-Robot Construction Work + + +
+ In the construction industry, where work environments are complex, +unstructured and often dangerous, the implementation of Human-Robot +Collaboration (HRC) is emerging as a promising advancement. This underlines the +critical need for intuitive communication interfaces that enable construction +workers to collaborate seamlessly with robotic assistants. This study +introduces a conversational Virtual Reality (VR) interface integrating +multimodal interaction to enhance intuitive communication between construction +workers and robots. By integrating voice and controller inputs with the Robot +Operating System (ROS), Building Information Modeling (BIM), and a game engine +featuring a chat interface powered by a Large Language Model (LLM), the +proposed system enables intuitive and precise interaction within a VR setting. +Evaluated by twelve construction workers through a drywall installation case +study, the proposed system demonstrated its low workload and high usability +with succinct command inputs. The proposed multimodal interaction system +suggests that such technological integration can substantially advance the +integration of robotic assistants in the construction industry. + +
+
+ comment: 39 pages, 16 figures, 5 tables +
+
+
+
+
+ + ☆ A Methodology to Study the Impact of Spiking Neural Network Parameters + considering Event-Based Automotive Data + + +
+ Autonomous Driving (AD) systems are considered the future of human mobility
+and transportation. Solving computer vision tasks such as image classification
+and object detection/segmentation with high accuracy and low power/energy
+consumption is highly needed to realize AD systems in real life. These
+requirements can potentially be satisfied by Spiking Neural Networks (SNNs).
+However, the state-of-the-art works in SNN-based AD systems still focus on
+proposing network models that can achieve high accuracy, and they have not
+systematically studied the roles of SNN parameters when used for learning
+event-based automotive data. Therefore, we still lack understanding of how to
+effectively develop SNN models for AD systems. Toward this, we propose a novel
+methodology to systematically study and analyze the impact of SNN parameters
+considering event-based automotive data, then leverage this analysis for
+enhancing SNN developments. To do this, we first explore different settings of
+SNN parameters that directly affect the learning mechanism (i.e., batch size,
+learning rate, neuron threshold potential, and weight decay), then analyze the
+accuracy results. Afterward, we propose techniques that jointly improve SNN
+accuracy and reduce training time. Experimental results show that our
+methodology can improve SNN models for AD systems beyond the state-of-the-art,
+as it achieves higher accuracy (i.e., 86%) for the NCARS dataset, and it can
+also achieve iso-accuracy (i.e., ~85% with standard deviation less than 0.5%)
+while speeding up the training time by 1.9x. In this manner, our research work
+provides a set of guidelines for SNN parameter enhancements, thereby enabling
+the practical developments of SNN-based AD systems.
+
+
+
+ comment: 7 pages, 13 figures, 1 table +
+
+
+
+
+ + ☆ Design of Stickbug: a Six-Armed Precision Pollination Robot + + +
+ This work presents the design of Stickbug, a six-armed, multi-agent, +precision pollination robot that combines the accuracy of single-agent systems +with swarm parallelization in greenhouses. Precision pollination robots have +often been proposed to offset the effects of a decreasing population of natural +pollinators, but they frequently lack the required parallelization and +scalability. Stickbug achieves this by allowing each arm and drive base to act +as an individual agent, significantly reducing planning complexity. Stickbug +uses a compact holonomic Kiwi drive to navigate narrow greenhouse rows, a tall +mast to support multiple manipulators and reach plant heights, a detection +model and classifier to identify Bramble flowers, and a felt-tipped +end-effector for contact-based pollination. Initial experimental validation +demonstrates that Stickbug can attempt over 1.5 pollinations per minute with a +50% success rate. Additionally, a Bramble flower perception dataset was created +and is publicly available alongside Stickbug's software and design files. + +
+
+ comment: 7 pages, 7 figures +
+
+
+
+
+ + ☆ You Only Scan Once: A Dynamic Scene Reconstruction Pipeline for 6-DoF + Robotic Grasping of Novel Objects ICRA 2024 + + +
+ In the realm of robotic grasping, achieving accurate and reliable
+interactions with the environment is a pivotal challenge. Traditional grasp
+planning methods utilizing partial point clouds derived from depth images often
+suffer from reduced scene understanding due to occlusion, ultimately impeding
+their grasping accuracy. Furthermore, scene reconstruction methods have
+primarily relied upon static techniques, which are susceptible to environment
+changes during the manipulation process, limiting their efficacy in real-time
+grasping tasks. To address these limitations, this paper introduces a novel
+two-stage pipeline for dynamic scene reconstruction. In the first stage, our
+approach takes scene scanning as input to register each target object with mesh
+reconstruction and novel object pose tracking. In the second stage, pose
+tracking is still performed to provide object poses in real-time, enabling our
+approach to transform the reconstructed object point clouds back into the
+scene. Unlike conventional methodologies, which rely on static scene snapshots,
+our method continuously captures the evolving scene geometry, resulting in a
+comprehensive and up-to-date point cloud representation. By circumventing the
+constraints posed by occlusion, our method enhances the overall grasp planning
+process and empowers state-of-the-art 6-DoF robotic grasping algorithms to
+exhibit markedly improved accuracy.
+
+
+
+ comment: ICRA 2024 +
+
+
+
+
+ + ☆ Simultaneous State Estimation and Contact Detection for Legged Robots by + Multiple-Model Kalman Filtering + + +
+ This paper proposes an algorithm for combined contact detection and state
+estimation for legged robots. The proposed algorithm models the robot's
+movement as a switched system, in which different modes relate to different
+feet being in contact with the ground. The key element in the proposed
+algorithm is an interacting multiple-model Kalman filter, which identifies the
+currently-active mode defining contacts, while estimating the state. The
+rationale for the proposed estimation framework is that contacts (and contact
+forces) impact the robot's state and vice versa. This paper presents validation
+studies with a quadruped using (i) the high-fidelity simulator Gazebo for a
+comparison with ground truth values and a baseline estimator, and (ii) hardware
+experiments with the Unitree A1 robot. The simulation study shows that the
+proposed algorithm outperforms the baseline estimator, which does not
+simultaneously detect contacts. The hardware experiments showcase the
+applicability of the proposed algorithm and highlight its ability to detect
+contacts.
+
+
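+ The mode-probability update at the heart of an interacting multiple-model filter can be
+written in a few lines (a generic sketch, not the paper's implementation; here each mode is
+one contact configuration of the legs):
+
+import numpy as np
+
+def update_mode_probabilities(mu, Pi, likelihoods):
+    # mu: (M,) prior mode probabilities; Pi: (M, M) mode transition matrix (rows sum to 1);
+    # likelihoods: (M,) measurement likelihood from each mode's Kalman filter at this step.
+    predicted = Pi.T @ mu                  # mix the modes through the transition model
+    posterior = likelihoods * predicted    # weight by how well each contact mode explains
+    return posterior / posterior.sum()     # the measurement, then renormalize
+
+# e.g. two modes (foot in contact / in the air):
+# mu = update_mode_probabilities(np.array([0.7, 0.3]),
+#                                np.array([[0.95, 0.05], [0.10, 0.90]]),
+#                                np.array([0.2, 1.4]))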
+
+
+
+
+ + ☆ GMMCalib: Extrinsic Calibration of LiDAR Sensors using GMM-based Joint + Registration + + +
+ State-of-the-art LiDAR calibration frameworks mainly use non-probabilistic +registration methods such as Iterative Closest Point (ICP) and its variants. +These methods suffer from biased results due to their pair-wise registration +procedure as well as their sensitivity to initialization and parameterization. +This often leads to misalignments in the calibration process. Probabilistic +registration methods compensate for these drawbacks by specifically modeling +the probabilistic nature of the observations. This paper presents GMMCalib, an +automatic target-based extrinsic calibration approach for multi-LiDAR systems. +Using an implementation of a Gaussian Mixture Model (GMM)-based registration +method that allows joint registration of multiple point clouds, this +data-driven approach is compared to ICP algorithms. We perform simulation +experiments using the digital twin of the EDGAR research vehicle and validate +the results in a real-world environment. We also address the local minima +problem of local registration methods for extrinsic sensor calibration and use +a distance-based metric to evaluate the calibration results. Our results show +that an increase in robustness against sensor miscalibrations can be achieved +by using GMM-based registration algorithms. The code is open source and +available on GitHub. + +
+
+
+
+
+ + ☆ Future Predictive Success-or-Failure Classification for Long-Horizon + Robotic Tasks + + +
+ Automating long-horizon tasks with a robotic arm has been a central research +topic in robotics. Optimization-based action planning is an efficient approach +for creating an action plan to complete a given task. Construction of a +reliable planning method requires a design process of conditions, e.g., to +avoid collision between objects. The design process, however, has two critical +issues: 1) iterative trials--the design process is time-consuming due to the +trial-and-error process of modifying conditions, and 2) manual redesign--it is +difficult to cover all the necessary conditions manually. To tackle these +issues, this paper proposes a future-predictive +success-or-failure-classification method to obtain conditions automatically. +The key idea behind the proposed method is an end-to-end approach for +determining whether the action plan can complete a given task instead of +manually redesigning the conditions. The proposed method uses a long-horizon +future-prediction method to enable success-or-failure classification without +the execution of an action plan. This paper also proposes a regularization term +called transition consistency regularization to provide easy-to-predict feature +distribution. The regularization term improves future prediction and +classification performance. The effectiveness of our method is demonstrated +through classification and robotic-manipulation experiments. + +
+
+ comment: IJCNN 2024 +
+
+
+
+
+ + ☆ RADIUM: Predicting and Repairing End-to-End Robot Failures using + Gradient-Accelerated Sampling + + +
+ Before autonomous systems can be deployed in safety-critical applications, we +must be able to understand and verify the safety of these systems. For cases +where the risk or cost of real-world testing is prohibitive, we propose a +simulation-based framework for a) predicting ways in which an autonomous system +is likely to fail and b) automatically adjusting the system's design and +control policy to preemptively mitigate those failures. Existing tools for +failure prediction struggle to search over high-dimensional environmental +parameters, cannot efficiently handle end-to-end testing for systems with +vision in the loop, and provide little guidance on how to mitigate failures +once they are discovered. We approach this problem through the lens of +approximate Bayesian inference and use differentiable simulation and rendering +for efficient failure case prediction and repair. For cases where a +differentiable simulator is not available, we provide a gradient-free version +of our algorithm, and we include a theoretical and empirical evaluation of the +trade-offs between gradient-based and gradient-free methods. We apply our +approach on a range of robotics and control problems, including optimizing +search patterns for robot swarms, UAV formation control, and robust network +control. Compared to optimization-based falsification methods, our method +predicts a more diverse, representative set of failure modes, and we find that +our use of differentiable simulation yields solutions that have up to 10x lower +cost and requires up to 2x fewer iterations to converge relative to +gradient-free techniques. In hardware experiments, we find that repairing +control policies using our method leads to a 5x robustness improvement. +Accompanying code and video can be found at https://mit-realm.github.io/radium/ + +
+
+
+
+
+ + ☆ SENSOR: Imitate Third-Person Expert's Behaviors via Active Sensoring + + +
+ In many real-world visual Imitation Learning (IL) scenarios, there is a
+misalignment between the agent's and the expert's perspectives, which might
+lead to the failure of imitation. Previous methods have generally solved this
+problem by domain alignment, which incurs extra computation and storage costs,
+and these methods fail to handle the hard cases where the viewpoint gap is too
+large. To alleviate the above problems, we introduce active sensoring in the
+visual IL setting and propose a model-based SENSory imitatOR (SENSOR) to
+automatically change the agent's perspective to match the expert's. SENSOR
+jointly learns a world model to capture the dynamics of latent states, a sensor
+policy to control the camera, and a motor policy to control the agent.
+Experiments on visual locomotion tasks show that SENSOR can efficiently
+simulate the expert's perspective and strategy, and outperforms most baseline
+methods.
+
+
+
+
+
+
+ + ☆ Space Physiology and Technology: Musculoskeletal Adaptations, + Countermeasures, and the Opportunity for Wearable Robotics + + +
+ Space poses significant challenges for human physiology, leading to +physiological adaptations in response to an environment vastly different from +Earth. While these adaptations can be beneficial, they may not fully counteract +the adverse impact of space-related stressors. A comprehensive understanding of +these physiological adaptations is needed to devise effective countermeasures +to support human life in space. This review focuses on the impact of the +environment in space on the musculoskeletal system. It highlights the complex +interplay between bone and muscle adaptation, the underlying physiological +mechanisms, and their implications on astronaut health. Furthermore, the review +delves into the deployed and current advances in countermeasures and proposes, +as a perspective for future developments, wearable sensing and robotic +technologies, such as exoskeletons, as a fitting alternative. + +
+
+ comment: 23 pages (including references), 8 figures and 318 references +
+
+
+
+
+ + ☆ Scaling Population-Based Reinforcement Learning with GPU Accelerated + Simulation RA-L + + +
+ In recent years, deep reinforcement learning (RL) has shown its effectiveness
+in solving complex continuous control tasks like locomotion and dexterous
+manipulation. However, this comes at the cost of an enormous amount of
+experience required for training, exacerbated by the sensitivity of learning
+efficiency and the policy performance to hyperparameter selection, which often
+requires numerous trials of time-consuming experiments. This work introduces a
+Population-Based Reinforcement Learning (PBRL) approach that exploits a
+GPU-accelerated physics simulator to enhance the exploration capabilities of RL
+by concurrently training multiple policies in parallel. The PBRL framework is
+applied to three state-of-the-art RL algorithms -- PPO, SAC, and DDPG --
+dynamically adjusting hyperparameters based on the performance of learning
+agents. The experiments are performed on four challenging tasks in Isaac Gym --
+Anymal Terrain, Shadow Hand, Humanoid, Franka Nut Pick -- by analyzing the
+effect of population size and mutation mechanisms for hyperparameters. The
+results show that PBRL agents achieve superior performance, in terms of
+cumulative reward, compared to non-evolutionary baseline agents. The trained
+agents are finally deployed in the real world for a Franka Nut Pick task,
+demonstrating successful sim-to-real transfer. Code and videos of the learned
+policies are available on our project website.
+
+
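+ The population update follows the usual exploit-and-explore pattern, sketched below under
+the assumption of a truncation-selection scheme (the paper's exact mutation mechanism may
+differ):
+
+import copy
+import random
+
+def pbrl_step(population):
+    # population: list of dicts with keys "weights", "hyperparams", "reward".
+    ranked = sorted(population, key=lambda a: a["reward"], reverse=True)
+    cutoff = max(1, len(ranked) // 4)
+    for agent in ranked[-cutoff:]:                 # bottom quartile copies a top agent
+        donor = random.choice(ranked[:cutoff])
+        agent["weights"] = copy.deepcopy(donor["weights"])
+        agent["hyperparams"] = {k: v * random.choice([0.8, 1.2])   # perturb e.g. learning rate
+                                for k, v in donor["hyperparams"].items()}
+    return population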
+
+ comment: Submitted for publication to IEEE Robotics and Automation Letters + (RA-L) +
+
+
+
+
+ + ☆ Embodied Neuromorphic Artificial Intelligence for Robotics: + Perspectives, Challenges, and Research Development Stack + + +
+ Robotic technologies have been an indispensable part for improving human +productivity since they have been helping humans in completing diverse, +complex, and intensive tasks in a fast yet accurate and efficient way. +Therefore, robotic technologies have been deployed in a wide range of +applications, ranging from personal to industrial use-cases. However, current +robotic technologies and their computing paradigm still lack embodied +intelligence to efficiently interact with operational environments, respond +with correct/expected actions, and adapt to changes in the environments. Toward +this, recent advances in neuromorphic computing with Spiking Neural Networks +(SNN) have demonstrated the potential to enable the embodied intelligence for +robotics through bio-plausible computing paradigm that mimics how the +biological brain works, known as "neuromorphic artificial intelligence (AI)". +However, the field of neuromorphic AI-based robotics is still at an early +stage, therefore its development and deployment for solving real-world problems +expose new challenges in different design aspects, such as accuracy, +adaptability, efficiency, reliability, and security. To address these +challenges, this paper will discuss how we can enable embodied neuromorphic AI +for robotic systems through our perspectives: (P1) Embodied intelligence based +on effective learning rule, training mechanism, and adaptability; (P2) +Cross-layer optimizations for energy-efficient neuromorphic computing; (P3) +Representative and fair benchmarks; (P4) Low-cost reliability and safety +enhancements; (P5) Security and privacy for neuromorphic computing; and (P6) A +synergistic development for energy-efficient and robust neuromorphic-based +robotics. Furthermore, this paper identifies research challenges and +opportunities, as well as elaborates our vision for future research development +toward embodied neuromorphic AI for robotics. + +
+
+ comment: 8 pages, 9 figures, 1 table +
+
+
+
+
+ + ☆ Bi-level Trajectory Optimization on Uneven Terrains with Differentiable + Wheel-Terrain Interaction Model IROS 2024 + + +
+ Navigation of wheeled vehicles on uneven terrain necessitates going beyond +the 2D approaches for trajectory planning. Specifically, it is essential to +incorporate the full 6dof variation of vehicle pose and its associated +stability cost in the planning process. To this end, most recent works aim to +learn a neural network model to predict the vehicle evolution. However, such +approaches are data-intensive and fraught with generalization issues. In this +paper, we present a purely model-based approach that just requires the digital +elevation information of the terrain. Specifically, we express the +wheel-terrain interaction and 6dof pose prediction as a non-linear least +squares (NLS) problem. As a result, trajectory planning can be viewed as a +bi-level optimization. The inner optimization layer predicts the pose on the +terrain along a given trajectory, while the outer layer deforms the trajectory +itself to reduce the stability and kinematic costs of the pose. We improve the +state-of-the-art in the following respects. First, we show that our NLS based +pose prediction closely matches the output from a high-fidelity physics engine. +This result coupled with the fact that we can query gradients of the NLS +solver, makes our pose predictor, a differentiable wheel-terrain interaction +model. We further leverage this differentiability to efficiently solve the +proposed bi-level trajectory optimization problem. Finally, we perform +extensive experiments, and comparison with a baseline to showcase the +effectiveness of our approach in obtaining smooth, stable trajectories. + +
+
+ comment: 8 pages, 7 figures, submitted to IEEE/RSJ International Conference on + Intelligent Robots and Systems (IROS 2024) +
+
+
+
+
+ + ☆ DELTA: Decomposed Efficient Long-Term Robot Task Planning using Large + Language Models + + +
+ Recent advancements in Large Language Models (LLMs) have sparked a revolution +across various research fields. In particular, the integration of common-sense +knowledge from LLMs into robot task and motion planning has been proven to be a +game-changer, elevating performance in terms of explainability and downstream +task efficiency to unprecedented heights. However, managing the vast knowledge +encapsulated within these large models has posed challenges, often resulting in +infeasible plans generated by LLM-based planning systems due to hallucinations +or missing domain information. To overcome these challenges and obtain even +greater planning feasibility and computational efficiency, we propose a novel +LLM-driven task planning approach called DELTA. For achieving better grounding +from environmental topology into actionable knowledge, DELTA leverages the +power of scene graphs as environment representations within LLMs, enabling the +fast generation of precise planning problem descriptions. For obtaining higher +planning performance, we use LLMs to decompose the long-term task goals into an +autoregressive sequence of sub-goals for an automated task planner to solve. +Our contribution enables a more efficient and fully automatic task planning +pipeline, achieving higher planning success rates and significantly shorter +planning times compared to the state of the art. + +
+
+
+
+
+ + ☆ Traversability-aware Adaptive Optimization for Path Planning and Control + in Mountainous Terrain RA-L + + +
+ Autonomous navigation in extreme mountainous terrains poses challenges due to +the presence of mobility-stressing elements and undulating surfaces, making it +particularly difficult compared to conventional off-road driving scenarios. In +such environments, estimating traversability solely based on exteroceptive +sensors often leads to the inability to reach the goal due to a high prevalence +of non-traversable areas. In this paper, we consider traversability as a +relative value that integrates the robot's internal state, such as speed and +torque to exhibit resilient behavior to reach its goal successfully. We +separate traversability into apparent traversability and relative +traversability, then incorporate these distinctions in the optimization process +of sampling-based planning and motion predictive control. Our method enables +the robots to execute the desired behaviors more accurately while avoiding +hazardous regions and getting stuck. Experiments conducted on simulation with +27 diverse types of mountainous terrain and real-world demonstrate the +robustness of the proposed framework, with increasingly better performance +observed in more complex environments. + +
+
+ comment: 8 pages, 7 figures, accepted 2024 RA-L +
+
+
+
+
+ + ☆ Real-time Noise Source Estimation of a Camera System from an Image and + Metadata + + +
+ Autonomous machines must self-maintain proper functionality to ensure the
+safety of humans and themselves. This pertains particularly to their cameras,
+the predominant sensors for perceiving the environment and supporting actions.
+A fundamental camera problem addressed in this study is noise. Solutions often
+focus on denoising images a posteriori, that is, fighting symptoms rather than
+root causes. However, tackling root causes requires identifying the noise
+sources, considering the limitations of mobile platforms. This work
+investigates a real-time, memory-efficient and reliable noise source estimator
+that combines data- and physically-based models. To this end, a DNN that
+examines an image with camera metadata for major camera noise sources is built
+and trained. In addition, it quantifies unexpected factors that impact image
+noise or metadata. This study investigates seven different estimators on six
+datasets that include synthetic noise, real-world noise from two camera
+systems, and real field campaigns. Across these, only the model with the most
+metadata is capable of accurately and robustly quantifying all individual noise
+contributions. This method outperforms total image noise estimators and can be
+deployed in a plug-and-play fashion. It also serves as a basis to include more
+advanced noise sources, or as part of an automatic countermeasure feedback loop
+to approach fully reliable machines.
+
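For intuition only, here is a toy sketch of an estimator that fuses image features with camera metadata and regresses one magnitude per noise source; the architecture, the metadata fields, and the list of noise sources are illustrative assumptions, not the paper's network.

```python
# Toy noise-source estimator: a small CNN over an image patch, concatenated
# with a metadata vector (exposure, gain, temperature), regressing non-negative
# per-source noise magnitudes.
import torch
import torch.nn as nn

NOISE_SOURCES = ["photon_shot", "read", "dark_current", "quantization"]

class NoiseSourceEstimator(nn.Module):
    def __init__(self, n_meta=3, n_sources=len(NOISE_SOURCES)):
        super().__init__()
        self.backbone = nn.Sequential(          # image branch
            nn.Conv2d(1, 16, 3, stride=2, padding=1), nn.ReLU(),
            nn.Conv2d(16, 32, 3, stride=2, padding=1), nn.ReLU(),
            nn.AdaptiveAvgPool2d(1), nn.Flatten(),
        )
        self.head = nn.Sequential(              # fuse image features with metadata
            nn.Linear(32 + n_meta, 64), nn.ReLU(),
            nn.Linear(64, n_sources), nn.Softplus(),  # non-negative magnitudes
        )

    def forward(self, image, metadata):
        return self.head(torch.cat([self.backbone(image), metadata], dim=1))

model = NoiseSourceEstimator()
image = torch.rand(2, 1, 64, 64)                 # batch of grayscale patches
metadata = torch.tensor([[0.01, 4.0, 35.0],      # exposure [s], gain, temp [C]
                         [0.02, 8.0, 40.0]])
print(model(image, metadata).shape)              # -> torch.Size([2, 4])
```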
+
+ comment: 16 pages, 16 figures, 12 tables, Project page: + https://github.com/MaikWischow/Noise-Source-Estimation +
+
+
+
+
+ + ☆ Adaptive Discrete Disparity Volume for Self-supervised Monocular Depth + Estimation + + +
+ In self-supervised monocular depth estimation tasks, discrete disparity +prediction has been proven to attain higher quality depth maps than common +continuous methods. However, current discretization strategies often divide +depth ranges of scenes into bins in a handcrafted and rigid manner, limiting +model performance. In this paper, we propose a learnable module, Adaptive +Discrete Disparity Volume (ADDV), which is capable of dynamically sensing depth +distributions in different RGB images and generating adaptive bins for them. +Without any extra supervision, this module can be integrated into existing CNN +architectures, allowing networks to produce representative values for bins and +a probability volume over them. Furthermore, we introduce novel training +strategies - uniformizing and sharpening - through a loss term and temperature +parameter, respectively, to provide regularizations under self-supervised +conditions, preventing model degradation or collapse. Empirical results +demonstrate that ADDV effectively processes global information, generating +appropriate bins for various scenes and producing higher quality depth maps +compared to handcrafted methods. + +
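To make the bin-based readout concrete, the snippet below shows how a discrete disparity/depth volume is typically converted to a depth map: per-image bin centers plus a per-pixel probability volume, with depth taken as the probability-weighted sum of bin centers. The shapes, depth range, and softmax temperature are illustrative and do not reproduce the exact ADDV module.

```python
# Standard discrete-depth readout: adaptive bin centers + probability volume
# -> expected depth per pixel.
import torch

B, K, H, W = 2, 32, 48, 64          # batch, bins, height, width
d_min, d_max = 0.5, 80.0

# Adaptive bins: one set of K normalized widths per image (as a network output).
bin_logits = torch.randn(B, K)
widths = torch.softmax(bin_logits, dim=1) * (d_max - d_min)
edges = d_min + torch.cumsum(widths, dim=1)
centers = edges - 0.5 * widths                       # (B, K) per-image bin centers

# Probability volume over bins at every pixel; the temperature can sharpen or
# uniformize the distribution, echoing the training strategies mentioned above.
volume_logits = torch.randn(B, K, H, W)
temperature = 0.5
probs = torch.softmax(volume_logits / temperature, dim=1)

# Expected depth per pixel.
depth = (probs * centers.view(B, K, 1, 1)).sum(dim=1)
print(depth.shape, float(depth.min()), float(depth.max()))   # (B, H, W), within [d_min, d_max]
```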
+
+
+
+
+ + ☆ RAnGE: Reachability Analysis for Guaranteed Ergodicity + + +
+ This paper investigates performance guarantees on coverage-based ergodic +exploration methods in environments containing disturbances. Ergodic +exploration methods generate trajectories for autonomous robots such that time +spent in an area is proportional to the utility of exploring in the area. +However, providing formal performance guarantees for ergodic exploration +methods is still an open challenge due to the complexities in the problem +formulation. In this work, we propose to formulate ergodic search as a +differential game, in which a controller and external disturbance force seek to +minimize and maximize the ergodic metric, respectively. Through an +extended-state Bolza-form transform of the ergodic problem, we demonstrate it +is possible to use techniques from reachability analysis to solve for optimal +controllers that guarantee coverage and are robust against disturbances. Our +approach leverages neural-network based methods to obtain approximate value +function solutions for reachability problems that mitigate the increased +computational scaling due to the extended state. As a result, we are able to +compute continuous value functions for the ergodic exploration problem and +provide performance guarantees for coverage under disturbances. Simulated and +experimental results demonstrate the efficacy of our approach to generate +robust ergodic trajectories for search and exploration with external +disturbance force. + +
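For readers unfamiliar with the ergodic metric itself, here is a minimal 1D illustration under the standard ergodic-search formulation: the metric compares weighted Fourier coefficients of the trajectory's empirical distribution against those of a target utility distribution. The reachability and differential-game machinery of the paper is not shown, and the basis, weights, and sample data are illustrative.

```python
# Ergodic metric on [0, L]: weighted squared difference between cosine-basis
# coefficients of the trajectory's time-average and of the target distribution.
import numpy as np

L, K = 1.0, 10                                   # domain length, number of modes
ks = np.arange(K)

def basis(x, k):
    return np.cos(k * np.pi * x / L) / (np.sqrt(L / 2) if k > 0 else np.sqrt(L))

def coefficients(samples):
    return np.array([np.mean(basis(samples, k)) for k in ks])

def ergodic_metric(traj, target_samples):
    weights = 1.0 / (1.0 + ks ** 2)              # Sobolev-type weights (1D case)
    return np.sum(weights * (coefficients(traj) - coefficients(target_samples)) ** 2)

rng = np.random.default_rng(0)
target = np.clip(rng.normal(0.7, 0.1, 5000), 0, L)    # utility concentrated near 0.7
good_traj = np.clip(rng.normal(0.7, 0.1, 400), 0, L)  # spends time where utility is high
bad_traj = rng.uniform(0, L, 400)                     # ignores the target distribution
print(ergodic_metric(good_traj, target), "<", ergodic_metric(bad_traj, target))
```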
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ☆ Design and Evaluation of a Compact 3D End-effector Assistive Robot for + Adaptive Arm Support + + +
+ We developed a 3D end-effector type of upper limb assistive robot, named the
+Assistive Robotic Arm Extender (ARAE), which provides transparent movement and
+adaptive arm support control to achieve home-based therapy and training in real
+environments. The proposed system comprises five degrees of freedom, including
+three active motors and two passive joints at the end-effector module. The core
+structure of the system is based on a parallel mechanism. The kinematic and
+dynamic modeling is presented in detail. The proposed adaptive arm support
+control framework calculates the compensated force based on the estimated human
+arm posture in 3D space. It first estimates the human arm joint angles using
+two proposed methods, a fixed-torso model and a sagittal-plane model, without
+using external sensors such as IMUs, magnetic sensors, or depth cameras.
+Experiments were carried out to evaluate the performance of the two proposed
+angle estimation methods. Then, the estimated human joint angles were input
+into the human upper limb dynamics model to derive the required support force
+generated by the robot. Muscular activity was measured to evaluate the effects
+of the proposed framework, and a clear reduction in muscular activity was
+observed when participants were tested with the ARAE under the adaptive arm
+gravity compensation control framework. The overall results suggest that the
+ARAE system, when combined with the proposed control framework, has the
+potential to offer adaptive arm support. This integration could enable
+effective training with Activities of Daily Living (ADLs) and interaction with
+real environments.
+
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ A Framework for Guided Motion Planning + + +
+ Randomized sampling based algorithms are widely used in robot motion planning +due to the problem's intractability, and are experimentally effective on a wide +range of problem instances. Most variants bias their sampling using various +heuristics related to the known underlying structure of the search space. In +this work, we formalize the intuitive notion of guided search by defining the +concept of a guiding space. This new language encapsulates many seemingly +distinct prior methods under the same framework, and allows us to reason about +guidance, a previously obscured core contribution of different algorithms. We +suggest an information theoretic method to evaluate guidance, which +experimentally matches intuition when tested on known algorithms in a variety +of environments. The language and evaluation of guidance suggests improvements +to existing methods, and allows for simple hybrid algorithms that combine +guidance from multiple sources. + +
+
+
+
+
+ + ☆ Fast k-connectivity Restoration in Multi-Robot Systems for Robust + Communication Maintenance + + +
+ Maintaining a robust communication network plays an important role in the +success of a multi-robot team jointly performing an optimization task. A key +characteristic of a robust cooperative multi-robot system is the ability to +repair the communication topology in the case of robot failure. In this paper, +we focus on the Fast k-connectivity Restoration (FCR) problem, which aims to +repair a network to make it k-connected with minimum robot movement. We develop +a Quadratically Constrained Program (QCP) formulation of the FCR problem, which +provides a way to optimally solve the problem, but cannot handle large +instances due to high computational overhead. We therefore present a scalable +algorithm, called EA-SCR, for the FCR problem using graph theoretic concepts. +By conducting empirical studies, we demonstrate that the EA-SCR algorithm +performs within 10 percent of the optimal while being orders of magnitude +faster. We also show that EA-SCR outperforms existing solutions by 30 percent +in terms of the FCR distance metric. + +
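As supporting context for the problem statement, the snippet below builds the communication graph induced by robot positions and a communication radius and tests whether it is k-connected, the property the FCR problem restores. The QCP formulation and the EA-SCR algorithm themselves are not reproduced; the positions and radius are made-up examples.

```python
# Build the radius-induced communication graph and check k-connectivity.
import itertools
import math
import networkx as nx

def communication_graph(positions, comm_radius):
    g = nx.Graph()
    g.add_nodes_from(range(len(positions)))
    for i, j in itertools.combinations(range(len(positions)), 2):
        if math.dist(positions[i], positions[j]) <= comm_radius:
            g.add_edge(i, j)
    return g

def is_k_connected(positions, comm_radius, k):
    g = communication_graph(positions, comm_radius)
    return nx.is_connected(g) and nx.node_connectivity(g) >= k

robots = [(0, 0), (1, 0), (1, 1), (0, 1), (2, 1)]
print(is_k_connected(robots, comm_radius=1.5, k=2))   # True: no single robot failure disconnects it
print(is_k_connected(robots, comm_radius=1.1, k=2))   # False: robot 4 hangs off a single link
```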
+
+ comment: 17 pages, 6 figures, 3 algorithms. arXiv admin note: text overlap + with arXiv:2011.00685 +
+
+
+
+
+ + ☆ Accounting for Hysteresis in the Forward Kinematics of + Nonlinearly-Routed Tendon-Driven Continuum Robots via a Learned Deep Decoder + Network + + +
+ Tendon-driven continuum robots have been gaining popularity in medical +applications due to their ability to curve around complex anatomical +structures, potentially reducing the invasiveness of surgery. However, accurate +modeling is required to plan and control the movements of these flexible +robots. Physics-based models have limitations due to unmodeled effects, leading +to mismatches between model prediction and actual robot shape. Recently +proposed learning-based methods have been shown to overcome some of these +limitations but do not account for hysteresis, a significant source of error +for these robots. To overcome these challenges, we propose a novel deep decoder +neural network that predicts the complete shape of tendon-driven robots using +point clouds as the shape representation, conditioned on prior configurations +to account for hysteresis. We evaluate our method on a physical tendon-driven +robot and show that our network model accurately predicts the robot's shape, +significantly outperforming a state-of-the-art physics-based model and a +learning-based model that does not account for hysteresis. + +
+
+ comment: 8 pages, 9 figures, Submitted to IEEE Robotics and Automation Letters +
+
+
+
+
+ + ☆ A Bimanual Teleoperation Framework for Light Duty Underwater + Vehicle-Manipulator Systems + + +
+ In an effort to lower the barrier to entry in underwater manipulation, this +paper presents an open-source, user-friendly framework for bimanual +teleoperation of a light-duty underwater vehicle-manipulator system (UVMS). +This framework allows for the control of the vehicle along with two +manipulators and their end-effectors using two low-cost haptic devices. + The UVMS kinematics are derived in order to create an independent resolved +motion rate controller for each manipulator, which optimally controls the joint +positions to achieve a desired end-effector pose. This desired pose is computed +in real-time using a teleoperation controller developed to process the dual +haptic device input from the user. A physics-based simulation environment is +used to implement this framework for two example tasks as well as provide data +for error analysis of user commands. The first task illustrates the +functionality of the framework through motion control of the vehicle and +manipulators using only the haptic devices. The second task is to grasp an +object using both manipulators simultaneously, demonstrating precision and +coordination using the framework. The framework code is available at +https://github.com/stevens-armlab/uvms_bimanual_sim. + +
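Resolved motion rate control in general maps a desired end-effector velocity to joint velocities through a (damped) pseudoinverse of the manipulator Jacobian. The sketch below shows that generic step on a planar 3-link arm; the UVMS-specific kinematics and the haptic-device input processing from the paper are not shown, and the link lengths, gains, and target are illustrative.

```python
# Generic resolved-motion-rate step: q_dot = J^+ * x_dot_desired.
import numpy as np

LINKS = np.array([0.4, 0.3, 0.2])   # link lengths [m]

def forward_kinematics(q):
    angles = np.cumsum(q)
    return np.array([np.sum(LINKS * np.cos(angles)), np.sum(LINKS * np.sin(angles))])

def jacobian(q):
    angles = np.cumsum(q)
    J = np.zeros((2, 3))
    for i in range(3):
        # column i: effect of joint i on end-effector x, y
        J[0, i] = -np.sum(LINKS[i:] * np.sin(angles[i:]))
        J[1, i] = np.sum(LINKS[i:] * np.cos(angles[i:]))
    return J

def rmrc_step(q, x_desired, dt=0.01, gain=2.0, damping=1e-2):
    error = x_desired - forward_kinematics(q)
    J = jacobian(q)
    # Damped least-squares pseudoinverse for robustness near singularities.
    J_pinv = J.T @ np.linalg.inv(J @ J.T + damping * np.eye(2))
    return q + dt * (J_pinv @ (gain * error))

q = np.array([0.3, -0.2, 0.1])
target = np.array([0.55, 0.35])
for _ in range(500):
    q = rmrc_step(q, target)
print(np.round(forward_kinematics(q), 3))   # converges toward the target
```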
+
+ comment: 8 pages, 21st International Conference on Ubiquitous Robots (UR + 2024), accepted +
+
+
+
+
+ + ☆ A High-Fidelity Simulation Framework for Grasping Stability Analysis in + Human Casualty Manipulation RA-L + + +
+ Recently, there has been a growing interest in rescue robots due to their +vital role in addressing emergency scenarios and providing crucial support in +challenging or hazardous situations where human intervention is difficult. +However, very few of these robots are capable of actively engaging with humans +and undertaking physical manipulation tasks. This limitation is largely +attributed to the absence of tools that can realistically simulate physical +interactions, especially the contact mechanisms between a robotic gripper and a +human body. In this letter, we aim to address key limitations in current +developments towards robotic casualty manipulation. Firstly, we present an +integrative simulation framework for casualty manipulation. We adapt a finite +element method (FEM) tool into the grasping and manipulation scenario, and the +developed framework can provide accurate biomechanical reactions resulting from +manipulation. Secondly, we conduct a detailed assessment of grasping stability +during casualty grasping and manipulation simulations. To validate the +necessity and superior performance of the proposed high-fidelity simulation +framework, we conducted a qualitative and quantitative comparison of grasping +stability analyses between the proposed framework and the state-of-the-art +multi-body physics simulations. Through these efforts, we have taken the first +step towards a feasible solution for robotic casualty manipulation. + +
+
+ comment: 8 pages, revision submitted to IEEE RA-L, under review +
+
+
+
+
+ + ☆ Legible and Proactive Robot Planning for Prosocial Human-Robot + Interactions + + +
+ Humans have a remarkable ability to fluently engage in joint collision +avoidance in crowded navigation tasks despite the complexities and +uncertainties inherent in human behavior. Underlying these interactions is a +mutual understanding that (i) individuals are prosocial, that is, there is +equitable responsibility in avoiding collisions, and (ii) individuals should +behave legibly, that is, move in a way that clearly conveys their intent to +reduce ambiguity in how they intend to avoid others. Toward building robots +that can safely and seamlessly interact with humans, we propose a general robot +trajectory planning framework for synthesizing legible and proactive behaviors +and demonstrate that our robot planner naturally leads to prosocial +interactions. Specifically, we introduce the notion of a markup factor to +incentivize legible and proactive behaviors and an inconvenience budget +constraint to ensure equitable collision avoidance responsibility. We evaluate +our approach against well-established multi-agent planning algorithms and show +that using our approach produces safe, fluent, and prosocial interactions. We +demonstrate the real-time feasibility of our approach with human-in-the-loop +simulations. Project page can be found at https://uw-ctrl.github.io/phri/. + +
+
+ comment: Accepted to IEEE International Conference on Robotics and Automation + 2024 +
+
+
+
+
+ + ☆ JUICER: Data-Efficient Imitation Learning for Robotic Assembly + + +
+ While learning from demonstrations is powerful for acquiring visuomotor +policies, high-performance imitation without large demonstration datasets +remains challenging for tasks requiring precise, long-horizon manipulation. +This paper proposes a pipeline for improving imitation learning performance +with a small human demonstration budget. We apply our approach to assembly +tasks that require precisely grasping, reorienting, and inserting multiple +parts over long horizons and multiple task phases. Our pipeline combines +expressive policy architectures and various techniques for dataset expansion +and simulation-based data augmentation. These help expand dataset support and +supervise the model with locally corrective actions near bottleneck regions +requiring high precision. We demonstrate our pipeline on four furniture +assembly tasks in simulation, enabling a manipulator to assemble up to five +parts over nearly 2500 time steps directly from RGB images, outperforming +imitation and data augmentation baselines. + +
+
+
+
+
+ + ♻ ☆ Forming Large Patterns with Local Robots in the OBLOT Model + + +
+ In the arbitrary pattern formation problem, $n$ autonomous, mobile robots +must form an arbitrary pattern $P \subseteq \mathbb{R}^2$. The (deterministic) +robots are typically assumed to be indistinguishable, disoriented, and unable +to communicate. An important distinction is whether robots have memory and/or a +limited viewing range. Previous work managed to form $P$ under a natural +symmetry condition if robots have no memory but an unlimited viewing range [22] +or if robots have a limited viewing range but memory [25]. In the latter case, +$P$ is only formed in a shrunk version that has constant diameter. + Without memory and with limited viewing range, forming arbitrary patterns +remains an open problem. We provide a partial solution by showing that $P$ can +be formed under the same symmetry condition if the robots' initial diameter is +$\leq 1$. Our protocol partitions $P$ into rotation-symmetric components and +exploits the initial mutual visibility to form one cluster per component. Using +a careful placement of the clusters and their robots, we show that a cluster +can move in a coordinated way through its component while drawing $P$ by +dropping one robot per pattern coordinate. + +
+
+ comment: 24 pages, 3 figures, submitted for SAND 2024, version with extended + appendix +
+
+
+
+
+ + ♻ ☆ Enhancing Social Decision-Making of Autonomous Vehicles: A + Mixed-Strategy Game Approach With Interaction Orientation Identification + + +
+ The integration of Autonomous Vehicles (AVs) into existing human-driven +traffic systems poses considerable challenges, especially within environments +where human and machine interactions are frequent and complex, such as at +unsignalized intersections. To deal with these challenges, we introduce a novel +framework predicated on dynamic and socially-aware decision-making game theory +to augment the social decision-making prowess of AVs in mixed driving +environments. This comprehensive framework is delineated into three primary +modules: Interaction Orientation Identification, Mixed-Strategy Game Modeling, +and Expert Mode Learning. We introduce 'Interaction Orientation' as a metric to +evaluate the social decision-making tendencies of various agents, incorporating +both environmental factors and trajectory characteristics. The mixed-strategy +game model developed as part of this framework considers the evolution of +future traffic scenarios and includes a utility function that balances safety, +operational efficiency, and the unpredictability of environmental conditions. +To adapt to real-world driving complexities, our framework utilizes a dynamic +optimization framework for assimilating and learning from expert human driving +strategies. These strategies are compiled into a comprehensive strategy +library, serving as a reference for future decision-making processes. The +proposed approach is validated through extensive driving datasets and +human-in-loop driving experiments, and the results demonstrate marked +enhancements in decision timing and precision. + +
+
+
+
+
+ + ♻ ☆ Self-Aligning Depth-regularized Radiance Fields for Asynchronous RGB-D + Sequences + + +
+ It has been shown that learning radiance fields with depth rendering and +depth supervision can effectively promote the quality and convergence of view +synthesis. However, this paradigm requires input RGB-D sequences to be +synchronized, hindering its usage in the UAV city modeling scenario. As there +exists asynchrony between RGB images and depth images due to high-speed flight, +we propose a novel time-pose function, which is an implicit network that maps +timestamps to $\rm SE(3)$ elements. To simplify the training process, we also +design a joint optimization scheme to jointly learn the large-scale +depth-regularized radiance fields and the time-pose function. Our algorithm +consists of three steps: (1) time-pose function fitting, (2) radiance field +bootstrapping, (3) joint pose error compensation and radiance field refinement. +In addition, we propose a large synthetic dataset with diverse controlled +mismatches and ground truth to evaluate this new problem setting +systematically. Through extensive experiments, we demonstrate that our method +outperforms baselines without regularization. We also show qualitatively +improved results on a real-world asynchronous RGB-D sequence captured by drone. +Codes, data, and models will be made publicly available. + +
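As a sketch of what an implicit "time-pose function" can look like (the network size and SE(3) parameterization here are illustrative choices, not the paper's exact design, and the joint optimization with the radiance field is omitted), a small MLP maps a normalized timestamp to a translation plus an axis-angle rotation:

```python
# Implicit time-pose function: timestamp -> (translation, rotation matrix).
import torch
import torch.nn as nn

class TimePoseFunction(nn.Module):
    def __init__(self, hidden=128):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(1, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 6),              # 3 for translation, 3 for axis-angle
        )

    def forward(self, t):
        out = self.mlp(t)
        return out[..., :3], out[..., 3:]      # translation, axis-angle rotation

def axis_angle_to_matrix(aa, eps=1e-8):
    """Rodrigues' formula, batched: (N, 3) axis-angle -> (N, 3, 3) rotations."""
    theta = aa.norm(dim=-1, keepdim=True).clamp(min=eps)
    k = aa / theta
    K = torch.zeros(aa.shape[0], 3, 3)
    K[:, 0, 1], K[:, 0, 2] = -k[:, 2], k[:, 1]
    K[:, 1, 0], K[:, 1, 2] = k[:, 2], -k[:, 0]
    K[:, 2, 0], K[:, 2, 1] = -k[:, 1], k[:, 0]
    theta = theta.unsqueeze(-1)
    return torch.eye(3) + torch.sin(theta) * K + (1 - torch.cos(theta)) * (K @ K)

model = TimePoseFunction()
timestamps = torch.linspace(0, 1, 5).unsqueeze(-1)     # normalized capture times
translation, axis_angle = model(timestamps)
rotation = axis_angle_to_matrix(axis_angle)
print(translation.shape, rotation.shape)               # (5, 3), (5, 3, 3)
```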
+
+
+
+
+ + ♻ ☆ CARLOS: An Open, Modular, and Scalable Simulation Framework for the + Development and Testing of Software for C-ITS + + +
+ Future mobility systems and their components are increasingly defined by
+their software. The complexity of these cooperative intelligent transport
+systems (C-ITS) and the ever-changing requirements placed on the software
+require continual software updates. The dynamic nature of the system and the
+practically innumerable scenarios in which different software components work
+together necessitate efficient and automated development and testing procedures
+that use simulations as one core methodology. The availability of such
+simulation architectures is a common interest among many stakeholders,
+especially in the field of automated driving. That is why we propose CARLOS -
+an open, modular, and scalable simulation framework for the development and
+testing of software in C-ITS that leverages the rich CARLA and ROS ecosystems.
+We provide core building blocks for this framework and explain how it can be
+used and extended by the community. Its architecture builds upon modern
+microservice and DevOps principles such as containerization and continuous
+integration. In our paper, we motivate the architecture by describing important
+design principles and showcasing three major use cases - software prototyping,
+data-driven development, and automated testing. We make CARLOS and example
+implementations of the three use cases publicly available at
+github.com/ika-rwth-aachen/carlos
+
+
+ comment: 7 pages, 5 figures, 1 table; Accepted to be published as part of the + 35th IEEE Intelligent Vehicles Symposium (IV), Jeju Island, Korea, June 2-5, + 2024 +
+
+
+
+
+ + ♻ ☆ Exploiting Contextual Structure to Generate Useful Auxiliary Tasks + + +
+ Reinforcement learning requires interaction with an environment, which is +expensive for robots. This constraint necessitates approaches that work with +limited environmental interaction by maximizing the reuse of previous +experiences. We propose an approach that maximizes experience reuse while +learning to solve a given task by generating and simultaneously learning useful +auxiliary tasks. To generate these tasks, we construct an abstract temporal +logic representation of the given task and leverage large language models to +generate context-aware object embeddings that facilitate object replacements. +Counterfactual reasoning and off-policy methods allow us to simultaneously +learn these auxiliary tasks while solving the given target task. We combine +these insights into a novel framework for multitask reinforcement learning and +experimentally show that our generated auxiliary tasks share similar underlying +exploration requirements as the given task, thereby maximizing the utility of +directed exploration. Our approach allows agents to automatically learn +additional useful policies without extra environment interaction. + +
+
+
+
+
+ + ♻ ☆ DeepIPC: Deeply Integrated Perception and Control for an Autonomous + Vehicle in Real Environments + + +
+ In this work, we introduce DeepIPC, a novel end-to-end model tailored for +autonomous driving, which seamlessly integrates perception and control tasks. +Unlike traditional models that handle these tasks separately, DeepIPC +innovatively combines a perception module, which processes RGBD images for +semantic segmentation and generates bird's eye view (BEV) mappings, with a +controller module that utilizes these insights along with GNSS and angular +speed measurements to accurately predict navigational waypoints. This +integration allows DeepIPC to efficiently translate complex environmental data +into actionable driving commands. Our comprehensive evaluation demonstrates +DeepIPC's superior performance in terms of drivability and multi-task +efficiency across diverse real-world scenarios, setting a new benchmark for +end-to-end autonomous driving systems with a leaner model architecture. The +experimental results underscore DeepIPC's potential to significantly enhance +autonomous vehicular navigation, promising a step forward in the development of +autonomous driving technologies. For further insights and replication, we will +make our code and datasets available at https://github.com/oskarnatan/DeepIPC. + +
+
+ comment: Accepted for Publication in IEEE Access +
+
+
+
+
+ + ♻ ☆ DeepIPCv2: LiDAR-powered Robust Environmental Perception and + Navigational Control for Autonomous Vehicle + + +
+ We present DeepIPCv2, an autonomous driving model that perceives the +environment using a LiDAR sensor for more robust drivability, especially when +driving under poor illumination conditions where everything is not clearly +visible. DeepIPCv2 takes a set of LiDAR point clouds as the main perception +input. Since point clouds are not affected by illumination changes, they can +provide a clear observation of the surroundings no matter what the condition +is. This results in a better scene understanding and stable features provided +by the perception module to support the controller module in estimating +navigational control properly. To evaluate its performance, we conduct several +tests by deploying the model to predict a set of driving records and perform +real automated driving under three different conditions. We also conduct +ablation and comparative studies with some recent models to justify its +performance. Based on the experimental results, DeepIPCv2 shows a robust +performance by achieving the best drivability in all driving scenarios. +Furthermore, to support future research, we will upload the codes and data to +https://github.com/oskarnatan/DeepIPCv2. + +
+
+
+
+
+ + ♻ ☆ Learning Generalizable Tool-use Skills through Trajectory Generation + + +
+ Autonomous systems that efficiently utilize tools can assist humans in
+completing many common tasks such as cooking and cleaning. However, current
+systems fall short of matching human-level intelligence in terms of adapting
+to novel tools. Prior works based on affordances often make strong assumptions
+about the environments and cannot scale to more complex, contact-rich tasks. In
+this work, we tackle this challenge and explore how agents can learn to use
+previously unseen tools to manipulate deformable objects. We propose to learn a
+generative model of the tool-use trajectories as a sequence of tool point
+clouds, which generalizes to different tool shapes. Given any novel tool, we
+first generate a tool-use trajectory and then optimize the sequence of tool
+poses to align with the generated trajectory. We train a single model on four
+different challenging deformable object manipulation tasks, using demonstration
+data from only one tool per task. The model generalizes to various novel tools,
+significantly outperforming baselines. We further test our trained policy in
+the real world with unseen tools, where it achieves performance comparable to
+humans. Additional materials can be found on our project website:
+https://sites.google.com/view/toolgen.
+
+
+
+
+
+ + ♻ ☆ Real-time Control of Electric Autonomous Mobility-on-Demand Systems via + Graph Reinforcement Learning + + +
+ Operators of Electric Autonomous Mobility-on-Demand (E-AMoD) fleets need to +make several real-time decisions such as matching available vehicles to ride +requests, rebalancing idle vehicles to areas of high demand, and charging +vehicles to ensure sufficient range. While this problem can be posed as a +linear program that optimizes flows over a space-charge-time graph, the size of +the resulting optimization problem does not allow for real-time implementation +in realistic settings. In this work, we present the E-AMoD control problem +through the lens of reinforcement learning and propose a graph network-based +framework to achieve drastically improved scalability and superior performance +over heuristics. Specifically, we adopt a bi-level formulation where we (1) +leverage a graph network-based RL agent to specify a desired next state in the +space-charge graph, and (2) solve more tractable linear programs to best +achieve the desired state while ensuring feasibility. Experiments using +real-world data from San Francisco and New York City show that our approach +achieves up to 89% of the profits of the theoretically-optimal solution while +achieving more than a 100x speedup in computational time. We further highlight +promising zero-shot transfer capabilities of our learned policy on tasks such +as inter-city generalization and service area expansion, thus showing the +utility, scalability, and flexibility of our framework. Finally, our approach +outperforms the best domain-specific heuristics with comparable runtimes, with +an increase in profits by up to 3.2x. + +
+
+ comment: 9 pages, revised SF travel data, includes additional experimental + results, content and clarification revisions per reviewer feedback, and typo + fixes +
+
+
+
+
+ + ♻ ☆ World-Model-Based Control for Industrial box-packing of Multiple Objects + using NewtonianVAE + + +
+ The process of industrial box-packing, which involves the accurate placement
+of multiple objects, requires high-accuracy positioning and sequential actions.
+When a robot is tasked with placing an object at a specific location with high
+accuracy, it is important not only to have information about the location of
+the object to be placed, but also about the posture of the object grasped by
+the robotic hand. Often, industrial box-packing requires the sequential
+placement of identically shaped objects into a single box, and the robot's
+actions for these placements should be determined by the same learned model. In
+factories, new kinds of products often appear and there is a need for a model
+that can easily adapt to them. Therefore, it should be easy to collect data to
+train the model. In this study, we designed a robotic system to automate
+real-world industrial tasks, employing a vision-based learning control model.
+We propose the in-hand-view-sensitive Newtonian variational autoencoder
+(ihVS-NVAE), which employs an RGB camera to obtain in-hand postures of objects.
+We demonstrate that our model, trained for a single object-placement task, can
+handle sequential tasks without additional training. To evaluate the efficacy
+of the proposed model, we employed a real robot to perform sequential
+industrial box-packing of multiple objects. Results showed that the proposed
+model achieved a 100% success rate in industrial box-packing tasks, thereby
+outperforming the state-of-the-art and conventional approaches, underscoring
+its superior effectiveness and potential in industrial tasks.
+
+
+ comment: 7 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Expectable Motion Unit: Avoiding Hazards From Human Involuntary Motions + in Human-Robot Interaction + + +
+ In robotics, many control and planning schemes have been developed to ensure
+human physical safety in human-robot interaction. The human psychological
+state and the expectation towards the robot, however, are typically neglected.
+Even if the robot behaviour is regarded as biomechanically safe, humans may
+still react with a rapid involuntary motion (IM) caused by a startle or
+surprise. Such sudden, uncontrolled motions can jeopardize safety and should be
+prevented by any means. In this letter, we propose the Expectable Motion Unit
+(EMU), which ensures that a certain probability of IM occurrence is not
+exceeded in a typical HRI setting. Based on a model of IM occurrence generated
+through an experiment with 29 participants, we establish the mapping between
+robot velocity, robot-human distance, and the relative frequency of IM
+occurrence. This mapping is processed into a real-time-capable robot motion
+generator that limits the robot velocity during task execution if necessary.
+The EMU is embedded in a holistic safety framework that integrates both
+physical and psychological safety knowledge. A validation experiment showed
+that the EMU successfully avoids human IM in five out of six cases.
+
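To illustrate the velocity-limiting idea in isolation: given some model of IM probability as a function of robot speed and human-robot distance, one can search for the largest speed whose predicted probability stays below a threshold. The logistic model and its coefficients below are entirely made up; the paper fits its mapping from a 29-participant experiment.

```python
# Find the largest commanded speed whose (hypothetical) IM probability stays
# below a chosen threshold, via bisection on a monotone risk model.
import math

def im_probability(speed, distance, a=3.0, b=-2.5, c=-1.0):
    """Placeholder logistic model: IM risk grows with speed, shrinks with distance."""
    return 1.0 / (1.0 + math.exp(-(a * speed + b * distance + c)))

def max_safe_speed(distance, p_max=0.1, v_max=2.0, tol=1e-4):
    """Largest speed in [0, v_max] with IM probability <= p_max."""
    if im_probability(v_max, distance) <= p_max:
        return v_max
    lo, hi = 0.0, v_max
    while hi - lo > tol:
        mid = 0.5 * (lo + hi)
        lo, hi = (mid, hi) if im_probability(mid, distance) <= p_max else (lo, mid)
    return lo

for d in (0.5, 1.0, 1.5):
    print(f"distance {d:.1f} m -> speed limit {max_safe_speed(d):.2f} m/s")
```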
+
+
+
+
+ + ♻ ☆ Learning Optimal Topology for Ad-hoc Robot Networks + + +
+ In this paper, we synthesize a data-driven method to predict the optimal
+topology of an ad-hoc robot network. This problem is technically a multi-task
+classification problem. However, we divide it into a class of multi-class
+classification problems that can be more efficiently solved. For this purpose,
+we first compose an algorithm to create ground-truth optimal topologies
+associated with various configurations of a robot network. This algorithm
+incorporates a complex collection of optimality criteria that our learning
+model successfully manages to learn. This model is a stacked ensemble whose
+output is the topology prediction for a particular robot. Each stacked-ensemble
+instance consists of three low-level estimators whose outputs are aggregated by
+a high-level boosting blender. Applying our model to a network of 10 robots
+yields over 80% accuracy in predicting the optimal topologies corresponding to
+various configurations of the cited network.
+
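One generic way to realize "three low-level estimators aggregated by a boosting blender" is scikit-learn's `StackingClassifier`; the particular estimators, hyperparameters, and the synthetic data below are placeholders, not necessarily the paper's configuration.

```python
# Stacked ensemble: three base classifiers blended by a gradient-boosting meta-learner.
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Stand-in for (network-configuration features -> optimal-topology class) data.
X, y = make_classification(n_samples=2000, n_features=20, n_classes=4,
                           n_informative=8, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

low_level = [
    ("logreg", LogisticRegression(max_iter=2000)),
    ("forest", RandomForestClassifier(n_estimators=100, random_state=0)),
    ("knn", KNeighborsClassifier(n_neighbors=7)),
]
model = StackingClassifier(
    estimators=low_level,
    final_estimator=GradientBoostingClassifier(random_state=0),  # the boosting "blender"
)
model.fit(X_train, y_train)
print(f"test accuracy: {model.score(X_test, y_test):.2f}")
```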
+
+ comment: This version is the one published in IEEE Robotics and Automation + Letters +
+
+
+
+
+ + ♻ ☆ An Anomaly Behavior Analysis Framework for Securing Autonomous Vehicle + Perception + + +
+ As a rapidly growing cyber-physical platform, Autonomous Vehicles (AVs) are +encountering more security challenges as their capabilities continue to expand. +In recent years, adversaries are actively targeting the perception sensors of +autonomous vehicles with sophisticated attacks that are not easily detected by +the vehicles' control systems. This work proposes an Anomaly Behavior Analysis +approach to detect a perception sensor attack against an autonomous vehicle. +The framework relies on temporal features extracted from a physics-based +autonomous vehicle behavior model to capture the normal behavior of vehicular +perception in autonomous driving. By employing a combination of model-based +techniques and machine learning algorithms, the proposed framework +distinguishes between normal and abnormal vehicular perception behavior. To +demonstrate the application of the framework in practice, we performed a depth +camera attack experiment on an autonomous vehicle testbed and generated an +extensive dataset. We validated the effectiveness of the proposed framework +using this real-world data and released the dataset for public access. To our +knowledge, this dataset is the first of its kind and will serve as a valuable +resource for the research community in evaluating their intrusion detection +techniques effectively. + +
+
+ comment: 20th ACS/IEEE International Conference on Computer Systems and + Applications (IEEE AICCSA 2023) +
+
+
+
+
+ + ♻ ☆ Programmatic Imitation Learning from Unlabeled and Noisy Demonstrations + + +
+ Imitation Learning (IL) is a promising paradigm for teaching robots to +perform novel tasks using demonstrations. Most existing approaches for IL +utilize neural networks (NN), however, these methods suffer from several +well-known limitations: they 1) require large amounts of training data, 2) are +hard to interpret, and 3) are hard to repair and adapt. There is an emerging +interest in programmatic imitation learning (PIL), which offers significant +promise in addressing the above limitations. In PIL, the learned policy is +represented in a programming language, making it amenable to interpretation and +repair. However, state-of-the-art PIL algorithms assume access to action labels +and struggle to learn from noisy real-world demonstrations. In this paper, we +propose PLUNDER, a novel PIL algorithm that integrates a probabilistic program +synthesizer in an iterative Expectation-Maximization (EM) framework to address +these shortcomings. Unlike existing PIL approaches, PLUNDER synthesizes +probabilistic programmatic policies that are particularly well-suited for +modeling the uncertainties inherent in real-world demonstrations. Our approach +leverages an EM loop to simultaneously infer the missing action labels and the +most likely probabilistic policy. We benchmark PLUNDER against several +established IL techniques, and demonstrate its superiority across five +challenging imitation learning tasks under noise. PLUNDER policies achieve 95% +accuracy in matching the given demonstrations, outperforming the next best +baseline by 19%. Additionally, policies generated by PLUNDER successfully +complete the tasks 17% more frequently than the nearest baseline. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 140 + +
+
+
+ + ☆ Know Your Neighbors: Improving Single-View Reconstruction via Spatial + Vision-Language Reasoning CVPR 2024 + + +
+ Recovering the 3D scene geometry from a single view is a fundamental yet +ill-posed problem in computer vision. While classical depth estimation methods +infer only a 2.5D scene representation limited to the image plane, recent +approaches based on radiance fields reconstruct a full 3D representation. +However, these methods still struggle with occluded regions since inferring +geometry without visual observation requires (i) semantic knowledge of the +surroundings, and (ii) reasoning about spatial context. We propose KYN, a novel +method for single-view scene reconstruction that reasons about semantic and +spatial context to predict each point's density. We introduce a vision-language +modulation module to enrich point features with fine-grained semantic +information. We aggregate point representations across the scene through a +language-guided spatial attention mechanism to yield per-point density +predictions aware of the 3D semantic context. We show that KYN improves 3D +shape recovery compared to predicting density for each 3D point in isolation. +We achieve state-of-the-art results in scene and object reconstruction on +KITTI-360, and show improved zero-shot generalization compared to prior work. +Project page: https://ruili3.github.io/kyn. + +
+
+ comment: CVPR 2024. Project page: https://ruili3.github.io/kyn +
+
+
+
+
+ + ☆ OW-VISCap: Open-World Video Instance Segmentation and Captioning + + +
+ Open-world video instance segmentation is an important video understanding +task. Yet most methods either operate in a closed-world setting, require an +additional user-input, or use classic region-based proposals to identify never +before seen objects. Further, these methods only assign a one-word label to +detected objects, and don't generate rich object-centric descriptions. They +also often suffer from highly overlapping predictions. To address these issues, +we propose Open-World Video Instance Segmentation and Captioning (OW-VISCap), +an approach to jointly segment, track, and caption previously seen or unseen +objects in a video. For this, we introduce open-world object queries to +discover never before seen objects without additional user-input. We generate +rich and descriptive object-centric captions for each detected object via a +masked attention augmented LLM input. We introduce an inter-query contrastive +loss to ensure that the object queries differ from one another. Our generalized +approach matches or surpasses state-of-the-art on three tasks: open-world video +instance segmentation on the BURST dataset, dense video object captioning on +the VidSTG dataset, and closed-world video instance segmentation on the OVIS +dataset. + +
+
+ comment: Project page: https://anwesachoudhuri.github.io/OpenWorldVISCap/ +
+
+
+
+
+ + ☆ MVD-Fusion: Single-view 3D via Depth-consistent Multi-view Generation + + +
+ We present MVD-Fusion: a method for single-view 3D inference via generative
+modeling of multi-view-consistent RGB-D images. While recent methods pursuing
+3D inference advocate learning novel-view generative models, these generations
+are not 3D-consistent and require a distillation process to generate a 3D
+output. We instead cast the task of 3D inference as directly generating
+mutually-consistent multiple views and build on the insight that additionally
+inferring depth can provide a mechanism for enforcing this consistency.
+Specifically, we train a denoising diffusion model to generate multi-view RGB-D
+images given a single RGB input image and leverage the (intermediate noisy)
+depth estimates to obtain reprojection-based conditioning to maintain
+multi-view consistency. We train our model using the large-scale synthetic
+dataset Objaverse as well as the real-world CO3D dataset comprising generic
+camera viewpoints. We demonstrate that our approach can yield more accurate
+synthesis compared to the recent state of the art, including distillation-based
+3D inference and prior multi-view generation methods. We also evaluate the
+geometry induced by our multi-view depth prediction and find that it yields a
+more accurate representation than other direct 3D inference approaches.
+
+
+ comment: Project page: https://mvd-fusion.github.io/ +
+
+
+
+
+ + ☆ RaFE: Generative Radiance Fields Restoration + + +
+ NeRF (Neural Radiance Fields) has demonstrated tremendous potential in novel +view synthesis and 3D reconstruction, but its performance is sensitive to input +image quality, which struggles to achieve high-fidelity rendering when provided +with low-quality sparse input viewpoints. Previous methods for NeRF restoration +are tailored for specific degradation type, ignoring the generality of +restoration. To overcome this limitation, we propose a generic radiance fields +restoration pipeline, named RaFE, which applies to various types of +degradations, such as low resolution, blurriness, noise, compression artifacts, +or their combinations. Our approach leverages the success of off-the-shelf 2D +restoration methods to recover the multi-view images individually. Instead of +reconstructing a blurred NeRF by averaging inconsistencies, we introduce a +novel approach using Generative Adversarial Networks (GANs) for NeRF generation +to better accommodate the geometric and appearance inconsistencies present in +the multi-view images. Specifically, we adopt a two-level tri-plane +architecture, where the coarse level remains fixed to represent the low-quality +NeRF, and a fine-level residual tri-plane to be added to the coarse level is +modeled as a distribution with GAN to capture potential variations in +restoration. We validate RaFE on both synthetic and real cases for various +restoration tasks, demonstrating superior performance in both quantitative and +qualitative evaluations, surpassing other 3D restoration methods specific to +single task. Please see our project website +https://zkaiwu.github.io/RaFE-Project/. + +
+
+ comment: Project Page: https://zkaiwu.github.io/RaFE-Project/ +
+
+
+
+
+ + ☆ CoMat: Aligning Text-to-Image Diffusion Model with Image-to-Text Concept + Matching + + +
+ Diffusion models have demonstrated great success in the field of
+text-to-image generation. However, alleviating the misalignment between the
+text prompts and images is still challenging. The root cause of the
+misalignment has not been extensively investigated. We observe that the
+misalignment is caused by inadequate token attention activation. We further
+attribute this phenomenon to the diffusion model's insufficient condition
+utilization, which is caused by its training paradigm. To address the issue, we
+propose CoMat, an end-to-end diffusion model fine-tuning strategy with an
+image-to-text concept matching mechanism. We leverage an image captioning model
+to measure image-to-text alignment and guide the diffusion model to revisit
+ignored tokens. A novel attribute concentration module is also proposed to
+address the attribute binding problem. Without any image or human preference
+data, we use only 20K text prompts to fine-tune SDXL to obtain CoMat-SDXL.
+Extensive experiments show that CoMat-SDXL significantly outperforms the
+baseline model SDXL in two text-to-image alignment benchmarks and achieves
+state-of-the-art performance.
+
+
+ comment: Project Page: https://caraj7.github.io/comat +
+
+
+
+
+ + ☆ The More You See in 2D, the More You Perceive in 3D + + +
+ Humans can infer 3D structure from 2D images of an object based on past +experience and improve their 3D understanding as they see more images. Inspired +by this behavior, we introduce SAP3D, a system for 3D reconstruction and novel +view synthesis from an arbitrary number of unposed images. Given a few unposed +images of an object, we adapt a pre-trained view-conditioned diffusion model +together with the camera poses of the images via test-time fine-tuning. The +adapted diffusion model and the obtained camera poses are then utilized as +instance-specific priors for 3D reconstruction and novel view synthesis. We +show that as the number of input images increases, the performance of our +approach improves, bridging the gap between optimization-based prior-less 3D +reconstruction methods and single-image-to-3D diffusion-based methods. We +demonstrate our system on real images as well as standard synthetic benchmarks. +Our ablation studies confirm that this adaption behavior is key for more +accurate 3D understanding. + +
+
+ comment: Project page: https://sap3d.github.io/ +
+
+
+
+
+ + ☆ OpenNeRF: Open Set 3D Neural Scene Segmentation with Pixel-Wise Features + and Rendered Novel Views ICLR 2024 + + +
+ Large visual-language models (VLMs), like CLIP, enable open-set image +segmentation to segment arbitrary concepts from an image in a zero-shot manner. +This goes beyond the traditional closed-set assumption, i.e., where models can +only segment classes from a pre-defined training set. More recently, first +works on open-set segmentation in 3D scenes have appeared in the literature. +These methods are heavily influenced by closed-set 3D convolutional approaches +that process point clouds or polygon meshes. However, these 3D scene +representations do not align well with the image-based nature of the +visual-language models. Indeed, point cloud and 3D meshes typically have a +lower resolution than images and the reconstructed 3D scene geometry might not +project well to the underlying 2D image sequences used to compute pixel-aligned +CLIP features. To address these challenges, we propose OpenNeRF which naturally +operates on posed images and directly encodes the VLM features within the NeRF. +This is similar in spirit to LERF, however our work shows that using pixel-wise +VLM features (instead of global CLIP features) results in an overall less +complex architecture without the need for additional DINO regularization. Our +OpenNeRF further leverages NeRF's ability to render novel views and extract +open-set VLM features from areas that are not well observed in the initial +posed images. For 3D point cloud segmentation on the Replica dataset, OpenNeRF +outperforms recent open-vocabulary methods such as LERF and OpenScene by at +least +4.9 mIoU. + +
+
+ comment: ICLR 2024, Project page: https://opennerf.github.io +
+
+
+
+
+ + ☆ Decoupling Static and Hierarchical Motion Perception for Referring Video + Segmentation CVPR 2024 + + +
+ Referring video segmentation relies on natural language expressions to +identify and segment objects, often emphasizing motion clues. Previous works +treat a sentence as a whole and directly perform identification at the +video-level, mixing up static image-level cues with temporal motion cues. +However, image-level features cannot well comprehend motion cues in sentences, +and static cues are not crucial for temporal perception. In fact, static cues +can sometimes interfere with temporal perception by overshadowing motion cues. +In this work, we propose to decouple video-level referring expression +understanding into static and motion perception, with a specific emphasis on +enhancing temporal comprehension. Firstly, we introduce an +expression-decoupling module to make static cues and motion cues perform their +distinct role, alleviating the issue of sentence embeddings overlooking motion +cues. Secondly, we propose a hierarchical motion perception module to capture +temporal information effectively across varying timescales. Furthermore, we +employ contrastive learning to distinguish the motions of visually similar +objects. These contributions yield state-of-the-art performance across five +datasets, including a remarkable $\textbf{9.2%}$ $\mathcal{J\&F}$ improvement +on the challenging $\textbf{MeViS}$ dataset. Code is available at +https://github.com/heshuting555/DsHmp. + +
+
+ comment: CVPR 2024, code: https://github.com/heshuting555/DsHmp +
+
+
+
+
+ + ☆ DiffBody: Human Body Restoration by Imagining with Generative Diffusion + Prior + + +
+ Human body restoration plays a vital role in various applications related to +the human body. Despite recent advances in general image restoration using +generative models, their performance in human body restoration remains +mediocre, often resulting in foreground and background blending, over-smoothing +surface textures, missing accessories, and distorted limbs. Addressing these +challenges, we propose a novel approach by constructing a human body-aware +diffusion model that leverages domain-specific knowledge to enhance +performance. Specifically, we employ a pretrained body attention module to +guide the diffusion model's focus on the foreground, addressing issues caused +by blending between the subject and background. We also demonstrate the value +of revisiting the language modality of the diffusion model in restoration tasks +by seamlessly incorporating text prompt to improve the quality of surface +texture and additional clothing and accessories details. Additionally, we +introduce a diffusion sampler tailored for fine-grained human body parts, +utilizing local semantic information to rectify limb distortions. Lastly, we +collect a comprehensive dataset for benchmarking and advancing the field of +human body restoration. Extensive experimental validation showcases the +superiority of our approach, both quantitatively and qualitatively, over +existing methods. + +
+
+
+
+
+ + ☆ WorDepth: Variational Language Prior for Monocular Depth Estimation + + +
+ Three-dimensional (3D) reconstruction from a single image is an ill-posed +problem with inherent ambiguities, i.e. scale. Predicting a 3D scene from text +description(s) is similarly ill-posed, i.e. spatial arrangements of objects +described. We investigate the question of whether two inherently ambiguous +modalities can be used in conjunction to produce metric-scaled reconstructions. +To test this, we focus on monocular depth estimation, the problem of predicting +a dense depth map from a single image, but with an additional text caption +describing the scene. To this end, we begin by encoding the text caption as a +mean and standard deviation; using a variational framework, we learn the +distribution of the plausible metric reconstructions of 3D scenes corresponding +to the text captions as a prior. To "select" a specific reconstruction or depth +map, we encode the given image through a conditional sampler that samples from +the latent space of the variational text encoder, which is then decoded to the +output depth map. Our approach is trained alternatingly between the text and +image branches: in one optimization step, we predict the mean and standard +deviation from the text description and sample from a standard Gaussian, and in +the other, we sample using a (image) conditional sampler. Once trained, we +directly predict depth from the encoded text using the conditional sampler. We +demonstrate our approach on indoor (NYUv2) and outdoor (KITTI) scenarios, where +we show that language can consistently improve performance in both. + +
+
+
+
+
+ + ☆ PreAfford: Universal Affordance-Based Pre-Grasping for Diverse Objects + and Environments + + +
+ Robotic manipulation of ungraspable objects with two-finger grippers presents +significant challenges due to the paucity of graspable features, while +traditional pre-grasping techniques, which rely on repositioning objects and +leveraging external aids like table edges, lack the adaptability across object +categories and scenes. Addressing this, we introduce PreAfford, a novel +pre-grasping planning framework that utilizes a point-level affordance +representation and a relay training approach to enhance adaptability across a +broad range of environments and object types, including those previously +unseen. Demonstrated on the ShapeNet-v2 dataset, PreAfford significantly +improves grasping success rates by 69% and validates its practicality through +real-world experiments. This work offers a robust and adaptable solution for +manipulating ungraspable objects. + +
+
+ comment: Project Page: https://air-discover.github.io/PreAfford/ +
+
+
+
+
+ + ☆ Reference-Based 3D-Aware Image Editing with Triplane + + +
+ Generative Adversarial Networks (GANs) have emerged as powerful tools not +only for high-quality image generation but also for real image editing through +manipulation of their interpretable latent spaces. Recent advancements in GANs +include the development of 3D-aware models such as EG3D, characterized by +efficient triplane-based architectures enabling the reconstruction of 3D +geometry from single images. However, scant attention has been devoted to +providing an integrated framework for high-quality reference-based 3D-aware +image editing within this domain. This study addresses this gap by exploring +and demonstrating the effectiveness of EG3D's triplane space for achieving +advanced reference-based edits, presenting a unique perspective on 3D-aware +image editing through our novel pipeline. Our approach integrates the encoding +of triplane features, spatial disentanglement and automatic localization of +features in the triplane domain, and fusion learning for desired image editing. +Moreover, our framework demonstrates versatility across domains, extending its +effectiveness to animal face edits and partial stylization of cartoon +portraits. The method shows significant improvements over relevant 3D-aware +latent editing and 2D reference-based editing methods, both qualitatively and +quantitatively. Project page: https://three-bee.github.io/triplane_edit + +
+
+
+
+
+ + ☆ Robust Concept Erasure Using Task Vectors + + +
+ With the rapid growth of text-to-image models, a variety of techniques have +been suggested to prevent undesirable image generations. Yet, these methods +often only protect against specific user prompts and have been shown to allow +unsafe generations with other inputs. Here we focus on unconditionally erasing +a concept from a text-to-image model rather than conditioning the erasure on +the user's prompt. We first show that compared to input-dependent erasure +methods, concept erasure that uses Task Vectors (TV) is more robust to +unexpected user inputs, not seen during training. However, TV-based erasure can +also affect the core performance of the edited model, particularly when the +required edit strength is unknown. To this end, we propose a method called +Diverse Inversion, which we use to estimate the required strength of the TV +edit. Diverse Inversion finds within the model input space a large set of word +embeddings, each of which induces the generation of the target concept. We find +that encouraging diversity in the set makes our estimation more robust to +unexpected prompts. Finally, we show that Diverse Inversion enables us to apply +a TV edit only to a subset of the model weights, enhancing the erasure +capabilities while better maintaining the core functionality of the model. + +
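The task-vector arithmetic behind this kind of editing is simple to state: a task vector is the difference between fine-tuned and original weights, and erasure subtracts a scaled task vector from the original weights, optionally over only a subset of parameters. The sketch below shows that arithmetic over generic PyTorch state dicts; the tiny model, the scale `alpha`, and the parameter subset are illustrative, and the paper's Diverse Inversion procedure for choosing the edit strength is not shown.

```python
# Task-vector arithmetic: tv = finetuned - original; edited = original - alpha * tv.
import torch
import torch.nn as nn

def task_vector(original, finetuned):
    return {k: finetuned[k] - original[k] for k in original}

def apply_task_vector(original, tv, alpha, keys=None):
    keys = set(keys) if keys is not None else set(original)
    return {k: original[k] - alpha * tv[k] if k in keys else original[k].clone()
            for k in original}

base = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 4))
finetuned = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 4))
finetuned.load_state_dict({k: v + 0.05 * torch.randn_like(v)        # pretend fine-tune
                           for k, v in base.state_dict().items()})

tv = task_vector(base.state_dict(), finetuned.state_dict())
edited = apply_task_vector(base.state_dict(), tv, alpha=1.0, keys=["2.weight", "2.bias"])

erased_model = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 4))
erased_model.load_state_dict(edited)
print(torch.equal(edited["0.weight"], base.state_dict()["0.weight"]))   # True: untouched subset
```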
+
+
+
+
+ + ☆ LCM-Lookahead for Encoder-based Text-to-Image Personalization + + +
+ Recent advancements in diffusion models have introduced fast sampling methods +that can effectively produce high-quality images in just one or a few denoising +steps. Interestingly, when these are distilled from existing diffusion models, +they often maintain alignment with the original model, retaining similar +outputs for similar prompts and seeds. These properties present opportunities +to leverage fast sampling methods as a shortcut-mechanism, using them to create +a preview of denoised outputs through which we can backpropagate image-space +losses. In this work, we explore the potential of using such +shortcut-mechanisms to guide the personalization of text-to-image models to +specific facial identities. We focus on encoder-based personalization +approaches, and demonstrate that by tuning them with a lookahead identity loss, +we can achieve higher identity fidelity, without sacrificing layout diversity +or prompt alignment. We further explore the use of attention sharing mechanisms +and consistent data generation for the task of personalization, and find that +encoder training can benefit from both. + +
+
+ comment: Project page at https://lcm-lookahead.github.io/ +
+
+
+
+
+ + ☆ DeViDe: Faceted medical knowledge for improved medical vision-language + pre-training + + +
+ Vision-language pre-training for chest X-rays has made significant strides, +primarily by utilizing paired radiographs and radiology reports. However, +existing approaches often face challenges in encoding medical knowledge +effectively. While radiology reports provide insights into the current disease +manifestation, medical definitions (as used by contemporary methods) tend to be +overly abstract, creating a gap in knowledge. To address this, we propose +DeViDe, a novel transformer-based method that leverages radiographic +descriptions from the open web. These descriptions outline general visual +characteristics of diseases in radiographs, and when combined with abstract +definitions and radiology reports, provide a holistic snapshot of knowledge. +DeViDe incorporates three key features for knowledge-augmented vision language +alignment: First, a large-language model-based augmentation is employed to +homogenise medical knowledge from diverse sources. Second, this knowledge is +aligned with image information at various levels of granularity. Third, a novel +projection layer is proposed to handle the complexity of aligning each image +with multiple descriptions arising in a multi-label setting. In zero-shot +settings, DeViDe performs comparably to fully supervised models on external +datasets and achieves state-of-the-art results on three large-scale datasets. +Additionally, fine-tuning DeViDe on four downstream tasks and six segmentation +tasks showcases its superior performance across data from diverse +distributions. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2208.04060 by other authors +
+
+
+
+
+ + ☆ On the Efficiency of Convolutional Neural Networks + + +
+ Since the breakthrough performance of AlexNet in 2012, convolutional neural +networks (convnets) have grown into extremely powerful vision models. Deep +learning researchers have used convnets to produce accurate results that were +unachievable a decade ago. Yet computer scientists make computational +efficiency their primary objective. Accuracy with exorbitant cost is not +acceptable; an algorithm must also minimize its computational requirements. +Confronted with the daunting computation that convnets use, deep learning +researchers also became interested in efficiency. Researchers applied +tremendous effort to find the convnet architectures that have the greatest +efficiency. However, skepticism grew among researchers and engineers alike +about the relevance of arithmetic complexity. Contrary to the prevailing view +that latency and arithmetic complexity are irreconcilable, a simple formula +relates both through computational efficiency. This insight enabled us to +co-optimize the separate factors that determine latency. We observed that the +degenerate conv2d layers that produce the best accuracy-complexity trade-off +also have low operational intensity. Therefore, kernels that implement these +layers use significant memory resources. We solved this optimization problem +with block-fusion kernels that implement all layers of a residual block, +thereby creating temporal locality, avoiding communication, and reducing +workspace size. Our ConvFirst model with block-fusion kernels ran approximately +four times as fast as the ConvNeXt baseline with PyTorch Inductor, at equal +accuracy on the ImageNet-1K classification task. Our unified approach to +convnet efficiency envisions a new era of models and kernels that achieve +greater accuracy at lower cost. + +
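+ The claim that a simple formula relates latency and arithmetic complexity
+through computational efficiency can be illustrated with a back-of-the-envelope,
+roofline-style estimate. This is a generic sketch with made-up hardware numbers,
+not the paper's exact model.
+
+def latency_seconds(flops, bytes_moved, peak_flops, peak_bandwidth):
+    """latency ~ work / (peak throughput * achieved efficiency)."""
+    intensity = flops / bytes_moved                            # operational intensity (FLOPs/byte)
+    achievable = min(peak_flops, intensity * peak_bandwidth)   # compute- or memory-bound
+    efficiency = achievable / peak_flops
+    return flops / (peak_flops * efficiency), efficiency
+
+# Placeholder numbers: 4 GFLOPs, 200 MB of traffic, 100 TFLOP/s peak, 1 TB/s bandwidth.
+latency, eff = latency_seconds(4e9, 2e8, 100e12, 1e12)
+print(f"estimated latency {latency * 1e3:.2f} ms at {eff:.0%} efficiency")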
+
+
+
+
+ + ☆ Per-Gaussian Embedding-Based Deformation for Deformable 3D Gaussian + Splatting + + +
+ As 3D Gaussian Splatting (3DGS) provides fast and high-quality novel view
+synthesis, it is a natural extension to deform a canonical 3DGS to multiple
+frames. However, previous works fail to accurately reconstruct dynamic scenes,
+showing two failure modes in particular: 1) static parts that move along with
+nearby dynamic parts, and 2) dynamic areas that appear blurry. We attribute
+these failures to the design of the deformation field, which is built as a
+coordinate-based function. This approach is problematic because 3DGS is a
+mixture of multiple fields centered at the Gaussians, not just a single
+coordinate-based framework. To resolve this problem, we define the deformation
+as a function of per-Gaussian embeddings and temporal embeddings. Moreover, we
+decompose deformations into coarse and fine deformations to model slow and
+fast movements, respectively. Also, we introduce an efficient training
+strategy for faster convergence and higher quality. Project page:
+https://jeongminb.github.io/e-d3dgs/
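+ To make the "deformation as a function of per-Gaussian embeddings and temporal
+embeddings" idea concrete, here is a minimal, hypothetical PyTorch sketch. The
+module names, embedding sizes, and single-MLP decoder are assumptions for
+illustration; the paper's coarse/fine decomposition and training strategy are
+not reproduced.
+
+import torch
+import torch.nn as nn
+
+class PerGaussianDeform(nn.Module):
+    """Toy deformation head: per-Gaussian code + time code -> xyz offset."""
+    def __init__(self, num_gaussians, embed_dim=32, time_dim=16, hidden=64):
+        super().__init__()
+        self.gauss_embed = nn.Embedding(num_gaussians, embed_dim)   # one code per Gaussian
+        self.time_mlp = nn.Sequential(nn.Linear(1, time_dim), nn.ReLU())
+        self.decoder = nn.Sequential(
+            nn.Linear(embed_dim + time_dim, hidden), nn.ReLU(),
+            nn.Linear(hidden, 3),                                   # position offset
+        )
+
+    def forward(self, gaussian_ids, t):
+        z = self.gauss_embed(gaussian_ids)                          # (N, embed_dim)
+        tau = self.time_mlp(t.expand(len(gaussian_ids), 1))         # (N, time_dim)
+        return self.decoder(torch.cat([z, tau], dim=-1))            # (N, 3)
+
+model = PerGaussianDeform(num_gaussians=1000)
+offsets = model(torch.arange(1000), torch.tensor([0.5]))
+print(offsets.shape)  # torch.Size([1000, 3])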
+
+ comment: Preprint +
+
+
+
+
+ + ☆ InsectMamba: Insect Pest Classification with State Space Model + + +
+ The classification of insect pests is a critical task in agricultural +technology, vital for ensuring food security and environmental sustainability. +However, the complexity of pest identification, due to factors like high +camouflage and species diversity, poses significant obstacles. Existing methods +struggle with the fine-grained feature extraction needed to distinguish between +closely related pest species. Although recent advancements have utilized +modified network structures and combined deep learning approaches to improve +accuracy, challenges persist due to the similarity between pests and their +surroundings. To address this problem, we introduce InsectMamba, a novel +approach that integrates State Space Models (SSMs), Convolutional Neural +Networks (CNNs), Multi-Head Self-Attention mechanism (MSA), and Multilayer +Perceptrons (MLPs) within Mix-SSM blocks. This integration facilitates the +extraction of comprehensive visual features by leveraging the strengths of each +encoding strategy. A selective module is also proposed to adaptively aggregate +these features, enhancing the model's ability to discern pest characteristics. +InsectMamba was evaluated against strong competitors across five insect pest +classification datasets. The results demonstrate its superior performance and +verify the significance of each model component by an ablation study. + +
+
+ comment: 13 pages, 5 figures +
+
+
+
+
+ + ☆ SemGrasp: Semantic Grasp Generation via Language Aligned Discretization + + +
+ Generating natural human grasps necessitates consideration of not just object +geometry but also semantic information. Solely depending on object shape for +grasp generation confines the applications of prior methods in downstream +tasks. This paper presents a novel semantic-based grasp generation method, +termed SemGrasp, which generates a static human grasp pose by incorporating +semantic information into the grasp representation. We introduce a discrete +representation that aligns the grasp space with semantic space, enabling the +generation of grasp postures in accordance with language instructions. A +Multimodal Large Language Model (MLLM) is subsequently fine-tuned, integrating +object, grasp, and language within a unified semantic space. To facilitate the +training of SemGrasp, we have compiled a large-scale, grasp-text-aligned +dataset named CapGrasp, featuring about 260k detailed captions and 50k diverse +grasps. Experimental findings demonstrate that SemGrasp efficiently generates +natural human grasps in alignment with linguistic intentions. Our code, models, +and dataset are available publicly at: https://kailinli.github.io/SemGrasp. + +
+
+
+
+
+ + ☆ Towards more realistic human motion prediction with attention to motion + coordination + + +
+ Joint relation modeling is a crucial component in human motion prediction.
+Most existing methods rely on skeletal-based graphs to build the joint
+relations, where local interactive relations between joint pairs are well
+learned. However, the motion coordination, a global joint relation reflecting
+the simultaneous cooperation of all joints, is usually weakened because it is
+learned from part to whole progressively and asynchronously. Thus, the final
+predicted motions usually appear unrealistic. To tackle this issue, we learn a
+medium, called coordination attractor (CA), from the spatiotemporal features of
+motion to characterize the global motion features, which is subsequently used
+to build new relative joint relations. Through the CA, all joints are related
+simultaneously, and thus the motion coordination of all joints can be better
+learned. Based on this, we further propose a novel joint relation modeling
+module, Comprehensive Joint Relation Extractor (CJRE), to combine this motion
+coordination with the local interactions between joint pairs in a unified
+manner. Additionally, we also present a Multi-timescale Dynamics Extractor
+(MTDE) to extract enriched dynamics from the raw position information for
+effective prediction. Extensive experiments show that the proposed framework
+outperforms state-of-the-art methods in both short- and long-term predictions
+on H3.6M, CMU-Mocap, and 3DPW.
+
+ comment: Accepted by TCSVT +
+
+
+
+
+ + ☆ DreamScene: 3D Gaussian-based Text-to-3D Scene Generation via Formation + Pattern Sampling + + +
+ Text-to-3D scene generation holds immense potential for the gaming, film, and +architecture sectors. Despite significant progress, existing methods struggle +with maintaining high quality, consistency, and editing flexibility. In this +paper, we propose DreamScene, a 3D Gaussian-based novel text-to-3D scene +generation framework, to tackle the aforementioned three challenges mainly via +two strategies. First, DreamScene employs Formation Pattern Sampling (FPS), a +multi-timestep sampling strategy guided by the formation patterns of 3D +objects, to form fast, semantically rich, and high-quality representations. FPS +uses 3D Gaussian filtering for optimization stability, and leverages +reconstruction techniques to generate plausible textures. Second, DreamScene +employs a progressive three-stage camera sampling strategy, specifically +designed for both indoor and outdoor settings, to effectively ensure +object-environment integration and scene-wide 3D consistency. Last, DreamScene +enhances scene editing flexibility by integrating objects and environments, +enabling targeted adjustments. Extensive experiments validate DreamScene's +superiority over current state-of-the-art techniques, heralding its +wide-ranging potential for diverse applications. Code and demos will be +released at https://dreamscene-project.github.io . + +
+
+
+
+
+ + ☆ TinyVQA: Compact Multimodal Deep Neural Network for Visual Question + Answering on Resource-Constrained Devices + + +
+ Traditional machine learning models often require powerful hardware, making
+them unsuitable for deployment on resource-limited devices. Tiny Machine
+Learning (tinyML) has emerged as a promising approach for running machine
+learning models on these devices, but integrating multiple data modalities into
+tinyML models remains a challenge due to increased complexity, latency, and
+power consumption. This paper proposes TinyVQA, a novel multimodal deep neural
+network for visual question answering tasks that can be deployed on
+resource-constrained tinyML hardware. TinyVQA leverages a supervised
+attention-based model to learn how to answer questions about images using both
+vision and language modalities. Distilled knowledge from the supervised
+attention-based VQA model trains the memory-aware, compact TinyVQA model, and a
+low bit-width quantization technique is employed to further compress the model
+for deployment on tinyML devices. The TinyVQA model was evaluated on the
+FloodNet dataset, which is used for post-disaster damage assessment. The
+compact model achieved an accuracy of 79.5%, demonstrating the effectiveness of
+TinyVQA for real-world applications. Additionally, the model was deployed on a
+Crazyflie 2.0 drone, equipped with an AI deck and GAP8 microprocessor. The
+TinyVQA model achieved a low latency of 56 ms and consumed 693 mW of power
+while deployed on the tiny drone, showcasing its suitability for
+resource-constrained embedded systems.
+
+ comment: Accepted as a full paper by the tinyML Research Symposium 2024 +
+
+
+
+
+ + ☆ Terrain Point Cloud Inpainting via Signal Decomposition + + +
+ The rapid development of 3D acquisition technology has made it possible to +obtain point clouds of real-world terrains. However, due to limitations in +sensor acquisition technology or specific requirements, point clouds often +contain defects such as holes with missing data. Inpainting algorithms are +widely used to patch these holes. However, existing traditional inpainting +algorithms rely on precise hole boundaries, which limits their ability to +handle cases where the boundaries are not well-defined. On the other hand, +learning-based completion methods often prioritize reconstructing the entire +point cloud instead of solely focusing on hole filling. Based on the fact that +real-world terrain exhibits both global smoothness and rich local detail, we +propose a novel representation for terrain point clouds. This representation +can help to repair the holes without clear boundaries. Specifically, it +decomposes terrains into low-frequency and high-frequency components, which are +represented by B-spline surfaces and relative height maps respectively. In this +way, the terrain point cloud inpainting problem is transformed into a B-spline +surface fitting and 2D image inpainting problem. By solving the two problems, +the highly complex and irregular holes on the terrain point clouds can be +well-filled, which not only satisfies the global terrain undulation but also +exhibits rich geometric details. The experimental results also demonstrate the +effectiveness of our method. + +
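+ The low-/high-frequency decomposition described above can be prototyped with
+SciPy: fit a heavily smoothed B-spline surface for the global undulation and
+keep the residual as a relative height map. The synthetic samples and the
+smoothing factor below are placeholders, and the hole-filling steps of the
+actual pipeline are not shown.
+
+import numpy as np
+from scipy.interpolate import bisplrep, bisplev
+
+rng = np.random.default_rng(0)
+x, y = rng.uniform(0, 10, 2000), rng.uniform(0, 10, 2000)
+z = np.sin(x) + 0.5 * np.cos(2 * y) + 0.05 * rng.standard_normal(2000)  # toy terrain
+
+# Low-frequency component: smooth B-spline surface (large s -> smoother fit).
+tck = bisplrep(x, y, z, s=len(z))
+low_freq = np.array([bisplev(xi, yi, tck) for xi, yi in zip(x, y)])
+
+# High-frequency component: per-sample relative height, which could be
+# rasterized into a 2D map and repaired with image inpainting.
+high_freq = z - low_freq
+print(f"residual std {high_freq.std():.3f} vs. signal std {z.std():.3f}")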
+
+
+
+
+ + ☆ PointInfinity: Resolution-Invariant Point Diffusion Models CVPR 2024 + + +
+ We present PointInfinity, an efficient family of point cloud diffusion +models. Our core idea is to use a transformer-based architecture with a +fixed-size, resolution-invariant latent representation. This enables efficient +training with low-resolution point clouds, while allowing high-resolution point +clouds to be generated during inference. More importantly, we show that scaling +the test-time resolution beyond the training resolution improves the fidelity +of generated point clouds and surfaces. We analyze this phenomenon and draw a +link to classifier-free guidance commonly used in diffusion models, +demonstrating that both allow trading off fidelity and variability during +inference. Experiments on CO3D show that PointInfinity can efficiently generate +high-resolution point clouds (up to 131k points, 31 times more than Point-E) +with state-of-the-art quality. + +
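+ For reference, the classifier-free guidance rule that the analysis above draws
+an analogy to is the standard combination below; the paper's test-time
+resolution scaling is a separate mechanism and is not shown here.
+
+def classifier_free_guidance(eps_uncond, eps_cond, w):
+    # Larger guidance weight w trades variability for fidelity, the same kind
+    # of trade-off the authors report for scaling the test-time resolution.
+    return eps_uncond + w * (eps_cond - eps_uncond)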
+
+ comment: Accepted to CVPR 2024, project website at + https://zixuanh.com/projects/pointinfinity +
+
+
+
+
+ + ☆ Segmentation-Guided Knee Radiograph Generation using Conditional + Diffusion Models + + +
+ Deep learning-based medical image processing algorithms require +representative data during development. In particular, surgical data might be +difficult to obtain, and high-quality public datasets are limited. To overcome +this limitation and augment datasets, a widely adopted solution is the +generation of synthetic images. In this work, we employ conditional diffusion +models to generate knee radiographs from contour and bone segmentations. +Remarkably, two distinct strategies are presented by incorporating the +segmentation as a condition into the sampling and training process, namely, +conditional sampling and conditional training. The results demonstrate that +both methods can generate realistic images while adhering to the conditioning +segmentation. The conditional training method outperforms the conditional +sampling method and the conventional U-Net. + +
+
+
+
+
+ + ☆ Is CLIP the main roadblock for fine-grained open-world perception? + + +
+ Modern applications increasingly demand flexible computer vision models that +adapt to novel concepts not encountered during training. This necessity is +pivotal in emerging domains like extended reality, robotics, and autonomous +driving, which require the ability to respond to open-world stimuli. A key +ingredient is the ability to identify objects based on free-form textual +queries defined at inference time - a task known as open-vocabulary object +detection. Multimodal backbones like CLIP are the main enabling technology for +current open-world perception solutions. Despite performing well on generic +queries, recent studies highlighted limitations on the fine-grained recognition +capabilities in open-vocabulary settings - i.e., for distinguishing subtle +object features like color, shape, and material. In this paper, we perform a +detailed examination of these open-vocabulary object recognition limitations to +find the root cause. We evaluate the performance of CLIP, the most commonly +used vision-language backbone, against a fine-grained object-matching +benchmark, revealing interesting analogies between the limitations of +open-vocabulary object detectors and their backbones. Experiments suggest that +the lack of fine-grained understanding is caused by the poor separability of +object characteristics in the CLIP latent space. Therefore, we try to +understand whether fine-grained knowledge is present in CLIP embeddings but not +exploited at inference time due, for example, to the unsuitability of the +cosine similarity matching function, which may discard important object +characteristics. Our preliminary experiments show that simple CLIP latent-space +re-projections help separate fine-grained concepts, paving the way towards the +development of backbones inherently able to process fine-grained details. The +code for reproducing these experiments is available at +https://github.com/lorebianchi98/FG-CLIP. + +
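+ A minimal sketch of the kind of latent-space re-projection experiment described
+above: learn a linear re-projection of image embeddings that separates a
+fine-grained attribute, then match with cosine similarity in the projected
+space. The random arrays stand in for CLIP embeddings and attribute labels;
+this is an assumed setup, not the authors' exact procedure.
+
+import numpy as np
+from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
+
+rng = np.random.default_rng(0)
+emb = rng.standard_normal((600, 512)).astype(np.float32)   # stand-in CLIP embeddings
+attr = rng.integers(0, 3, 600)                             # fine-grained attribute (e.g., color)
+emb[attr == 1, :4] += 0.5                                   # weak, entangled attribute signal
+emb[attr == 2, :4] -= 0.5
+
+proj = LinearDiscriminantAnalysis(n_components=2).fit(emb, attr)
+re_emb = proj.transform(emb)                                # re-projected latent space
+
+def cosine(a, b):
+    a = a / np.linalg.norm(a, axis=-1, keepdims=True)
+    b = b / np.linalg.norm(b, axis=-1, keepdims=True)
+    return a @ b.T
+
+print(cosine(re_emb[:5], re_emb[5:10]).round(2))            # matching in the new space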
+
+
+
+
+ + ☆ If It's Not Enough, Make It So: Reducing Authentic Data Demand in Face + Recognition through Synthetic Faces + + +
+ Recent advances in deep face recognition have spurred a growing demand for +large, diverse, and manually annotated face datasets. Acquiring authentic, +high-quality data for face recognition has proven to be a challenge, primarily +due to privacy concerns. Large face datasets are primarily sourced from +web-based images, lacking explicit user consent. In this paper, we examine +whether and how synthetic face data can be used to train effective face +recognition models with reduced reliance on authentic images, thereby +mitigating data collection concerns. First, we explored the performance gap +among recent state-of-the-art face recognition models, trained with synthetic +data only and authentic (scarce) data only. Then, we deepened our analysis by +training a state-of-the-art backbone with various combinations of synthetic and +authentic data, gaining insights into optimizing the limited use of the latter +for verification accuracy. Finally, we assessed the effectiveness of data +augmentation approaches on synthetic and authentic data, with the same goal in +mind. Our results highlighted the effectiveness of FR trained on combined +datasets, particularly when combined with appropriate augmentation techniques. + +
+
+ comment: Accepted as a full paper at FG 2024 main track +
+
+
+
+
+ + ☆ COMO: Compact Mapping and Odometry + + +
+ We present COMO, a real-time monocular mapping and odometry system that +encodes dense geometry via a compact set of 3D anchor points. Decoding anchor +point projections into dense geometry via per-keyframe depth covariance +functions guarantees that depth maps are joined together at visible anchor +points. The representation enables joint optimization of camera poses and dense +geometry, intrinsic 3D consistency, and efficient second-order inference. To +maintain a compact yet expressive map, we introduce a frontend that leverages +the covariance function for tracking and initializing potentially visually +indistinct 3D points across frames. Altogether, we introduce a real-time system +capable of estimating accurate poses and consistent geometry. + +
+
+
+
+
+ + ☆ HAPNet: Toward Superior RGB-Thermal Scene Parsing via Hybrid, + Asymmetric, and Progressive Heterogeneous Feature Fusion + + +
+ Data-fusion networks have shown significant promise for RGB-thermal scene +parsing. However, the majority of existing studies have relied on symmetric +duplex encoders for heterogeneous feature extraction and fusion, paying +inadequate attention to the inherent differences between RGB and thermal +modalities. Recent progress in vision foundation models (VFMs) trained through +self-supervision on vast amounts of unlabeled data has proven their ability to +extract informative, general-purpose features. However, this potential has yet +to be fully leveraged in the domain. In this study, we take one step toward +this new research area by exploring a feasible strategy to fully exploit VFM +features for RGB-thermal scene parsing. Specifically, we delve deeper into the +unique characteristics of RGB and thermal modalities, thereby designing a +hybrid, asymmetric encoder that incorporates both a VFM and a convolutional +neural network. This design allows for more effective extraction of +complementary heterogeneous features, which are subsequently fused in a +dual-path, progressive manner. Moreover, we introduce an auxiliary task to +further enrich the local semantics of the fused features, thereby improving the +overall performance of RGB-thermal scene parsing. Our proposed HAPNet, equipped +with all these components, demonstrates superior performance compared to all +other state-of-the-art RGB-thermal scene parsing networks, achieving top ranks +across three widely used public RGB-thermal scene parsing datasets. We believe +this new paradigm has opened up new opportunities for future developments in +data-fusion scene parsing approaches. + +
+
+ comment: 12 pages, 4 figures
+
+
+
+
+ + ☆ SDPose: Tokenized Pose Estimation via Circulation-Guide + Self-Distillation CVPR 2024 + + +
+ Recently, transformer-based methods have achieved state-of-the-art prediction
+quality on human pose estimation (HPE). Nonetheless, most of these
+top-performing transformer-based models are too computationally expensive and
+storage-demanding to deploy on edge computing platforms. Those
+transformer-based models that require fewer resources are prone to
+under-fitting due to their smaller scale and thus perform notably worse than
+their larger counterparts. Given this conundrum, we introduce SDPose, a new
+self-distillation method for improving the performance of small
+transformer-based models. To mitigate the problem of under-fitting, we design a
+transformer module named Multi-Cycled Transformer (MCT) based on multiple-cycled
+forwards to more fully exploit the potential of small model parameters.
+Further, to avoid the additional inference cost introduced by MCT, we introduce
+a self-distillation scheme that transfers the knowledge from the MCT module
+into a naive single-forward model. Specifically, on the MSCOCO validation
+dataset, SDPose-T obtains 69.7% mAP with 4.4M parameters and 1.8 GFLOPs.
+Furthermore, SDPose-S-V2 obtains 73.5% mAP on the MSCOCO validation dataset
+with 6.2M parameters and 4.7 GFLOPs, achieving a new state-of-the-art among
+predominant tiny neural network methods. Our code is available at
+https://github.com/MartyrPenink/SDPose.
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ DQ-DETR: DETR with Dynamic Query for Tiny Object Detection + + +
+ Despite previous DETR-like methods having performed successfully in generic +object detection, tiny object detection is still a challenging task for them +since the positional information of object queries is not customized for +detecting tiny objects, whose scale is extraordinarily smaller than general +objects. Also, DETR-like methods using a fixed number of queries make them +unsuitable for aerial datasets, which only contain tiny objects, and the +numbers of instances are imbalanced between different images. Thus, we present +a simple yet effective model, named DQ-DETR, which consists of three different +components: categorical counting module, counting-guided feature enhancement, +and dynamic query selection to solve the above-mentioned problems. DQ-DETR uses +the prediction and density maps from the categorical counting module to +dynamically adjust the number of object queries and improve the positional +information of queries. Our model DQ-DETR outperforms previous CNN-based and +DETR-like methods, achieving state-of-the-art mAP 30.2% on the AI-TOD-V2 +dataset, which mostly consists of tiny objects. + +
+
+
+
+
+ + ☆ AdaGlimpse: Active Visual Exploration with Arbitrary Glimpse Position + and Scale + + +
+ Active Visual Exploration (AVE) is a task that involves dynamically selecting +observations (glimpses), which is critical to facilitate comprehension and +navigation within an environment. While modern AVE methods have demonstrated +impressive performance, they are constrained to fixed-scale glimpses from rigid +grids. In contrast, existing mobile platforms equipped with optical zoom +capabilities can capture glimpses of arbitrary positions and scales. To address +this gap between software and hardware capabilities, we introduce AdaGlimpse. +It uses Soft Actor-Critic, a reinforcement learning algorithm tailored for +exploration tasks, to select glimpses of arbitrary position and scale. This +approach enables our model to rapidly establish a general awareness of the +environment before zooming in for detailed analysis. Experimental results +demonstrate that AdaGlimpse surpasses previous methods across various visual +tasks while maintaining greater applicability in realistic AVE scenarios. + +
+
+
+
+
+ + ☆ Towards Automated Movie Trailer Generation CVPR 2024 + + +
+ Movie trailers are an essential tool for promoting films and attracting +audiences. However, the process of creating trailers can be time-consuming and +expensive. To streamline this process, we propose an automatic trailer +generation framework that generates plausible trailers from a full movie by +automating shot selection and composition. Our approach draws inspiration from +machine translation techniques and models the movies and trailers as sequences +of shots, thus formulating the trailer generation problem as a +sequence-to-sequence task. We introduce Trailer Generation Transformer (TGT), a +deep-learning framework utilizing an encoder-decoder architecture. TGT movie +encoder is tasked with contextualizing each movie shot representation via +self-attention, while the autoregressive trailer decoder predicts the feature +representation of the next trailer shot, accounting for the relevance of shots' +temporal order in trailers. Our TGT significantly outperforms previous methods +on a comprehensive suite of metrics. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Performance of computer vision algorithms for fine-grained + classification using crowdsourced insect images + + +
+ With fine-grained classification, we identify unique characteristics to
+distinguish among classes of the same super-class. We focus on species
+recognition in Insecta, as insects are critical for biodiversity monitoring and
+form the base of many ecosystems. With citizen science campaigns, billions of
+images are collected in the wild. Once these are labelled, experts can use them
+to create distribution maps. However, the labelling process is time-consuming,
+which is where computer vision comes in. The field of computer vision offers a
+wide range of algorithms, each with its strengths and weaknesses; how do we
+identify the algorithm that is in line with our application? To answer this
+question, we provide a full and detailed evaluation of nine algorithms among
+deep convolutional networks (CNN), vision transformers (ViT), and
+locality-based vision transformers (LBVT) on four different aspects:
+classification performance, embedding quality, computational cost, and gradient
+activity. We offer insights not previously available in this domain, showing to
+what extent these algorithms solve the fine-grained tasks in Insecta. We found
+that the ViTs perform best on inference speed and computational cost, while the
+LBVTs outperform the others on classification performance and embedding
+quality; the CNNs provide a trade-off among the metrics.
+
+
+
+
+ + ☆ You Only Scan Once: A Dynamic Scene Reconstruction Pipeline for 6-DoF + Robotic Grasping of Novel Objects ICRA 2024 + + +
+ In the realm of robotic grasping, achieving accurate and reliable
+interactions with the environment is a pivotal challenge. Traditional grasp
+planning methods that utilize partial point clouds derived from depth images
+often suffer from reduced scene understanding due to occlusion, ultimately
+impeding their grasping accuracy. Furthermore, scene reconstruction methods
+have primarily relied upon static techniques, which are susceptible to
+environmental changes during the manipulation process, limiting their efficacy
+in real-time grasping tasks. To address these limitations, this paper
+introduces a novel two-stage pipeline for dynamic scene reconstruction. In the
+first stage, our approach takes scene scanning as input to register each target
+object with mesh reconstruction and novel object pose tracking. In the second
+stage, pose tracking is still performed to provide object poses in real time,
+enabling our approach to transform the reconstructed object point clouds back
+into the scene. Unlike conventional methodologies, which rely on static scene
+snapshots, our method continuously captures the evolving scene geometry,
+resulting in a comprehensive and up-to-date point cloud representation. By
+circumventing the constraints posed by occlusion, our method enhances the
+overall grasp planning process and empowers state-of-the-art 6-DoF robotic
+grasping algorithms to exhibit markedly improved accuracy.
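+ The second stage described above ultimately places each reconstructed object
+back into the scene using its tracked 6-DoF pose, i.e., a rigid transform of
+the object point cloud. A minimal NumPy sketch with a made-up pose:
+
+import numpy as np
+
+def to_scene(points_obj, R, t):
+    """Map object-frame points into the scene frame using a tracked pose (R, t)."""
+    return points_obj @ R.T + t
+
+R = np.array([[0., -1., 0.],        # 90-degree yaw, as a placeholder rotation
+              [1.,  0., 0.],
+              [0.,  0., 1.]])
+t = np.array([0.2, 0.0, 0.5])       # placeholder translation in meters
+cloud = np.random.rand(1000, 3)
+print(to_scene(cloud, R, t).shape)  # (1000, 3)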
+
+ comment: ICRA 2024 +
+
+
+
+
+ + ☆ How Much Data are Enough? Investigating Dataset Requirements for + Patch-Based Brain MRI Segmentation Tasks + + +
+ Training deep neural networks reliably requires access to large-scale
+datasets. However, obtaining such datasets can be challenging, especially in
+the context of neuroimaging analysis tasks, where the cost associated with
+image acquisition and annotation can be prohibitive. To mitigate both the time
+and financial costs associated with model development, a clear understanding of
+the amount of data required to train a satisfactory model is crucial. This
+paper focuses on an early phase of deep learning research, prior to model
+development, and proposes a strategic framework for estimating the amount of
+annotated data required to train patch-based segmentation networks. This
+framework includes the establishment of performance expectations using a novel
+Minor Boundary Adjustment for Threshold (MinBAT) method, and standardizing
+patch selection through the ROI-based Expanded Patch Selection (REPS) method.
+Our experiments demonstrate that tasks involving regions of interest (ROIs)
+with different sizes or shapes may yield different acceptable Dice Similarity
+Coefficient (DSC) scores. By setting an acceptable DSC as the target, the
+required amount of training data can be estimated and even predicted as data
+accumulates. This approach could assist researchers and engineers in estimating
+the cost associated with data collection and annotation when defining a new
+segmentation task based on deep neural networks, ultimately contributing to
+their efficient translation to real-world applications.
+
+
+
+
+ + ☆ SP$^2$OT: Semantic-Regularized Progressive Partial Optimal Transport for + Imbalanced Clustering + + +
+ Deep clustering, which learns representations and semantic clustering without
+label information, poses a great challenge for deep learning-based approaches.
+Despite significant progress in recent years, most existing methods focus on
+uniformly distributed datasets, significantly limiting their practical
+applicability. In this paper, we propose a more practical problem setting named
+deep imbalanced clustering, where the underlying classes exhibit an imbalanced
+distribution. To address this challenge, we introduce a novel optimal
+transport-based pseudo-label learning framework. Our framework formulates
+pseudo-label generation as a Semantic-regularized Progressive Partial Optimal
+Transport (SP$^2$OT) problem, which progressively transports each sample to
+imbalanced clusters under several prior distribution and semantic relation
+constraints, thus generating high-quality and imbalance-aware pseudo-labels. To
+solve SP$^2$OT, we develop a Majorization-Minimization-based optimization
+algorithm. More precisely, we employ the strategy of majorization to
+reformulate the SP$^2$OT problem into a Progressive Partial Optimal Transport
+problem, which can be transformed into an unbalanced optimal transport problem
+with augmented constraints and solved efficiently by a fast matrix scaling
+algorithm. Experiments on various datasets, including a human-curated
+long-tailed CIFAR100, challenging ImageNet-R, and large-scale subsets of
+fine-grained iNaturalist2018 datasets, demonstrate the superiority of our
+method.
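+ The "fast matrix scaling algorithm" mentioned above belongs to the Sinkhorn
+family. Below is a generic entropic Sinkhorn sketch on a toy cost matrix with
+an imbalanced cluster marginal; it is not the SP$^2$OT objective nor its
+majorization-minimization scheme, just the underlying scaling idea.
+
+import numpy as np
+
+def sinkhorn(cost, r, c, eps=0.1, iters=200):
+    """Entropic optimal transport via alternating row/column scaling."""
+    K = np.exp(-cost / eps)                 # Gibbs kernel
+    u = np.ones_like(r)
+    for _ in range(iters):
+        v = c / (K.T @ u)
+        u = r / (K @ v)
+    return u[:, None] * K * v[None, :]      # transport plan
+
+cost = np.random.rand(5, 3)
+r = np.full(5, 1 / 5)                       # uniform sample marginal
+c = np.array([0.5, 0.3, 0.2])               # imbalanced cluster marginal
+P = sinkhorn(cost, r, c)
+print(P.sum(axis=0).round(3), P.sum(axis=1).round(3))  # ~c and ~r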
+
+ comment: under review. arXiv admin note: substantial text overlap with + arXiv:2401.09266 +
+
+
+
+
+ + ☆ Part-Attention Based Model Make Occluded Person Re-Identification + Stronger + + +
+ The goal of occluded person re-identification (ReID) is to retrieve specific
+pedestrians in occluded situations. However, occluded person ReID still suffers
+from background clutter and low-quality local feature representations, which
+limit model performance. In our research, we introduce a new framework called
+PAB-ReID, which is a novel ReID model incorporating part-attention mechanisms
+to tackle the aforementioned issues effectively. Firstly, we introduce the
+human parsing label to guide the generation of more accurate human part
+attention maps. In addition, we propose a fine-grained feature focuser for
+generating fine-grained human local feature representations while suppressing
+background interference. Moreover, we design a part triplet loss to supervise
+the learning of human local features, which optimizes intra/inter-class
+distance. We conducted extensive experiments on specialized occlusion and
+regular ReID datasets, showcasing that our approach outperforms the existing
+state-of-the-art methods.
+
+ comment: Accepted By International Joint Conference on Neural Networks +
+
+
+
+
+ + ☆ ChangeMamba: Remote Sensing Change Detection with Spatio-Temporal State + Space Model + + +
+ Convolutional neural networks (CNN) and Transformers have made impressive
+progress in the field of remote sensing change detection (CD). However, both
+architectures have their inherent shortcomings. Recently, the Mamba
+architecture, based on state space models, has shown remarkable performance in
+a series of natural language processing tasks, and it can effectively
+compensate for the shortcomings of the above two architectures. In this paper,
+we explore for the first time the potential of the Mamba architecture for
+remote sensing change detection tasks. We tailor the corresponding frameworks,
+called MambaBCD, MambaSCD, and MambaBDA, for binary change detection (BCD),
+semantic change detection (SCD), and building damage assessment (BDA),
+respectively. All three frameworks adopt the cutting-edge visual Mamba
+architecture as the encoder, which allows full learning of global spatial
+contextual information from the input images. For the change decoder, which is
+available in all three architectures, we propose three spatio-temporal
+relationship modeling mechanisms, which can be naturally combined with the
+Mamba architecture and fully utilize its attributes to achieve spatio-temporal
+interaction of multi-temporal features and obtain accurate change information.
+On five benchmark datasets, our proposed frameworks outperform current CNN- and
+Transformer-based approaches without using any complex strategies or tricks,
+fully demonstrating the potential of the Mamba architecture. Specifically, we
+obtained 83.11%, 88.39% and 94.19% F1 scores on the three BCD datasets SYSU,
+LEVIR-CD+, and WHU-CD; on the SCD dataset SECOND, we obtained 24.04% SeK; and
+on the xBD dataset, we obtained an 81.41% overall F1 score. The source code
+will be available at https://github.com/ChenHongruixuan/MambaCD
+
+
+
+
+ + ☆ Generalizable 3D Scene Reconstruction via Divide and Conquer from a + Single View + + +
+ Single-view 3D reconstruction is currently approached from two dominant
+perspectives: reconstruction of scenes with limited diversity using 3D data
+supervision or reconstruction of diverse singular objects using large image
+priors. However, real-world scenarios are far more complex and exceed the
+capabilities of these methods. We therefore propose a hybrid method following a
+divide-and-conquer strategy. We first process the scene holistically,
+extracting depth and semantic information, and then leverage a single-shot
+object-level method for the detailed reconstruction of individual components.
+By following a compositional processing approach, the overall framework
+achieves full reconstruction of complex 3D scenes from a single image. We
+purposely design our pipeline to be highly modular by carefully integrating
+specific procedures for each processing step, without requiring an end-to-end
+training of the whole system. This enables the pipeline to naturally improve as
+future methods can replace the individual modules. We demonstrate the
+reconstruction performance of our approach on both synthetic and real-world
+scenes, comparing favorably against prior works. Project page:
+https://andreeadogaru.github.io/Gen3DSR.
+
+
+
+
+ + ☆ NMF-Based Analysis of Mobile Eye-Tracking Data + + +
+ The depiction of scanpaths from mobile eye-tracking recordings by thumbnails +from the stimulus allows the application of visual computing to detect areas of +interest in an unsupervised way. We suggest using nonnegative matrix +factorization (NMF) to identify such areas in stimuli. For a user-defined +integer k, NMF produces an explainable decomposition into k components, each +consisting of a spatial representation associated with a temporal indicator. In +the context of multiple eye-tracking recordings, this leads to k spatial +representations, where the temporal indicator highlights the appearance within +recordings. The choice of k provides an opportunity to control the refinement +of the decomposition, i.e., the number of areas to detect. We combine our +NMF-based approach with visualization techniques to enable an exploratory +analysis of multiple recordings. Finally, we demonstrate the usefulness of our +approach with mobile eye-tracking data of an art gallery. + +
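+ The k-component NMF decomposition described above can be set up directly with
+scikit-learn. The random matrix below is a stand-in for a (pixels x time bins)
+gaze-density matrix built from the recordings; the visualization step is
+omitted.
+
+import numpy as np
+from sklearn.decomposition import NMF
+
+rng = np.random.default_rng(0)
+X = rng.random((64 * 64, 300))            # rows: flattened density maps, cols: time bins
+
+k = 5                                     # user-defined number of areas of interest
+model = NMF(n_components=k, init="nndsvda", max_iter=500, random_state=0)
+W = model.fit_transform(X)                # (pixels, k): spatial components
+H = model.components_                     # (k, time):   temporal indicators
+
+areas = W.T.reshape(k, 64, 64)            # each component as a spatial map
+print(areas.shape, H.shape)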
+
+
+
+
+ + ☆ Future Predictive Success-or-Failure Classification for Long-Horizon + Robotic Tasks + + +
+ Automating long-horizon tasks with a robotic arm has been a central research +topic in robotics. Optimization-based action planning is an efficient approach +for creating an action plan to complete a given task. Construction of a +reliable planning method requires a design process of conditions, e.g., to +avoid collision between objects. The design process, however, has two critical +issues: 1) iterative trials--the design process is time-consuming due to the +trial-and-error process of modifying conditions, and 2) manual redesign--it is +difficult to cover all the necessary conditions manually. To tackle these +issues, this paper proposes a future-predictive +success-or-failure-classification method to obtain conditions automatically. +The key idea behind the proposed method is an end-to-end approach for +determining whether the action plan can complete a given task instead of +manually redesigning the conditions. The proposed method uses a long-horizon +future-prediction method to enable success-or-failure classification without +the execution of an action plan. This paper also proposes a regularization term +called transition consistency regularization to provide easy-to-predict feature +distribution. The regularization term improves future prediction and +classification performance. The effectiveness of our method is demonstrated +through classification and robotic-manipulation experiments. + +
+
+ comment: IJCNN 2024 +
+
+
+
+
+ + ☆ MiniGPT4-Video: Advancing Multimodal LLMs for Video Understanding with + Interleaved Visual-Textual Tokens + + +
+ This paper introduces MiniGPT4-Video, a multimodal Large Language Model (LLM) +designed specifically for video understanding. The model is capable of +processing both temporal visual and textual data, making it adept at +understanding the complexities of videos. Building upon the success of +MiniGPT-v2, which excelled in translating visual features into the LLM space +for single images and achieved impressive results on various image-text +benchmarks, this paper extends the model's capabilities to process a sequence +of frames, enabling it to comprehend videos. MiniGPT4-video does not only +consider visual content but also incorporates textual conversations, allowing +the model to effectively answer queries involving both visual and text +components. The proposed model outperforms existing state-of-the-art methods, +registering gains of 4.22%, 1.13%, 20.82%, and 13.1% on the MSVD, MSRVTT, TGIF, +and TVQA benchmarks respectively. Our models and code have been made publicly +available here https://vision-cair.github.io/MiniGPT4-video/ + +
+
+ comment: 6 pages,8 figures +
+
+
+
+
+ + ☆ AIGIQA-20K: A Large Database for AI-Generated Image Quality Assessment + + +
+ With the rapid advancements in AI-Generated Content (AIGC), AI-Generated
+Images (AIGIs) have been widely applied in entertainment, education, and social
+media. However, due to the significant variance in quality among different
+AIGIs, there is an urgent need for models that consistently match human
+subjective ratings. To address this issue, we organized a challenge on AIGC
+quality assessment at NTIRE 2024 that extensively considers 15 popular
+generative models with varying hyper-parameters (including classifier-free
+guidance, iteration epochs, and output image resolution), and gathered
+subjective scores from 21 subjects that comprehensively cover both perceptual
+quality and text-to-image alignment. This effort culminates in the creation of
+the largest fine-grained AIGI subjective quality database to date, with 20,000
+AIGIs and 420,000 subjective ratings, known as AIGIQA-20K. Furthermore, we
+conduct benchmark experiments on this database to assess the correspondence
+between 16 mainstream AIGI quality models and human perception. We anticipate
+that this large-scale quality database will inspire robust quality indicators
+for AIGIs and propel the evolution of AIGC for vision. The database is released
+on https://www.modelscope.cn/datasets/lcysyzxdxc/AIGCQA-30K-Image.
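+ Correspondence between quality models and human ratings is commonly reported
+with rank and linear correlation coefficients (SRCC/PLCC). A minimal sketch on
+placeholder score arrays, assuming the benchmark follows this convention:
+
+import numpy as np
+from scipy.stats import spearmanr, pearsonr
+
+rng = np.random.default_rng(0)
+mos = rng.uniform(1, 5, 200)              # mean opinion scores from subjects
+pred = mos + rng.normal(0, 0.5, 200)      # scores from an imperfect quality model
+
+srcc = spearmanr(pred, mos)[0]            # monotonic (rank) agreement
+plcc = pearsonr(pred, mos)[0]             # linear agreement
+print(f"SRCC={srcc:.3f} PLCC={plcc:.3f}")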
+
+
+
+
+ + ☆ Scaling Up Video Summarization Pretraining with Large Language Models CVPR 2024 + + +
+ Long-form video content constitutes a significant portion of internet +traffic, making automated video summarization an essential research problem. +However, existing video summarization datasets are notably limited in their +size, constraining the effectiveness of state-of-the-art methods for +generalization. Our work aims to overcome this limitation by capitalizing on +the abundance of long-form videos with dense speech-to-video alignment and the +remarkable capabilities of recent large language models (LLMs) in summarizing +long text. We introduce an automated and scalable pipeline for generating a +large-scale video summarization dataset using LLMs as Oracle summarizers. By +leveraging the generated dataset, we analyze the limitations of existing +approaches and propose a new video summarization model that effectively +addresses them. To facilitate further research in the field, our work also +presents a new benchmark dataset that contains 1200 long videos each with +high-quality summaries annotated by professionals. Extensive experiments +clearly indicate that our proposed approach sets a new state-of-the-art in +video summarization across several benchmarks. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Background Noise Reduction of Attention Map for Weakly Supervised + Semantic Segmentation + + +
+ In weakly-supervised semantic segmentation (WSSS) using only image-level +class labels, a problem with CNN-based Class Activation Maps (CAM) is that they +tend to activate the most discriminative local regions of objects. On the other +hand, methods based on Transformers learn global features but suffer from the +issue of background noise contamination. This paper focuses on addressing the +issue of background noise in attention weights within the existing WSSS method +based on Conformer, known as TransCAM. The proposed method successfully reduces +background noise, leading to improved accuracy of pseudo labels. Experimental +results demonstrate that our model achieves segmentation performance of 70.5% +on the PASCAL VOC 2012 validation data, 71.1% on the test data, and 45.9% on MS +COCO 2014 data, outperforming TransCAM in terms of segmentation performance. + +
+
+
+
+
+ + ☆ Two Tricks to Improve Unsupervised Segmentation Learning + + +
+ We present two practical improvement techniques for unsupervised segmentation +learning. These techniques address limitations in the resolution and accuracy +of predicted segmentation maps of recent state-of-the-art methods. Firstly, we +leverage image post-processing techniques such as guided filtering to refine +the output masks, improving accuracy while avoiding substantial computational +costs. Secondly, we introduce a multi-scale consistency criterion, based on a +teacher-student training scheme. This criterion matches segmentation masks +predicted from regions of the input image extracted at different resolutions to +each other. Experimental results on several benchmarks used in unsupervised +segmentation learning demonstrate the effectiveness of our proposed techniques. + +
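+ The first trick (edge-aware refinement of coarse masks) can be reproduced with
+OpenCV's guided filter, which ships in the contrib module
+(opencv-contrib-python). The image and mask below are random placeholders.
+
+import cv2
+import numpy as np
+
+image = (np.random.rand(256, 256, 3) * 255).astype(np.uint8)                 # guide image
+coarse = cv2.resize(np.random.rand(32, 32).astype(np.float32), (256, 256))   # low-res soft mask
+
+# The guide's edges sharpen the upsampled mask boundaries at little extra cost.
+refined = cv2.ximgproc.guidedFilter(image, coarse, 8, 1e-3)
+mask = (refined > 0.5).astype(np.uint8)
+print(mask.shape)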
+
+
+
+
+ + ☆ LongVLM: Efficient Long Video Understanding via Large Language Models + + +
+ Empowered by Large Language Models (LLMs), recent advancements in VideoLLMs
+have driven progress in various video understanding tasks. These models encode
+video representations through pooling or query aggregation over a vast number
+of visual tokens, making computational and memory costs affordable. Despite
+successfully providing an overall comprehension of video content, existing
+VideoLLMs still face challenges in achieving detailed understanding of videos
+due to overlooking local information in long-term videos. To tackle this
+challenge, we introduce LongVLM, a straightforward yet powerful VideoLLM for
+long video understanding, building upon the observation that long videos often
+consist of sequential key events, complex actions, and camera movements. Our
+approach proposes to decompose long videos into multiple short-term segments
+and encode local features for each local segment via a hierarchical token
+merging module. These features are concatenated in temporal order to maintain
+the storyline across sequential short-term segments. Additionally, we propose
+to integrate global semantics into each local feature to enhance context
+understanding. In this way, we encode video representations that incorporate
+both local and global information, enabling the LLM to generate comprehensive
+responses for long-term videos. Experimental results on the VideoChatGPT
+benchmark and zero-shot video question-answering datasets demonstrate the
+superior capabilities of our model over the previous state-of-the-art methods.
+Qualitative examples demonstrate that our model produces more precise responses
+for long video understanding. Code is available at
+\url{https://github.com/ziplab/LongVLM}.
+
+
+
+
+ + ☆ VF-NeRF: Viewshed Fields for Rigid NeRF Registration + + +
+ 3D scene registration is a fundamental problem in computer vision that seeks
+the best 6-DoF alignment between two scenes. This problem was extensively
+investigated in the case of point clouds and meshes, but there has been
+relatively limited work regarding Neural Radiance Fields (NeRF). In this paper,
+we consider the problem of rigid registration between two NeRFs when the
+position of the original cameras is not given. Our key novelty is the
+introduction of Viewshed Fields (VF), an implicit function that determines, for
+each 3D point, how likely it is to be viewed by the original cameras. We
+demonstrate how VF can help in the various stages of NeRF registration, with an
+extensive evaluation showing that VF-NeRF achieves SOTA results on various
+datasets with different capturing approaches such as LLFF and Objaverse.
+
+
+
+
+ + ☆ Meta Invariance Defense Towards Generalizable Robustness to Unknown + Adversarial Attacks TPAMI + + +
+ Despite providing high-performance solutions for computer vision tasks, the +deep neural network (DNN) model has been proved to be extremely vulnerable to +adversarial attacks. Current defense mainly focuses on the known attacks, but +the adversarial robustness to the unknown attacks is seriously overlooked. +Besides, commonly used adaptive learning and fine-tuning technique is +unsuitable for adversarial defense since it is essentially a zero-shot problem +when deployed. Thus, to tackle this challenge, we propose an attack-agnostic +defense method named Meta Invariance Defense (MID). Specifically, various +combinations of adversarial attacks are randomly sampled from a manually +constructed Attacker Pool to constitute different defense tasks against unknown +attacks, in which a student encoder is supervised by multi-consistency +distillation to learn the attack-invariant features via a meta principle. The +proposed MID has two merits: 1) Full distillation from pixel-, feature- and +prediction-level between benign and adversarial samples facilitates the +discovery of attack-invariance. 2) The model simultaneously achieves robustness +to the imperceptible adversarial perturbations in high-level image +classification and attack-suppression in low-level robust image regeneration. +Theoretical and empirical studies on numerous benchmarks such as ImageNet +verify the generalizable robustness and superiority of MID under various +attacks. + +
+
+ comment: Accepted by IEEE TPAMI in 2024 +
+
+
+
+
+ + ☆ DI-Retinex: Digital-Imaging Retinex Theory for Low-Light Image + Enhancement + + +
+ Many existing methods for low-light image enhancement (LLIE) based on Retinex
+theory ignore important factors that affect the validity of this theory in
+digital imaging, such as noise, quantization error, non-linearity, and dynamic
+range overflow. In this paper, we propose a new expression called
+Digital-Imaging Retinex theory (DI-Retinex) through theoretical and
+experimental analysis of Retinex theory in digital imaging. Our new expression
+includes an offset term in the enhancement model, which allows for pixel-wise
+brightness contrast adjustment with a non-linear mapping function. In addition,
+to solve the low-light enhancement problem in an unsupervised manner, we
+propose an image-adaptive masked reverse degradation loss in Gamma space. We
+also design a variance suppression loss for regulating the additional offset
+term. Extensive experiments show that our proposed method outperforms all
+existing unsupervised methods in terms of visual quality, model size, and
+speed. Our algorithm can also assist downstream face detectors in low light, as
+it shows the largest performance gain after low-light enhancement compared to
+other methods.
+
+
+
+
+ + ☆ Sparse Concept Bottleneck Models: Gumbel Tricks in Contrastive Learning + + +
+ We propose a novel architecture and method of explainable classification with
+Concept Bottleneck Models (CBMs). While SOTA approaches to the image
+classification task work as black boxes, there is a growing demand for models
+that provide interpretable results. Such models often learn to predict the
+distribution over class labels using an additional description of the target
+instances, called concepts. However, existing bottleneck methods have a number
+of limitations: their accuracy is lower than that of a standard model, and CBMs
+require an additional set of concepts to leverage. We provide a framework for
+creating a Concept Bottleneck Model from a pre-trained multi-modal encoder and
+new CLIP-like architectures. By introducing a new type of layer known as
+Concept Bottleneck Layers, we outline three methods for training them: with an
+$\ell_1$ loss, with a contrastive loss, and with a loss function based on the
+Gumbel-Softmax distribution (Sparse-CBM), while the final FC layer is still
+trained with cross-entropy. We show a significant increase in accuracy using
+sparse hidden layers in CLIP-based bottleneck models, which means that a sparse
+representation of the concept activation vector is meaningful in Concept
+Bottleneck Models. Moreover, with our Concept Matrix Search algorithm we can
+improve CLIP predictions on complex datasets without any additional training or
+fine-tuning. The code is available at: https://github.com/Andron00e/SparseCBM.
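+ A rough sketch of a Concept Bottleneck Layer trained with the Gumbel-Softmax
+trick, on top of (here: random placeholder) multi-modal image embeddings. The
+dimensions and module names are assumptions; the $\ell_1$ and contrastive
+variants, and the Concept Matrix Search algorithm, are not shown.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class GumbelConceptBottleneck(nn.Module):
+    """Image embedding -> sparse concept activations -> class logits."""
+    def __init__(self, embed_dim=512, num_concepts=128, num_classes=10, tau=1.0):
+        super().__init__()
+        self.to_concepts = nn.Linear(embed_dim, num_concepts)
+        self.classifier = nn.Linear(num_concepts, num_classes)  # trained with cross-entropy
+        self.tau = tau
+
+    def forward(self, image_emb):
+        logits = self.to_concepts(image_emb)
+        # Straight-through Gumbel-Softmax yields a (near) one-hot, hence sparse, concept vector.
+        concepts = F.gumbel_softmax(logits, tau=self.tau, hard=True)
+        return self.classifier(concepts), concepts
+
+model = GumbelConceptBottleneck()
+emb = torch.randn(4, 512)                 # stand-in for CLIP image embeddings
+class_logits, concept_acts = model(emb)
+print(class_logits.shape, concept_acts.sum(dim=-1))  # (4, 10) and all-ones sums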
+
+ comment: 23 pages, 1 algorithm, 36 figures +
+
+
+
+
+ + ☆ AdaBM: On-the-Fly Adaptive Bit Mapping for Image Super-Resolution CVPR 2024 + + +
+ Although the image super-resolution (SR) problem has experienced
+unprecedented restoration accuracy with deep neural networks, its versatile
+application is still limited by the substantial computational costs. Since
+different input images for SR face different restoration difficulties, adapting
+computational costs based on the input image, referred to as adaptive
+inference, has emerged as a promising solution to compress SR networks.
+Specifically, adapting the quantization bit-widths has successfully reduced the
+inference and memory cost without sacrificing accuracy. However, despite the
+benefits of the resultant adaptive network, existing works rely on
+time-intensive quantization-aware training with full access to the original
+training pairs to learn the appropriate bit allocation policies, which limits
+their ubiquitous usage. To this end, we introduce the first on-the-fly adaptive
+quantization framework that accelerates the processing time from hours to
+seconds. We formulate the bit allocation problem with only two bit mapping
+modules: one to map the input image to the image-wise bit adaptation factor and
+one to obtain the layer-wise adaptation factors. These bit mappings are
+calibrated and fine-tuned using only a small number of calibration images. We
+achieve competitive performance with the previous adaptive quantization
+methods, while the processing time is accelerated by x2000. Codes are available
+at https://github.com/Cheeun/AdaBM.
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Design and Development of a Framework For Stroke-Based Handwritten + Gujarati Font Generation + + +
+ Handwritten font generation is important for preserving cultural heritage and +creating personalized designs. It adds an authentic and expressive touch to +printed materials, making them visually appealing and establishing a stronger +connection with the audience. This paper aims to design a framework for +generating handwritten fonts in the Gujarati script, mimicking the variation of +human handwriting. The proposed font generation model consists of a learning +phase and a generation phase. In the learning phase, Gujarati scripts are +analyzed, and rules for designing each character are formulated. This ruleset +involves the concatenation of strokes in a stroke-based manner, ensuring visual +consistency in the resulting glyphs. The generation phase involves the user +providing a small subset of characters, and the system automatically generates +the remaining character glyphs based on extracted strokes and learned rules, +resulting in handwritten Gujarati fonts. The resulting character glyphs are +converted into an open-type font using the FontForge tool, making them +compatible with any Gujarati editor. Both subjective and objective evaluations +are conducted to assess the synthesized images and fonts. Subjective evaluation +through user studies provides feedback on quality and visual appeal, achieving +an overall accuracy of 84.84%. Notably, eleven characters demonstrated a +success ratio above 90%. Objective evaluation using an existing recognition +system achieves an overall accuracy of 84.28% in OCR evaluation. Notably, +fifteen characters had a success ratio of 80% or higher. + +
+
+ comment: 13 pages, 2 column, 12 figures +
+
+
+
+
+ + ☆ Multi Positive Contrastive Learning with Pose-Consistent Generated + Images + + +
+ Model pre-training has become essential in various recognition tasks.
+Meanwhile, with the remarkable advancements in image generation models,
+pre-training methods utilizing generated images have also emerged given their
+ability to produce unlimited training data. However, while existing methods
+utilizing generated images excel in classification, they fall short in more
+practical tasks, such as human pose estimation. In this paper, we demonstrate
+this shortcoming experimentally and propose generating visually distinct images
+with identical human poses. We then propose a novel multi-positive contrastive
+learning scheme that optimally utilizes the generated images to learn
+structural features of the human body. We term the entire learning pipeline
+GenPoCCL. Despite using less than 1% of the data required by the current
+state-of-the-art method, GenPoCCL captures structural features of the human
+body more effectively, surpassing existing methods in a variety of
+human-centric perception tasks.
+
+
+
+
+
+
+ + ☆ A dataset of primary nasopharyngeal carcinoma MRI with multi-modalities + segmentation + + +
+ Multi-modality magnetic resonance imaging data with various sequences +facilitate the early diagnosis, tumor segmentation, and disease staging in the +management of nasopharyngeal carcinoma (NPC). The lack of publicly available, +comprehensive datasets limits advancements in diagnosis, treatment planning, +and the development of machine learning algorithms for NPC. Addressing this +critical need, we introduce the first comprehensive NPC MRI dataset, +encompassing MR axial imaging of 277 primary NPC patients. This dataset +includes T1-weighted, T2-weighted, and contrast-enhanced T1-weighted sequences, +totaling 831 scans. In addition to the corresponding clinical data, manually +annotated and labeled segmentations by experienced radiologists offer +high-quality data resources from untreated primary NPC. + +
+
+
+
+
+ + ☆ Real-time Noise Source Estimation of a Camera System from an Image and + Metadata + + +
+ Autonomous machines must self-maintain proper functionality to ensure the
+safety of humans and themselves. This pertains particularly to their cameras,
+the predominant sensors used to perceive the environment and support actions. A
+fundamental camera problem addressed in this study is noise. Solutions often
+focus on denoising images a posteriori, that is, fighting symptoms rather than
+root causes. However, tackling root causes requires identifying the noise
+sources, considering the limitations of mobile platforms. This work
+investigates a real-time, memory-efficient and reliable noise source estimator
+that combines data- and physically-based models. To this end, a DNN that
+examines an image with camera metadata for major camera noise sources is built
+and trained. In addition, it quantifies unexpected factors that impact image
+noise or metadata. This study investigates seven different estimators on six
+datasets that include synthetic noise, real-world noise from two camera
+systems, and real field campaigns. For these, only the model with the most
+metadata is capable of accurately and robustly quantifying all individual noise
+contributions. This method outperforms total image noise estimators and can be
+deployed in a plug-and-play manner. It also serves as a basis for including
+more advanced noise sources, or as part of an automatic countermeasure
+feedback loop toward fully reliable machines.
+
+
+
+ comment: 16 pages, 16 figures, 12 tables, Project page: + https://github.com/MaikWischow/Noise-Source-Estimation +
+
+
+
+
+ + ☆ Learning Transferable Negative Prompts for Out-of-Distribution Detection CVPR 2024 + + +
+ Existing prompt learning methods have shown certain capabilities in +Out-of-Distribution (OOD) detection, but the lack of OOD images in the target +dataset in their training can lead to mismatches between OOD images and +In-Distribution (ID) categories, resulting in a high false positive rate. To +address this issue, we introduce a novel OOD detection method, named +'NegPrompt', to learn a set of negative prompts, each representing a negative +connotation of a given class label, for delineating the boundaries between ID +and OOD images. It learns such negative prompts with ID data only, without any +reliance on external outlier data. Further, current methods assume the +availability of samples of all ID classes, rendering them ineffective in +open-vocabulary learning scenarios where the inference stage can contain novel +ID classes not present during training. In contrast, our learned negative +prompts are transferable to novel class labels. Experiments on various ImageNet +benchmarks show that NegPrompt surpasses state-of-the-art prompt-learning-based +OOD detection methods and maintains a consistent lead in hard OOD detection in +closed- and open-vocabulary classification scenarios. Code is available at +https://github.com/mala-lab/negprompt. + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ Would Deep Generative Models Amplify Bias in Future Models? CVPR 2024 + + +
+ We investigate the impact of deep generative models on potential social +biases in upcoming computer vision models. As the internet witnesses an +increasing influx of AI-generated images, concerns arise regarding inherent +biases that may accompany them, potentially leading to the dissemination of +harmful content. This paper explores whether a detrimental feedback loop, +resulting in bias amplification, would occur if generated images were used as +the training data for future models. We conduct simulations by progressively +substituting original images in COCO and CC3M datasets with images generated +through Stable Diffusion. The modified datasets are used to train OpenCLIP and +image captioning models, which we evaluate in terms of quality and bias. +Contrary to expectations, our findings indicate that introducing generated +images during training does not uniformly amplify bias. Instead, instances of +bias mitigation across specific tasks are observed. We further explore the +factors that may influence these phenomena, such as artifacts in image +generation (e.g., blurry faces) or pre-existing biases in the original +datasets. + +
+
+ comment: This paper has been accepted to CVPR 2024 +
+
+
+
+
+ + ☆ FACTUAL: A Novel Framework for Contrastive Learning Based Robust SAR + Image Classification + + +
+ Deep Learning (DL) Models for Synthetic Aperture Radar (SAR) Automatic Target +Recognition (ATR), while delivering improved performance, have been shown to be +quite vulnerable to adversarial attacks. Existing works improve robustness by +training models on adversarial samples. However, by focusing mostly on attacks +that manipulate images randomly, they neglect the real-world feasibility of +such attacks. In this paper, we propose FACTUAL, a novel Contrastive Learning +framework for Adversarial Training and robust SAR classification. FACTUAL +consists of two components: (1) Differing from existing works, a novel +perturbation scheme that incorporates realistic physical adversarial attacks +(such as OTSA) to build a supervised adversarial pre-training network. This +network utilizes class labels for clustering clean and perturbed images +together into a more informative feature space. (2) A linear classifier +cascaded after the encoder to use the computed representations to predict the +target labels. By pre-training and fine-tuning our model on both clean and +adversarial samples, we show that our model achieves high prediction accuracy +on both cases. Our model achieves 99.7% accuracy on clean samples, and 89.6% on +perturbed samples, both outperforming previous state-of-the-art methods. + +
+
+ comment: 2024 IEEE Radar Conference +
+
+
+
+
+ + ☆ iSeg: Interactive 3D Segmentation via Interactive Attention + + +
+ We present iSeg, a new interactive technique for segmenting 3D shapes. +Previous works have focused mainly on leveraging pre-trained 2D foundation +models for 3D segmentation based on text. However, text may be insufficient for +accurately describing fine-grained spatial segmentations. Moreover, achieving a +consistent 3D segmentation using a 2D model is challenging since occluded areas +of the same semantic region may not be visible together from any 2D view. Thus, +we design a segmentation method conditioned on fine user clicks, which operates +entirely in 3D. Our system accepts user clicks directly on the shape's surface, +indicating the inclusion or exclusion of regions from the desired shape +partition. To accommodate various click settings, we propose a novel +interactive attention module capable of processing different numbers and types +of clicks, enabling the training of a single unified interactive segmentation +model. We apply iSeg to a myriad of shapes from different domains, +demonstrating its versatility and faithfulness to the user's specifications. +Our project page is at https://threedle.github.io/iSeg/. + +
+
+ comment: Project page: https://threedle.github.io/iSeg/ +
+
+
+
+
+ + ☆ LeGrad: An Explainability Method for Vision Transformers via Feature + Formation Sensitivity + + +
+ Vision Transformers (ViTs), with their ability to model long-range
+dependencies through self-attention mechanisms, have become a standard
+architecture in computer vision. However, the interpretability of these models
+remains a challenge. To address this, we propose LeGrad, an explainability
+method specifically designed for ViTs. LeGrad computes the gradient with
+respect to the attention maps of ViT layers, considering the gradient itself as
+the explainability signal. We aggregate the signal over all layers, combining
+the activations of the last as well as intermediate tokens to produce the
+merged explainability map. This makes LeGrad a conceptually simple and
+easy-to-implement tool for enhancing the transparency of ViTs. We evaluate
+LeGrad in challenging segmentation, perturbation, and open-vocabulary settings,
+showcasing its versatility compared to other state-of-the-art explainability
+methods and demonstrating its superior spatial fidelity and robustness to
+perturbations. A demo and the code are available at
+https://github.com/WalBouss/LeGrad.
+
+
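The mechanism described above, taking the gradient of a prediction with respect to attention maps and aggregating it into a relevance map, can be illustrated on a toy single-head attention block. The snippet below is a generic sketch of that idea, not the released LeGrad code; the toy model, the scalar score, and the ReLU-plus-mean aggregation are all assumptions.

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)

# Toy single-head self-attention over N tokens of dimension D.
N, D = 16, 32
tokens = torch.randn(1, N, D, requires_grad=True)
Wq, Wk, Wv = (torch.randn(D, D) * 0.1 for _ in range(3))
W_cls = torch.randn(D, 1) * 0.1                       # toy classification head

q, k, v = tokens @ Wq, tokens @ Wk, tokens @ Wv
attn = F.softmax(q @ k.transpose(-1, -2) / D ** 0.5, dim=-1)   # (1, N, N)
attn.retain_grad()                                    # keep the attention map's gradient

out = attn @ v
score = (out[:, 0] @ W_cls).sum()                     # scalar score for the first token
score.backward()

# The gradient of the score w.r.t. the attention map serves as the signal;
# here we keep positive contributions and average over query positions.
signal = attn.grad.clamp(min=0).mean(dim=1).squeeze(0)    # per-token relevance, (N,)
heatmap = signal / (signal.max() + 1e-8)
print(heatmap)
```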
+
+ comment: Code available at https://github.com/WalBouss/LeGrad +
+
+
+
+
+ + ☆ HDR Imaging for Dynamic Scenes with Events + + +
+ High dynamic range imaging (HDRI) for real-world dynamic scenes is +challenging because moving objects may lead to hybrid degradation of low +dynamic range and motion blur. Existing event-based approaches only focus on a +separate task, while cascading HDRI and motion deblurring would lead to +sub-optimal solutions, and unavailable ground-truth sharp HDR images aggravate +the predicament. To address these challenges, we propose an Event-based HDRI +framework within a Self-supervised learning paradigm, i.e., Self-EHDRI, which +generalizes HDRI performance in real-world dynamic scenarios. Specifically, a +self-supervised learning strategy is carried out by learning cross-domain +conversions from blurry LDR images to sharp LDR images, which enables sharp HDR +images to be accessible in the intermediate process even though ground-truth +sharp HDR images are missing. Then, we formulate the event-based HDRI and +motion deblurring model and conduct a unified network to recover the +intermediate sharp HDR results, where both the high dynamic range and high +temporal resolution of events are leveraged simultaneously for compensation. We +construct large-scale synthetic and real-world datasets to evaluate the +effectiveness of our method. Comprehensive experiments demonstrate that the +proposed Self-EHDRI outperforms state-of-the-art approaches by a large margin. +The codes, datasets, and results are available at +https://lxp-whu.github.io/Self-EHDRI. + +
+
+
+
+
+ + ☆ OmniGS: Omnidirectional Gaussian Splatting for Fast Radiance Field + Reconstruction using Omnidirectional Images IROS 2024 + + +
+ Photorealistic reconstruction relying on 3D Gaussian Splatting has shown +promising potential in robotics. However, the current 3D Gaussian Splatting +system only supports radiance field reconstruction using undistorted +perspective images. In this paper, we present OmniGS, a novel omnidirectional +Gaussian splatting system, to take advantage of omnidirectional images for fast +radiance field reconstruction. Specifically, we conduct a theoretical analysis +of spherical camera model derivatives in 3D Gaussian Splatting. According to +the derivatives, we then implement a new GPU-accelerated omnidirectional +rasterizer that directly splats 3D Gaussians onto the equirectangular screen +space for omnidirectional image rendering. As a result, we realize +differentiable optimization of the radiance field without the requirement of +cube-map rectification or tangent-plane approximation. Extensive experiments +conducted in egocentric and roaming scenarios demonstrate that our method +achieves state-of-the-art reconstruction quality and high rendering speed using +omnidirectional images. To benefit the research community, the code will be +made publicly available once the paper is published. + +
+
+ comment: IROS 2024 submission, 7 pages, 4 figures +
+
+
+
+
+ + ☆ Future-Proofing Class Incremental Learning + + +
+ Exemplar-Free Class Incremental Learning is a highly challenging setting
+where replay memory is unavailable. Methods relying on frozen feature
+extractors have drawn attention recently in this setting due to their
+impressive performance and lower computational costs. However, those methods
+are highly dependent on the data used to train the feature extractor and may
+struggle when an insufficient number of classes is available during the first
+incremental step. To overcome this limitation, we propose to use a pre-trained
+text-to-image diffusion model in order to generate synthetic images of future
+classes and use them to train the feature extractor. Experiments on the
+standard benchmarks CIFAR100 and ImageNet-Subset demonstrate that our proposed
+method can be used to improve state-of-the-art methods for exemplar-free class
+incremental learning, especially in the most difficult settings where the first
+incremental step contains only a few classes. Moreover, we show that using
+synthetic samples of future classes achieves higher performance than using real
+data from different classes, paving the way for better and less costly
+pre-training methods for incremental learning.
+
+
+
+
+
+
+ + ☆ CORP: A Multi-Modal Dataset for Campus-Oriented Roadside Perception + Tasks + + +
+ Numerous roadside perception datasets have been introduced to propel
+advancements in autonomous driving and intelligent transportation systems
+research and development. However, the majority of them concentrate on urban
+arterial roads, inadvertently overlooking residential areas such as parks and
+campuses that exhibit entirely distinct characteristics. In light of this gap,
+we propose CORP, which stands as the first public benchmark dataset tailored
+for multi-modal roadside perception tasks under campus scenarios. Collected on
+a university campus, CORP consists of over 205k images plus 102k point clouds
+captured from 18 cameras and 9 LiDAR sensors. These sensors with different
+configurations are mounted on roadside utility poles to provide diverse
+viewpoints within the campus region. The annotations of CORP encompass
+multi-dimensional information beyond 2D and 3D bounding boxes, providing extra
+support for seamless 3D tracking and instance segmentation with unique IDs and
+pixel masks for identifying targets, to enhance the understanding of objects
+and their behaviors distributed across the campus premises. Unlike other
+roadside datasets about urban traffic, CORP extends the spectrum to highlight
+the challenges for multi-modal perception in campuses and other residential
+areas.
+
+
+
+
+
+
+ + ☆ Adaptive Discrete Disparity Volume for Self-supervised Monocular Depth + Estimation + + +
+ In self-supervised monocular depth estimation tasks, discrete disparity +prediction has been proven to attain higher quality depth maps than common +continuous methods. However, current discretization strategies often divide +depth ranges of scenes into bins in a handcrafted and rigid manner, limiting +model performance. In this paper, we propose a learnable module, Adaptive +Discrete Disparity Volume (ADDV), which is capable of dynamically sensing depth +distributions in different RGB images and generating adaptive bins for them. +Without any extra supervision, this module can be integrated into existing CNN +architectures, allowing networks to produce representative values for bins and +a probability volume over them. Furthermore, we introduce novel training +strategies - uniformizing and sharpening - through a loss term and temperature +parameter, respectively, to provide regularizations under self-supervised +conditions, preventing model degradation or collapse. Empirical results +demonstrate that ADDV effectively processes global information, generating +appropriate bins for various scenes and producing higher quality depth maps +compared to handcrafted methods. + +
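The adaptive-bin formulation above ultimately produces a set of per-image bin values and a probability volume over them; a depth (or disparity) estimate is then commonly decoded as the probability-weighted sum of the bin values. The following is a minimal numeric illustration of that generic decoding step, not the ADDV module itself; the tensor shapes and random stand-in outputs are assumptions.

```python
import torch

B, K, H, W = 2, 8, 4, 4                     # batch, number of bins, spatial size

# Stand-ins for network outputs: per-image bin centres and per-pixel bin logits.
bin_centers = torch.sort(torch.rand(B, K) * 10.0, dim=1).values   # (B, K), in metres
logits = torch.randn(B, K, H, W)

prob_volume = torch.softmax(logits, dim=1)                         # (B, K, H, W)
# Expected depth: sum_k p_k(x) * bin_k, broadcasting the bins over the image.
depth = (prob_volume * bin_centers.view(B, K, 1, 1)).sum(dim=1)    # (B, H, W)

print(depth.shape, depth.min().item(), depth.max().item())
```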
+
+
+
+
+ + ☆ Classification of Nasopharyngeal Cases using DenseNet Deep Learning + Architecture + + +
+ Nasopharyngeal carcinoma (NPC) is one of the understudied yet deadliest +cancers in South East Asia. In Malaysia, the prevalence is identified mainly in +Sarawak, among the ethnic of Bidayuh. NPC is often late-diagnosed because it is +asymptomatic at the early stage. There are several tissue representations from +the nasopharynx biopsy, such as nasopharyngeal inflammation (NPI), lymphoid +hyperplasia (LHP), nasopharyngeal carcinoma (NPC) and normal tissue. This paper +is our first initiative to identify the difference between NPC, NPI and normal +cases. Seven whole slide images (WSIs) with gigapixel resolutions from seven +different patients and two hospitals were experimented with using two test +setups, consisting of a different set of images. The tissue regions are patched +into smaller blocks and classified using DenseNet architecture with 21 dense +layers. Two tests are carried out, each for proof of concept (Test 1) and +real-test scenario (Test 2). The accuracy achieved for NPC class is 94.8% for +Test 1 and 67.0% for Test 2. + +
+
+ comment: This article has been accepted in the Journal of Engineering Science + and Technology (JESTEC) and awaiting publication +
+
+
+
+
+ + ☆ AGL-NET: Aerial-Ground Cross-Modal Global Localization with Varying + Scales + + +
+ We present AGL-NET, a novel learning-based method for global localization +using LiDAR point clouds and satellite maps. AGL-NET tackles two critical +challenges: bridging the representation gap between image and points modalities +for robust feature matching, and handling inherent scale discrepancies between +global view and local view. To address these challenges, AGL-NET leverages a +unified network architecture with a novel two-stage matching design. The first +stage extracts informative neural features directly from raw sensor data and +performs initial feature matching. The second stage refines this matching +process by extracting informative skeleton features and incorporating a novel +scale alignment step to rectify scale variations between LiDAR and map data. +Furthermore, a novel scale and skeleton loss function guides the network toward +learning scale-invariant feature representations, eliminating the need for +pre-processing satellite maps. This significantly improves real-world +applicability in scenarios with unknown map scales. To facilitate rigorous +performance evaluation, we introduce a meticulously designed dataset within the +CARLA simulator specifically tailored for metric localization training and +assessment. The code and dataset will be made publicly available. + +
+
+
+
+
+ + ☆ BodyMAP -- Jointly Predicting Body Mesh and 3D Applied Pressure Map for + People in Bed CVPR 2024 + + +
+ Accurately predicting the 3D human posture and the pressure exerted on the +body for people resting in bed, visualized as a body mesh (3D pose & shape) +with a 3D pressure map, holds significant promise for healthcare applications, +particularly, in the prevention of pressure ulcers. Current methods focus on +singular facets of the problem -- predicting only 2D/3D poses, generating 2D +pressure images, predicting pressure only for certain body regions instead of +the full body, or forming indirect approximations to the 3D pressure map. In +contrast, we introduce BodyMAP, which jointly predicts the human body mesh and +3D applied pressure map across the entire human body. Our network leverages +multiple visual modalities, incorporating both a depth image of a person in bed +and its corresponding 2D pressure image acquired from a pressure-sensing +mattress. The 3D pressure map is represented as a pressure value at each mesh +vertex and thus allows for precise localization of high-pressure regions on the +body. Additionally, we present BodyMAP-WS, a new formulation of pressure +prediction in which we implicitly learn pressure in 3D by aligning sensed 2D +pressure images with a differentiable 2D projection of the predicted 3D +pressure maps. In evaluations with real-world human data, our method +outperforms the current state-of-the-art technique by 25% on both body mesh and +3D applied pressure map prediction tasks for people in bed. + +
+
+ comment: Accepted at CVPR 2024 Project Website: https://bodymap3d.github.io/ + Code: https://github.com/RCHI-Lab/BodyMAP +
+
+
+
+
+ + ☆ MonoCD: Monocular 3D Object Detection with Complementary Depths CVPR 2024 + + +
+ Monocular 3D object detection has attracted widespread attention due to its +potential to accurately obtain object 3D localization from a single image at a +low cost. Depth estimation is an essential but challenging subtask of monocular +3D object detection due to the ill-posedness of 2D to 3D mapping. Many methods +explore multiple local depth clues such as object heights and keypoints and +then formulate the object depth estimation as an ensemble of multiple depth +predictions to mitigate the insufficiency of single-depth information. However, +the errors of existing multiple depths tend to have the same sign, which +hinders them from neutralizing each other and limits the overall accuracy of +combined depth. To alleviate this problem, we propose to increase the +complementarity of depths with two novel designs. First, we add a new depth +prediction branch named complementary depth that utilizes global and efficient +depth clues from the entire image rather than the local clues to reduce the +correlation of depth predictions. Second, we propose to fully exploit the +geometric relations between multiple depth clues to achieve complementarity in +form. Benefiting from these designs, our method achieves higher +complementarity. Experiments on the KITTI benchmark demonstrate that our method +achieves state-of-the-art performance without introducing extra data. In +addition, complementary depth can also be a lightweight and plug-and-play +module to boost multiple existing monocular 3d object detectors. Code is +available at https://github.com/elvintanhust/MonoCD. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + UniAV: Unified Audio-Visual Perception for Multi-Task Video Localization + + +
+ Video localization tasks aim to temporally locate specific instances in +videos, including temporal action localization (TAL), sound event detection +(SED) and audio-visual event localization (AVEL). Existing methods +over-specialize on each task, overlooking the fact that these instances often +occur in the same video to form the complete video content. In this work, we +present UniAV, a Unified Audio-Visual perception network, to achieve joint +learning of TAL, SED and AVEL tasks for the first time. UniAV can leverage +diverse data available in task-specific datasets, allowing the model to learn +and share mutually beneficial knowledge across tasks and modalities. To tackle +the challenges posed by substantial variations in datasets +(size/domain/duration) and distinct task characteristics, we propose to +uniformly encode visual and audio modalities of all videos to derive generic +representations, while also designing task-specific experts to capture unique +knowledge for each task. Besides, we develop a unified language-aware +classifier by utilizing a pre-trained text encoder, enabling the model to +flexibly detect various types of instances and previously unseen ones by simply +changing prompts during inference. UniAV outperforms its single-task +counterparts by a large margin with fewer parameters, achieving on-par or +superior performances compared to state-of-the-art task-specific methods across +ActivityNet 1.3, DESED and UnAV-100 benchmarks. + +
+
+
+
+
+ + ☆ BioVL-QR: Egocentric Biochemical Video-and-Language Dataset Using Micro + QR Codes + + +
+ This paper introduces a biochemical vision-and-language dataset, which
+consists of 24 egocentric experiment videos, corresponding protocols, and
+video-and-language alignments. The key challenge in the wet-lab domain is that
+detecting equipment, reagents, and containers is difficult because the lab
+environment is cluttered with objects on the table and some objects are
+indistinguishable. Therefore, previous studies assume that objects are manually
+annotated and given for downstream tasks, but this is costly and
+time-consuming. To address this issue, this study focuses on Micro QR Codes to
+detect objects automatically. From our preliminary study, we found that
+detecting objects using only Micro QR Codes is still difficult because the
+researchers manipulate objects, which frequently causes blur and occlusion. To
+address this, we also propose a novel object labeling method that combines a
+Micro QR Code detector and an off-the-shelf hand object detector. As one
+application of our dataset, we conduct the task of generating protocols from
+experiment videos and find that our approach can generate accurate protocols.
+
+
+
+ comment: 6 pages +
+
+
+
+
+ + ☆ HandDiff: 3D Hand Pose Estimation with Diffusion on Image-Point Cloud + + +
+ Extracting keypoint locations from input hand frames, known as 3D hand pose +estimation, is a critical task in various human-computer interaction +applications. Essentially, the 3D hand pose estimation can be regarded as a 3D +point subset generative problem conditioned on input frames. Thanks to the +recent significant progress on diffusion-based generative models, hand pose +estimation can also benefit from the diffusion model to estimate keypoint +locations with high quality. However, directly deploying the existing diffusion +models to solve hand pose estimation is non-trivial, since they cannot achieve +the complex permutation mapping and precise localization. Based on this +motivation, this paper proposes HandDiff, a diffusion-based hand pose +estimation model that iteratively denoises accurate hand pose conditioned on +hand-shaped image-point clouds. In order to recover keypoint permutation and +accurate location, we further introduce joint-wise condition and local detail +condition. Experimental results demonstrate that the proposed HandDiff +significantly outperforms the existing approaches on four challenging hand pose +benchmark datasets. Codes and pre-trained models are publicly available at +https://github.com/cwc1260/HandDiff. + +
+
+ comment: Accepted as a conference paper to the Conference on Computer Vision + and Pattern Recognition (2024) +
+
+
+
+
+ + ☆ DreamWalk: Style Space Exploration using Diffusion Guidance + + +
+ Text-conditioned diffusion models can generate impressive images, but fall +short when it comes to fine-grained control. Unlike direct-editing tools like +Photoshop, text conditioned models require the artist to perform "prompt +engineering," constructing special text sentences to control the style or +amount of a particular subject present in the output image. Our goal is to +provide fine-grained control over the style and substance specified by the +prompt, for example to adjust the intensity of styles in different regions of +the image (Figure 1). Our approach is to decompose the text prompt into +conceptual elements, and apply a separate guidance term for each element in a +single diffusion process. We introduce guidance scale functions to control when +in the diffusion process and \emph{where} in the image to intervene. Since the +method is based solely on adjusting diffusion guidance, it does not require +fine-tuning or manipulating the internal layers of the diffusion model's neural +network, and can be used in conjunction with LoRA- or DreamBooth-trained models +(Figure2). Project page: https://mshu1.github.io/dreamwalk.github.io/ + +
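The abstract describes decomposing a prompt into conceptual elements and applying a separate guidance term for each element within a single diffusion process, with scale functions controlling when and where each term acts. The sketch below shows only the guidance-combination arithmetic on dummy noise predictions; the particular weighting function, masks, and tensors are illustrative assumptions, not DreamWalk's actual implementation.

```python
import torch

H = W = 32                                    # latent resolution of a toy model

def guidance_scale(t, element_idx, h=H, w=W):
    """Toy scale function: element 0 is emphasised early in the diffusion and on
    the left half of the image, element 1 late and on the right half."""
    time_weight = (1.0 - t) if element_idx == 0 else t
    mask = torch.zeros(1, 1, h, w)
    if element_idx == 0:
        mask[..., : w // 2] = 1.0
    else:
        mask[..., w // 2 :] = 1.0
    return 7.5 * time_weight * mask            # classifier-free-guidance-like scale

# Stand-ins for the denoiser's noise predictions at one timestep.
eps_uncond = torch.randn(1, 4, H, W)
eps_elements = [torch.randn(1, 4, H, W) for _ in range(2)]  # one per prompt element

t = 0.3                                        # normalised diffusion time in [0, 1]
eps = eps_uncond.clone()
for k, eps_k in enumerate(eps_elements):
    eps = eps + guidance_scale(t, k) * (eps_k - eps_uncond)

print(eps.shape)                               # combined guided noise prediction
```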
+
+
+
+
+ + ☆ Diverse and Tailored Image Generation for Zero-shot Multi-label + Classification + + +
+ Recently, zero-shot multi-label classification has garnered considerable +attention for its capacity to operate predictions on unseen labels without +human annotations. Nevertheless, prevailing approaches often use seen classes +as imperfect proxies for unseen ones, resulting in suboptimal performance. +Drawing inspiration from the success of text-to-image generation models in +producing realistic images, we propose an innovative solution: generating +synthetic data to construct a training set explicitly tailored for proxyless +training on unseen labels. Our approach introduces a novel image generation +framework that produces multi-label synthetic images of unseen classes for +classifier training. To enhance diversity in the generated images, we leverage +a pre-trained large language model to generate diverse prompts. Employing a +pre-trained multi-modal CLIP model as a discriminator, we assess whether the +generated images accurately represent the target classes. This enables +automatic filtering of inaccurately generated images, preserving classifier +accuracy. To refine text prompts for more precise and effective multi-label +object generation, we introduce a CLIP score-based discriminative loss to +fine-tune the text encoder in the diffusion model. Additionally, to enhance +visual features on the target task while maintaining the generalization of +original features and mitigating catastrophic forgetting resulting from +fine-tuning the entire visual encoder, we propose a feature fusion module +inspired by transformer attention mechanisms. This module aids in capturing +global dependencies between multiple objects more effectively. Extensive +experimental results validate the effectiveness of our approach, demonstrating +significant improvements over state-of-the-art methods. + +
+
+
+
+
+ + ☆ Discontinuity-preserving Normal Integration with Auxiliary Edges CVPR 2024 + + +
+ Many surface reconstruction methods incorporate normal integration, which is +a process to obtain a depth map from surface gradients. In this process, the +input may represent a surface with discontinuities, e.g., due to +self-occlusion. To reconstruct an accurate depth map from the input normal map, +hidden surface gradients occurring from the jumps must be handled. To model +these jumps correctly, we design a novel discretization scheme for the domain +of normal integration. Our key idea is to introduce auxiliary edges, which +bridge between piecewise-smooth patches in the domain so that the magnitude of +hidden jumps can be explicitly expressed. Using the auxiliary edges, we design +a novel algorithm to optimize the discontinuity and the depth map from the +input normal map. Our method optimizes discontinuities by using a combination +of iterative re-weighted least squares and iterative filtering of the jump +magnitudes on auxiliary edges to provide strong sparsity regularization. +Compared to previous discontinuity-preserving normal integration methods, which +model the magnitudes of jumps only implicitly, our method reconstructs subtle +discontinuities accurately thanks to our explicit representation of jumps +allowing for strong sparsity regularization. + +
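Normal integration, as referenced above, recovers a depth map whose finite-difference gradients match the gradients implied by the input normals; in the smooth, discontinuity-free case this reduces to a linear least-squares problem. Below is a bare-bones sketch of that classical baseline on a tiny synthetic grid; it deliberately ignores the paper's auxiliary edges and discontinuity handling, and the grid, spacing, and anchor are assumptions.

```python
import numpy as np
from scipy.sparse import lil_matrix
from scipy.sparse.linalg import lsqr

H, W = 16, 16
h = 0.1                                     # grid spacing
ys, xs = np.mgrid[0:H, 0:W] * h
p, q = 2 * xs, 2 * ys                       # analytic gradients of z = x^2 + y^2

# Forward-difference constraints z[y, x+1] - z[y, x] = h * p and
# z[y+1, x] - z[y, x] = h * q, plus one anchor fixing z[0, 0] = 0.
idx = lambda y, x: y * W + x
A = lil_matrix((2 * H * W + 1, H * W))
b, r = [], 0
for y in range(H):
    for x in range(W):
        if x + 1 < W:
            A[r, idx(y, x + 1)], A[r, idx(y, x)] = 1.0, -1.0
            b.append(h * p[y, x]); r += 1
        if y + 1 < H:
            A[r, idx(y + 1, x)], A[r, idx(y, x)] = 1.0, -1.0
            b.append(h * q[y, x]); r += 1
A[r, idx(0, 0)] = 1.0
b.append(0.0); r += 1

z = lsqr(A.tocsr()[:r], np.array(b))[0].reshape(H, W)
print("mean abs reconstruction error:", np.abs(z - (xs ** 2 + ys ** 2)).mean())
```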
+
+ comment: To appear at CVPR 2024. For supplementary video, see + https://youtu.be/MTTcW5kAOFE +
+
+
+
+
+ + ☆ GaSpCT: Gaussian Splatting for Novel CT Projection View Synthesis + + +
+ We present GaSpCT, a novel view synthesis and 3D scene representation method +used to generate novel projection views for Computer Tomography (CT) scans. We +adapt the Gaussian Splatting framework to enable novel view synthesis in CT +based on limited sets of 2D image projections and without the need for +Structure from Motion (SfM) methodologies. Therefore, we reduce the total +scanning duration and the amount of radiation dose the patient receives during +the scan. We adapted the loss function to our use-case by encouraging a +stronger background and foreground distinction using two sparsity promoting +regularizers: a beta loss and a total variation (TV) loss. Finally, we +initialize the Gaussian locations across the 3D space using a uniform prior +distribution of where the brain's positioning would be expected to be within +the field of view. We evaluate the performance of our model using brain CT +scans from the Parkinson's Progression Markers Initiative (PPMI) dataset and +demonstrate that the rendered novel views closely match the original projection +views of the simulated scan, and have better performance than other implicit 3D +scene representations methodologies. Furthermore, we empirically observe +reduced training time compared to neural network based image synthesis for +sparse-view CT image reconstruction. Finally, the memory requirements of the +Gaussian Splatting representations are reduced by 17% compared to the +equivalent voxel grid image representations. + +
+
+ comment: Under Review Process for MICCAI 2024 +
+
+
+
+
+ + ☆ PARIS3D: Reasoning-based 3D Part Segmentation Using Large Multimodal + Model + + +
+ Recent advancements in 3D perception systems have significantly improved +their ability to perform visual recognition tasks such as segmentation. +However, these systems still heavily rely on explicit human instruction to +identify target objects or categories, lacking the capability to actively +reason and comprehend implicit user intentions. We introduce a novel +segmentation task known as reasoning part segmentation for 3D objects, aiming +to output a segmentation mask based on complex and implicit textual queries +about specific parts of a 3D object. To facilitate evaluation and benchmarking, +we present a large 3D dataset comprising over 60k instructions paired with +corresponding ground-truth part segmentation annotations specifically curated +for reasoning-based 3D part segmentation. We propose a model that is capable of +segmenting parts of 3D objects based on implicit textual queries and generating +natural language explanations corresponding to 3D object segmentation requests. +Experiments show that our method achieves competitive performance to models +that use explicit queries, with the additional abilities to identify part +concepts, reason about them, and complement them with world knowledge. Our +source code, dataset, and trained models are available at +https://github.com/AmrinKareem/PARIS3D. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ SleepVST: Sleep Staging from Near-Infrared Video Signals using + Pre-Trained Transformers CVPR 2024 + + +
+ Advances in camera-based physiological monitoring have enabled the robust, +non-contact measurement of respiration and the cardiac pulse, which are known +to be indicative of the sleep stage. This has led to research into camera-based +sleep monitoring as a promising alternative to "gold-standard" polysomnography, +which is cumbersome, expensive to administer, and hence unsuitable for +longer-term clinical studies. In this paper, we introduce SleepVST, a +transformer model which enables state-of-the-art performance in camera-based +sleep stage classification (sleep staging). After pre-training on contact +sensor data, SleepVST outperforms existing methods for cardio-respiratory sleep +staging on the SHHS and MESA datasets, achieving total Cohen's kappa scores of +0.75 and 0.77 respectively. We then show that SleepVST can be successfully +transferred to cardio-respiratory waveforms extracted from video, enabling +fully contact-free sleep staging. Using a video dataset of 50 nights, we +achieve a total accuracy of 78.8\% and a Cohen's $\kappa$ of 0.71 in four-class +video-based sleep staging, setting a new state-of-the-art in the domain. + +
+
+ comment: CVPR 2024 Highlight Paper +
+
+
+
+
+ + ☆ Effective Lymph Nodes Detection in CT Scans Using Location Debiased + Query Selection and Contrastive Query Representation in Transformer + + +
+ Lymph node (LN) assessment is a critical, indispensable yet very challenging +task in the routine clinical workflow of radiology and oncology. Accurate LN +analysis is essential for cancer diagnosis, staging, and treatment planning. +Finding scatteredly distributed, low-contrast clinically relevant LNs in 3D CT +is difficult even for experienced physicians under high inter-observer +variations. Previous automatic LN detection works typically yield limited +recall and high false positives (FPs) due to adjacent anatomies with similar +image intensities, shapes, or textures (vessels, muscles, esophagus, etc). In +this work, we propose a new LN DEtection TRansformer, named LN-DETR, to achieve +more accurate performance. By enhancing the 2D backbone with a multi-scale 2.5D +feature fusion to incorporate 3D context explicitly, more importantly, we make +two main contributions to improve the representation quality of LN queries. 1) +Considering that LN boundaries are often unclear, an IoU prediction head and a +location debiased query selection are proposed to select LN queries of higher +localization accuracy as the decoder query's initialization. 2) To reduce FPs, +query contrastive learning is employed to explicitly reinforce LN queries +towards their best-matched ground-truth queries over unmatched query +predictions. Trained and tested on 3D CT scans of 1067 patients (with 10,000+ +labeled LNs) via combining seven LN datasets from different body parts (neck, +chest, and abdomen) and pathologies/cancers, our method significantly improves +the performance of previous leading methods by > 4-5% average recall at the +same FP rates in both internal and external testing. We further evaluate on the +universal lesion detection task using NIH DeepLesion benchmark, and our method +achieves the top performance of 88.46% averaged recall across 0.5 to 4 FPs per +image, compared with other leading reported results. + +
+
+ comment: Technical report +
+
+
+
+
+ + ☆ Language-Guided Instance-Aware Domain-Adaptive Panoptic Segmentation + + +
+ The increasing relevance of panoptic segmentation is tied to the advancements +in autonomous driving and AR/VR applications. However, the deployment of such +models has been limited due to the expensive nature of dense data annotation, +giving rise to unsupervised domain adaptation (UDA). A key challenge in +panoptic UDA is reducing the domain gap between a labeled source and an +unlabeled target domain while harmonizing the subtasks of semantic and instance +segmentation to limit catastrophic interference. While considerable progress +has been achieved, existing approaches mainly focus on the adaptation of +semantic segmentation. In this work, we focus on incorporating instance-level +adaptation via a novel instance-aware cross-domain mixing strategy IMix. IMix +significantly enhances the panoptic quality by improving instance segmentation +performance. Specifically, we propose inserting high-confidence predicted +instances from the target domain onto source images, retaining the +exhaustiveness of the resulting pseudo-labels while reducing the injected +confirmation bias. Nevertheless, such an enhancement comes at the cost of +degraded semantic performance, attributed to catastrophic forgetting. To +mitigate this issue, we regularize our semantic branch by employing CLIP-based +domain alignment (CDA), exploiting the domain-robustness of natural language +prompts. Finally, we present an end-to-end model incorporating these two +mechanisms called LIDAPS, achieving state-of-the-art results on all popular +panoptic UDA benchmarks. + +
+
+
+
+
+ + ☆ Quantifying Uncertainty in Motion Prediction with Variational Bayesian + Mixture CVPR 2024 + + +
+ Safety and robustness are crucial factors in developing trustworthy +autonomous vehicles. One essential aspect of addressing these factors is to +equip vehicles with the capability to predict future trajectories for all +moving objects in the surroundings and quantify prediction uncertainties. In +this paper, we propose the Sequential Neural Variational Agent (SeNeVA), a +generative model that describes the distribution of future trajectories for a +single moving object. Our approach can distinguish Out-of-Distribution data +while quantifying uncertainty and achieving competitive performance compared to +state-of-the-art methods on the Argoverse 2 and INTERACTION datasets. +Specifically, a 0.446 meters minimum Final Displacement Error, a 0.203 meters +minimum Average Displacement Error, and a 5.35% Miss Rate are achieved on the +INTERACTION test set. Extensive qualitative and quantitative analysis is also +provided to evaluate the proposed model. Our open-source code is available at +https://github.com/PurdueDigitalTwin/seneva. + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ Layerwise Early Stopping for Test Time Adaptation + + +
+ Test Time Adaptation (TTA) addresses the problem of distribution shift by +enabling pretrained models to learn new features on an unseen domain at test +time. However, it poses a significant challenge to maintain a balance between +learning new features and retaining useful pretrained features. In this paper, +we propose Layerwise EArly STopping (LEAST) for TTA to address this problem. +The key idea is to stop adapting individual layers during TTA if the features +being learned do not appear beneficial for the new domain. For that purpose, we +propose using a novel gradient-based metric to measure the relevance of the +current learnt features to the new domain without the need for supervised +labels. More specifically, we propose to use this metric to determine +dynamically when to stop updating each layer during TTA. This enables a more +balanced adaptation, restricted to layers benefiting from it, and only for a +certain number of steps. Such an approach also has the added effect of limiting +the forgetting of pretrained features useful for dealing with new domains. +Through extensive experiments, we demonstrate that Layerwise Early Stopping +improves the performance of existing TTA approaches across multiple datasets, +domain shifts, model architectures, and TTA losses. + +
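The core mechanism above is deciding, per layer and without labels, when to stop updating that layer during test-time adaptation. The sketch below freezes layers whose update signal drops below a threshold, using the gradient norm of an entropy loss as a stand-in relevance score; the actual gradient-based metric, threshold, and model in the paper are not reproduced here, so treat every number and name as an assumption.

```python
import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 32),
                      nn.ReLU(), nn.Linear(32, 10))
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
frozen = set()                        # indices of layers whose adaptation was stopped
THRESH = 1e-3

def entropy_loss(logits):
    p = torch.softmax(logits, dim=1)
    return -(p * torch.log(p + 1e-8)).sum(dim=1).mean()

for step in range(20):                # stream of unlabeled test batches
    if not any(p.requires_grad for p in model.parameters()):
        break                         # every layer has been stopped
    x = torch.randn(64, 16)
    loss = entropy_loss(model(x))
    optimizer.zero_grad()
    loss.backward()

    for i, module in enumerate(model):
        params = list(module.parameters())
        if not params or i in frozen:
            continue
        # Stand-in relevance score: mean gradient norm of this layer's parameters.
        score = sum(p.grad.norm() for p in params if p.grad is not None) / len(params)
        if score < THRESH:
            frozen.add(i)
            for p in params:
                p.requires_grad_(False)
                p.grad = None         # ensure the optimizer no longer updates it
    optimizer.step()

print("layers stopped early:", sorted(frozen))
```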
+
+ comment: 14 pages, 5 figures +
+
+
+
+
+ + ☆ Flattening the Parent Bias: Hierarchical Semantic Segmentation in the + Poincaré Ball + + +
+ Hierarchy is a natural representation of semantic taxonomies, including the +ones routinely used in image segmentation. Indeed, recent work on semantic +segmentation reports improved accuracy from supervised training leveraging +hierarchical label structures. Encouraged by these results, we revisit the +fundamental assumptions behind that work. We postulate and then empirically +verify that the reasons for the observed improvement in segmentation accuracy +may be entirely unrelated to the use of the semantic hierarchy. To demonstrate +this, we design a range of cross-domain experiments with a representative +hierarchical approach. We find that on the new testing domains, a flat +(non-hierarchical) segmentation network, in which the parents are inferred from +the children, has superior segmentation accuracy to the hierarchical approach +across the board. Complementing these findings and inspired by the intrinsic +properties of hyperbolic spaces, we study a more principled approach to +hierarchical segmentation using the Poincar\'e ball model. The hyperbolic +representation largely outperforms the previous (Euclidean) hierarchical +approach as well and is on par with our flat Euclidean baseline in terms of +segmentation accuracy. However, it additionally exhibits surprisingly strong +calibration quality of the parent nodes in the semantic hierarchy, especially +on the more challenging domains. Our combined analysis suggests that the +established practice of hierarchical segmentation may be limited to in-domain +settings, whereas flat classifiers generalize substantially better, especially +if they are modeled in the hyperbolic space. + +
+
+
+
+
+ + ☆ Data Science for Geographic Information Systems + + +
+ The integration of data science into Geographic Information Systems (GIS) has +facilitated the evolution of these tools into complete spatial analysis +platforms. The adoption of machine learning and big data techniques has +equipped these platforms with the capacity to handle larger amounts of +increasingly complex data, transcending the limitations of more traditional +approaches. This work traces the historical and technical evolution of data +science and GIS as fields of study, highlighting the critical points of +convergence between domains, and underlining the many sectors that rely on this +integration. A GIS application is presented as a case study in the disaster +management sector where we utilize aerial data from Tr\'oia, Portugal, to +emphasize the process of insight extraction from raw data. We conclude by +outlining prospects for future research in integration of these fields in +general, and the developed application in particular. + +
+
+
+
+
+ + ☆ Test Time Training for Industrial Anomaly Segmentation CVPR + + +
+ Anomaly Detection and Segmentation (AD&S) is crucial for industrial quality +control. While existing methods excel in generating anomaly scores for each +pixel, practical applications require producing a binary segmentation to +identify anomalies. Due to the absence of labeled anomalies in many real +scenarios, standard practices binarize these maps based on some statistics +derived from a validation set containing only nominal samples, resulting in +poor segmentation performance. This paper addresses this problem by proposing a +test time training strategy to improve the segmentation performance. Indeed, at +test time, we can extract rich features directly from anomalous samples to +train a classifier that can discriminate defects effectively. Our general +approach can work downstream to any AD&S method that provides an anomaly score +map as output, even in multimodal settings. We demonstrate the effectiveness of +our approach over baselines through extensive experimentation and evaluation on +MVTec AD and MVTec 3D-AD. + +
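The strategy above amounts to harvesting confident pixels from the anomaly score map of the test sample itself and fitting a small classifier on their features to produce the final binary segmentation. The following is a heavily simplified, generic sketch of that loop on synthetic data; the quantile thresholds, the feature map, and the logistic-regression classifier are all assumptions rather than the paper's procedure.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
H, W, C = 64, 64, 8

# Stand-ins for the outputs of an upstream AD&S method on one test image.
features = rng.normal(size=(H, W, C))            # per-pixel feature map
anomaly_score = rng.random((H, W))               # per-pixel anomaly score map
anomaly_score[20:30, 20:30] += 1.0               # synthetic defect region
features[20:30, 20:30, 0] += 2.0                 # defect pixels differ in features

# Pseudo-label the most and least anomalous pixels, then fit a tiny classifier.
flat_feats = features.reshape(-1, C)
flat_scores = anomaly_score.ravel()
pos = flat_scores >= np.quantile(flat_scores, 0.98)   # confident anomalies
neg = flat_scores <= np.quantile(flat_scores, 0.50)   # confident normal pixels

X = np.vstack([flat_feats[pos], flat_feats[neg]])
y = np.concatenate([np.ones(pos.sum()), np.zeros(neg.sum())])
clf = LogisticRegression(max_iter=1000).fit(X, y)

# The refined binary segmentation comes from the classifier, not a fixed threshold.
mask = clf.predict(flat_feats).reshape(H, W).astype(bool)
print("predicted anomalous pixels:", int(mask.sum()))
```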
+
+ comment: Accepted at VAND 2.0, CVPRW 2024 +
+
+
+
+
+ + ☆ SC4D: Sparse-Controlled Video-to-4D Generation and Motion Transfer + + +
+ Recent advances in 2D/3D generative models enable the generation of dynamic +3D objects from a single-view video. Existing approaches utilize score +distillation sampling to form the dynamic scene as dynamic NeRF or dense 3D +Gaussians. However, these methods struggle to strike a balance among reference +view alignment, spatio-temporal consistency, and motion fidelity under +single-view conditions due to the implicit nature of NeRF or the intricate +dense Gaussian motion prediction. To address these issues, this paper proposes +an efficient, sparse-controlled video-to-4D framework named SC4D, that +decouples motion and appearance to achieve superior video-to-4D generation. +Moreover, we introduce Adaptive Gaussian (AG) initialization and Gaussian +Alignment (GA) loss to mitigate shape degeneration issue, ensuring the fidelity +of the learned motion and shape. Comprehensive experimental results demonstrate +that our method surpasses existing methods in both quality and efficiency. In +addition, facilitated by the disentangled modeling of motion and appearance of +SC4D, we devise a novel application that seamlessly transfers the learned +motion onto a diverse array of 4D entities according to textual descriptions. + +
+
+ comment: Project Page: https://sc4d.github.io/ +
+
+
+
+
+ + ☆ No "Zero-Shot" Without Exponential Data: Pretraining Concept Frequency + Determines Multimodal Model Performance ICLR'24 + + +
+ Web-crawled pretraining datasets underlie the impressive "zero-shot" +evaluation performance of multimodal models, such as CLIP for +classification/retrieval and Stable-Diffusion for image generation. However, it +is unclear how meaningful the notion of "zero-shot" generalization is for such +multimodal models, as it is not known to what extent their pretraining datasets +encompass the downstream concepts targeted for during "zero-shot" evaluation. +In this work, we ask: How is the performance of multimodal models on downstream +concepts influenced by the frequency of these concepts in their pretraining +datasets? We comprehensively investigate this question across 34 models and +five standard pretraining datasets (CC-3M, CC-12M, YFCC-15M, LAION-400M, +LAION-Aesthetics), generating over 300GB of data artifacts. We consistently +find that, far from exhibiting "zero-shot" generalization, multimodal models +require exponentially more data to achieve linear improvements in downstream +"zero-shot" performance, following a sample inefficient log-linear scaling +trend. This trend persists even when controlling for sample-level similarity +between pretraining and downstream datasets, and testing on purely synthetic +data distributions. Furthermore, upon benchmarking models on long-tailed data +sampled based on our analysis, we demonstrate that multimodal models across the +board perform poorly. We contribute this long-tail test set as the "Let it +Wag!" benchmark to further research in this direction. Taken together, our +study reveals an exponential need for training data which implies that the key +to "zero-shot" generalization capabilities under large-scale training paradigms +remains to be found. + +
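The log-linear relationship reported above, exponentially more pretraining examples of a concept for roughly linear gains in downstream "zero-shot" accuracy, is straightforward to fit and reason about. The numbers below are invented purely to demonstrate the fit and are not taken from the paper.

```python
import numpy as np

# Invented (concept frequency, zero-shot accuracy) pairs that merely follow the
# qualitative log-linear trend described in the abstract.
freq = np.array([1e2, 1e3, 1e4, 1e5, 1e6, 1e7])
acc = np.array([0.05, 0.14, 0.24, 0.33, 0.44, 0.53])

# Fit accuracy = a * log10(frequency) + b.
a, b = np.polyfit(np.log10(freq), acc, deg=1)
print(f"slope per decade of data: {a:.3f}, intercept: {b:.3f}")

# Under such a fit, each additional 10x of concept frequency buys roughly a
# constant accuracy increment, which is what "exponential data need" means here.
print(f"predicted gain for another 10x data: {a:.3f}")
```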
+
+ comment: Extended version of the short paper accepted at DPFM, ICLR'24 +
+
+
+
+
+ + ☆ Explaining Explainability: Understanding Concept Activation Vectors + + +
+ Recent interpretability methods propose using concept-based explanations to +translate the internal representations of deep learning models into a language +that humans are familiar with: concepts. This requires understanding which +concepts are present in the representation space of a neural network. One +popular method for finding concepts is Concept Activation Vectors (CAVs), which +are learnt using a probe dataset of concept exemplars. In this work, we +investigate three properties of CAVs. CAVs may be: (1) inconsistent between +layers, (2) entangled with different concepts, and (3) spatially dependent. +Each property provides both challenges and opportunities in interpreting +models. We introduce tools designed to detect the presence of these properties, +provide insight into how they affect the derived explanations, and provide +recommendations to minimise their impact. Understanding these properties can be +used to our advantage. For example, we introduce spatially dependent CAVs to +test if a model is translation invariant with respect to a specific concept and +class. Our experiments are performed on ImageNet and a new synthetic dataset, +Elements. Elements is designed to capture a known ground truth relationship +between concepts and classes. We release this dataset to facilitate further +research in understanding and evaluating interpretability methods. + +
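As the abstract notes, a Concept Activation Vector is typically learned by fitting a linear probe that separates activations of concept exemplars from other examples; the CAV is the vector normal to that decision boundary. A minimal generic version with synthetic activations is shown below; the planted direction, sample counts, and scikit-learn logistic regression are illustrative assumptions, not the paper's experimental setup.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
d = 64                                    # dimensionality of a hidden layer

# Synthetic activations: concept exemplars are shifted along a hidden direction.
true_direction = rng.normal(size=d)
concept_acts = rng.normal(size=(200, d)) + 1.5 * true_direction
random_acts = rng.normal(size=(200, d))

X = np.vstack([concept_acts, random_acts])
y = np.array([1] * 200 + [0] * 200)

probe = LogisticRegression(max_iter=1000).fit(X, y)
cav = probe.coef_[0] / np.linalg.norm(probe.coef_[0])   # unit-norm CAV

# Sanity check: the learned CAV should roughly align with the planted direction.
cosine = cav @ (true_direction / np.linalg.norm(true_direction))
print(f"cosine similarity with planted concept direction: {cosine:.2f}")
```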
+
+ comment: (54 pages, 39 figures) +
+
+
+
+
+ + ♻ ☆ $CrowdDiff$: Multi-hypothesis Crowd Density Estimation using Diffusion + Models CVPR'24 + + +
+ Crowd counting is a fundamental problem in crowd analysis which is typically +accomplished by estimating a crowd density map and summing over the density +values. However, this approach suffers from background noise accumulation and +loss of density due to the use of broad Gaussian kernels to create the ground +truth density maps. This issue can be overcome by narrowing the Gaussian +kernel. However, existing approaches perform poorly when trained with ground +truth density maps with broad kernels. To deal with this limitation, we propose +using conditional diffusion models to predict density maps, as diffusion models +show high fidelity to training data during generation. With that, we present +$CrowdDiff$ that generates the crowd density map as a reverse diffusion +process. Furthermore, as the intermediate time steps of the diffusion process +are noisy, we incorporate a regression branch for direct crowd estimation only +during training to improve the feature learning. In addition, owing to the +stochastic nature of the diffusion model, we introduce producing multiple +density maps to improve the counting performance contrary to the existing crowd +counting pipelines. We conduct extensive experiments on publicly available +datasets to validate the effectiveness of our method. $CrowdDiff$ outperforms +existing state-of-the-art crowd counting methods on several public crowd +analysis benchmarks with significant improvements. + +
+
+ comment: Accepted at CVPR'24. The project is available at + https://dylran.github.io/crowddiff.github.io +
+
+
+
+
+ + ♻ ☆ Expressive Forecasting of 3D Whole-body Human Motions + + +
+ Human motion forecasting, with the goal of estimating future human behavior +over a period of time, is a fundamental task in many real-world applications. +However, existing works typically concentrate on predicting the major joints of +the human body without considering the delicate movements of the human hands. +In practical applications, hand gesture plays an important role in human +communication with the real world, and expresses the primary intention of human +beings. In this work, we are the first to formulate a whole-body human pose +forecasting task, which jointly predicts the future body and hand activities. +Correspondingly, we propose a novel Encoding-Alignment-Interaction (EAI) +framework that aims to predict both coarse (body joints) and fine-grained +(gestures) activities collaboratively, enabling expressive and +cross-facilitated forecasting of 3D whole-body human motions. Specifically, our +model involves two key constituents: cross-context alignment (XCA) and +cross-context interaction (XCI). Considering the heterogeneous information +within the whole-body, XCA aims to align the latent features of various human +components, while XCI focuses on effectively capturing the context interaction +among the human components. We conduct extensive experiments on a +newly-introduced large-scale benchmark and achieve state-of-the-art +performance. The code is public for research purposes at +https://github.com/Dingpx/EAI. + +
+
+ comment: Accepted by AAAI24 +
+
+
+
+
+ + ♻ ☆ Cameras as Rays: Pose Estimation via Ray Diffusion ICLR 2024 + + +
+ Estimating camera poses is a fundamental task for 3D reconstruction and +remains challenging given sparsely sampled views (<10). In contrast to existing +approaches that pursue top-down prediction of global parametrizations of camera +extrinsics, we propose a distributed representation of camera pose that treats +a camera as a bundle of rays. This representation allows for a tight coupling +with spatial image features improving pose precision. We observe that this +representation is naturally suited for set-level transformers and develop a +regression-based approach that maps image patches to corresponding rays. To +capture the inherent uncertainties in sparse-view pose inference, we adapt this +approach to learn a denoising diffusion model which allows us to sample +plausible modes while improving performance. Our proposed methods, both +regression- and diffusion-based, demonstrate state-of-the-art performance on +camera pose estimation on CO3D while generalizing to unseen object categories +and in-the-wild captures. + +
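The ray-bundle representation described above replaces a global extrinsic parametrization with a per-patch ray (an origin and a direction, often packed as Plücker coordinates). The sketch below converts standard pinhole intrinsics and world-to-camera extrinsics into such rays for a grid of patch centres; it is generic projective geometry under assumed conventions, not the paper's code.

```python
import numpy as np

def camera_to_rays(K, R, t, grid=4, image_size=256):
    """Per-patch ray origins, directions and Pluecker coordinates for a pinhole
    camera with intrinsics K and world-to-camera extrinsics [R | t]."""
    center = -R.T @ t                         # camera centre in world coordinates
    step = image_size / grid
    us = vs = (np.arange(grid) + 0.5) * step  # patch-centre pixel coordinates
    uu, vv = np.meshgrid(us, vs)
    pix = np.stack([uu.ravel(), vv.ravel(), np.ones(grid * grid)], axis=0)

    dirs_cam = np.linalg.inv(K) @ pix         # back-project pixels to camera rays
    dirs_world = (R.T @ dirs_cam).T           # rotate into the world frame, (N, 3)
    dirs_world /= np.linalg.norm(dirs_world, axis=1, keepdims=True)

    origins = np.repeat(center[None, :], grid * grid, axis=0)
    moments = np.cross(origins, dirs_world)   # Pluecker moment m = o x d
    return origins, dirs_world, np.concatenate([dirs_world, moments], axis=1)

K = np.array([[256.0, 0, 128], [0, 256.0, 128], [0, 0, 1]])
R, t = np.eye(3), np.array([0.0, 0.0, 2.0])
origins, dirs, pluecker = camera_to_rays(K, R, t)
print(pluecker.shape)                          # (16, 6): one 6-D ray per patch
```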
+
+ comment: In ICLR 2024 (oral). v2-3: updated references. Project webpage: + https://jasonyzhang.com/RayDiffusion +
+
+
+
+
+ + ♻ ☆ APISR: Anime Production Inspired Real-World Anime Super-Resolution + + +
+ While real-world anime super-resolution (SR) has gained increasing attention
+in the SR community, existing methods still adopt techniques from the
+photorealistic domain. In this paper, we analyze the anime production workflow
+and rethink how to use its characteristics for real-world anime SR. First, we
+argue that video networks and datasets are not necessary for anime SR due to
+the repeated use of hand-drawn frames. Instead, we propose an anime image
+collection pipeline that chooses the least compressed and most informative
+frames from the video sources. Based on this pipeline, we introduce the Anime
+Production-oriented Image (API) dataset. In addition, we identify two
+anime-specific challenges of distorted and faint hand-drawn lines and unwanted
+color artifacts. We address the first issue by introducing a
+prediction-oriented compression module in the image degradation model and a
+pseudo-ground truth preparation with enhanced hand-drawn lines. In addition, we
+introduce the balanced twin perceptual loss combining both anime and
+photorealistic high-level features to mitigate unwanted color artifacts and
+increase visual clarity. We evaluate our method through extensive experiments
+on the public benchmark, showing that our method outperforms state-of-the-art
+anime dataset-trained approaches.
+
+
+
+
+
+
+ + ♻ ☆ NEMTO: Neural Environment Matting for Novel View and Relighting + Synthesis of Transparent Objects ICCV 2023 + + +
+ We propose NEMTO, the first end-to-end neural rendering pipeline to model 3D +transparent objects with complex geometry and unknown indices of refraction. +Commonly used appearance modeling such as the Disney BSDF model cannot +accurately address this challenging problem due to the complex light paths +bending through refractions and the strong dependency of surface appearance on +illumination. With 2D images of the transparent object as input, our method is +capable of high-quality novel view and relighting synthesis. We leverage +implicit Signed Distance Functions (SDF) to model the object geometry and +propose a refraction-aware ray bending network to model the effects of light +refraction within the object. Our ray bending network is more tolerant to +geometric inaccuracies than traditional physically-based methods for rendering +transparent objects. We provide extensive evaluations on both synthetic and +real-world datasets to demonstrate our high-quality synthesis and the +applicability of our method. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ 3DGS-Avatar: Animatable Avatars via Deformable 3D Gaussian Splatting + + +
+ We introduce an approach that creates animatable human avatars from monocular +videos using 3D Gaussian Splatting (3DGS). Existing methods based on neural +radiance fields (NeRFs) achieve high-quality novel-view/novel-pose image +synthesis but often require days of training, and are extremely slow at +inference time. Recently, the community has explored fast grid structures for +efficient training of clothed avatars. Albeit being extremely fast at training, +these methods can barely achieve an interactive rendering frame rate with +around 15 FPS. In this paper, we use 3D Gaussian Splatting and learn a +non-rigid deformation network to reconstruct animatable clothed human avatars +that can be trained within 30 minutes and rendered at real-time frame rates +(50+ FPS). Given the explicit nature of our representation, we further +introduce as-isometric-as-possible regularizations on both the Gaussian mean +vectors and the covariance matrices, enhancing the generalization of our model +on highly articulated unseen poses. Experimental results show that our method +achieves comparable and even better performance compared to state-of-the-art +approaches on animatable avatar creation from a monocular input, while being +400x and 250x faster in training and inference, respectively. + +
+
+ comment: Project page: https://neuralbodies.github.io/3DGS-Avatar +
+
+
+
+
+ + ♻ ☆ ILPO-NET: Network for the invariant recognition of arbitrary volumetric + patterns in 3D + + +
+ Effective recognition of spatial patterns and learning their hierarchy is +crucial in modern spatial data analysis. Volumetric data applications seek +techniques ensuring invariance not only to shifts but also to pattern +rotations. While traditional methods can readily achieve translational +invariance, rotational invariance poses multiple challenges and remains an +active area of research. Here, we present ILPO-Net (Invariant to Local Patterns +Orientation Network), a novel approach that handles arbitrarily shaped patterns +with a convolutional operation that is inherently invariant to local spatial pattern +orientations via Wigner matrix expansions. Our architecture seamlessly +integrates the new convolution operator and, when benchmarked on diverse +volumetric datasets such as MedMNIST and CATH, demonstrates superior +performance over the baselines with significantly reduced parameter counts - up +to 1000 times fewer in the case of MedMNIST. Beyond these demonstrations, +ILPO-Net's rotational invariance paves the way for other applications across +multiple disciplines. Our code is publicly available at +https://gricad-gitlab.univ-grenoble-alpes.fr/GruLab/ILPONet. +
+
+
+
+
+ + ♻ ☆ Bootstrapping SparseFormers from Vision Foundation Models CVPR 2024 + + +
+ The recently proposed SparseFormer architecture provides an alternative +approach to visual understanding by utilizing a significantly lower number of +visual tokens via adjusting RoIs, greatly reducing computational costs while +still achieving promising performance. However, training SparseFormers from +scratch is still expensive, and scaling up the number of parameters can be +challenging. In this paper, we propose to bootstrap SparseFormers from +ViT-based vision foundation models in a simple and efficient way. Since the +majority of SparseFormer blocks are the standard transformer ones, we can +inherit weights from large-scale pre-trained vision transformers and freeze +them as much as possible. Therefore, we only need to train the +SparseFormer-specific lightweight focusing transformer to adjust token RoIs and +fine-tune a few early pre-trained blocks to align the final token +representation. In such a way, we can bootstrap SparseFormer architectures from +various large-scale pre-trained models (e.g., IN-21K pre-trained AugRegs or +CLIPs) using a rather smaller amount of training samples (e.g., IN-1K) and +without labels or captions within just a few hours. As a result, the +bootstrapped unimodal SparseFormer (from AugReg-ViT-L/16-384) can reach 84.9% +accuracy on IN-1K with only 49 tokens, and the multimodal SparseFormer from +CLIPs also demonstrates notable zero-shot performance with highly reduced +computational cost without seeing any caption during the bootstrapping +procedure. In addition, CLIP-bootstrapped SparseFormers, which align the output +space with language without seeing a word, can serve as efficient vision +encoders in multimodal large language models. Code and models are available at +https://github.com/showlab/sparseformer + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Unified Spatio-Temporal Tri-Perspective View Representation for 3D + Semantic Occupancy Prediction + + +
+ Holistic understanding and reasoning in 3D scenes play a vital role in the +success of autonomous driving systems. 3D semantic occupancy prediction, which +has evolved into a pretraining task for autonomous driving and robotic downstream +tasks, captures finer 3D details than methods such as 3D detection. Existing +approaches predominantly focus on spatial cues such as tri-perspective view +embeddings (TPV), often overlooking temporal cues. This study introduces a +spatiotemporal transformer architecture, S2TPVFormer, for temporally coherent 3D +semantic occupancy prediction. We enrich the TPV representation with +temporal cues using a novel temporal cross-view hybrid attention mechanism +(TCVHA) and generate spatiotemporal TPV embeddings (i.e. S2TPV embeddings). +Experimental evaluations on the nuScenes dataset demonstrate a substantial 4.1% +improvement in mean Intersection over Union (mIoU) for 3D semantic occupancy +compared to TPVFormer, confirming the effectiveness of the proposed S2TPVFormer +in enhancing 3D scene perception. +
+
+
+
+
+ + ♻ ☆ Learning Subject-Aware Cropping by Outpainting Professional Photos + + +
+ How to frame (or crop) a photo often depends on the image subject and its +context; e.g., a human portrait. Recent works have defined the subject-aware +image cropping task as a nuanced and practical version of image cropping. We +propose a weakly-supervised approach (GenCrop) to learn what makes a +high-quality, subject-aware crop from professional stock images. Unlike +supervised prior work, GenCrop requires no new manual annotations beyond the +existing stock image collection. The key challenge in learning from this data, +however, is that the images are already cropped and we do not know what regions +were removed. Our insight is to combine a library of stock images with a +modern, pre-trained text-to-image diffusion model. The stock image collection +provides diversity and its images serve as pseudo-labels for a good crop, while +the text-image diffusion model is used to out-paint (i.e., outward inpainting) +realistic uncropped images. Using this procedure, we are able to automatically +generate a large dataset of cropped-uncropped training pairs to train a +cropping model. Despite being weakly-supervised, GenCrop is competitive with +state-of-the-art supervised methods and significantly better than comparable +weakly-supervised baselines on quantitative and qualitative evaluation metrics. + +
+
+ comment: AAAI 24. Extended version with supplemental materials +
+
+
+
+
+ + ♻ ☆ Non-negative Subspace Feature Representation for Few-shot Learning in + Medical Imaging + + +
+ Unlike typical visual scene recognition domains, in which massive datasets +are accessible to deep neural networks, medical image interpretations are often +obstructed by the paucity of data. In this paper, we investigate the +effectiveness of data-based few-shot learning in medical imaging by exploring +different data attribute representations in a low-dimensional space. We +introduce different types of non-negative matrix factorization (NMF) in +few-shot learning, addressing the data scarcity issue in medical image +classification. Extensive empirical studies are conducted in terms of +validating the effectiveness of NMF, especially its supervised variants (e.g., +discriminative NMF, and supervised and constrained NMF with sparseness), and +the comparison with principal component analysis (PCA), i.e., the collaborative +representation-based dimensionality reduction technique derived from +eigenvectors. With 14 different datasets covering 11 distinct illness +categories, thorough experimental results and comparison with related +techniques demonstrate that NMF is a competitive alternative to PCA for +few-shot learning in medical imaging, and the supervised NMF algorithms are +more discriminative in the subspace with greater effectiveness. Furthermore, we +show that the part-based representation of NMF, especially its supervised +variants, is dramatically impactful in detecting lesion areas in medical +imaging with limited samples. + +
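The abstract contrasts NMF-based subspaces with PCA for few-shot classification. The snippet below is a generic, hedged illustration of that comparison using scikit-learn on synthetic non-negative features and a nearest-centroid classifier; it does not reproduce the supervised or sparseness-constrained NMF variants studied in the paper.

import numpy as np
from sklearn.decomposition import NMF, PCA
from sklearn.neighbors import NearestCentroid

rng = np.random.default_rng(0)
# Stand-in for non-negative image features (e.g. flattened medical images)
X_base = rng.random((200, 400))                                 # base set used to fit the subspace
X_few, y_few = rng.random((10, 400)), rng.integers(0, 2, 10)    # few-shot support set
X_test, y_test = rng.random((40, 400)), rng.integers(0, 2, 40)  # query set

for reducer in (NMF(n_components=16, init="nndsvda", max_iter=500, random_state=0),
                PCA(n_components=16, random_state=0)):
    reducer.fit(X_base)                                          # learn the low-dimensional subspace
    clf = NearestCentroid().fit(reducer.transform(X_few), y_few)
    acc = clf.score(reducer.transform(X_test), y_test)
    print(type(reducer).__name__, "accuracy:", acc)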
+
+
+
+
+ + ♻ ☆ Data Upcycling Knowledge Distillation for Image Super-Resolution + + +
+ Knowledge distillation (KD) compresses deep neural networks by transferring +task-related knowledge from cumbersome pre-trained teacher models to compact +student models. However, current KD methods for super-resolution (SR) networks +overlook a key property of the SR task: the outputs of the teacher model are noisy +approximations of the ground-truth high-quality images (GT), +which obscures the teacher model's knowledge and leads to limited KD effects. To +utilize the teacher model beyond the GT upper bound, we present Data +Upcycling Knowledge Distillation (DUKD), which transfers the teacher model's +knowledge to the student model through upcycled in-domain data derived from the +training data. In addition, we impose a label-consistency regularization on KD for SR +via paired invertible augmentations to improve the student model's +performance and robustness. Comprehensive experiments demonstrate that the DUKD +method significantly outperforms previous methods on several SR tasks. +
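To make the two ingredients above concrete, here is a hypothetical sketch of (1) distilling the teacher on upcycled in-domain inputs and (2) a consistency term built from a paired invertible augmentation (a horizontal flip). Function and variable names are ours, and the losses are simplified stand-ins rather than the authors' exact formulation.

import torch
import torch.nn.functional as F

def dukd_style_losses(student, teacher, lr_img, upcycled_lr):
    """Two illustrative loss terms: distillation on upcycled inputs, and a
    flip-consistency regularizer built from an invertible augmentation."""
    with torch.no_grad():
        t_up = teacher(upcycled_lr)
    kd = F.l1_loss(student(upcycled_lr), t_up)        # distill on upcycled data
    flipped = torch.flip(lr_img, dims=[3])            # invertible augmentation (width flip)
    consistency = F.l1_loss(student(flipped), torch.flip(student(lr_img), dims=[3]))
    return kd, consistency

# Toy usage with a 1x1 conv standing in for both networks
net = torch.nn.Conv2d(3, 3, 1)
x = torch.rand(2, 3, 32, 32)
kd_loss, cons_loss = dukd_style_losses(net, net, x, x)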
+
+
+
+
+ + ♻ ☆ MorpheuS: Neural Dynamic 360° Surface Reconstruction from Monocular + RGB-D Video CVPR2024 + + +
+ Neural rendering has demonstrated remarkable success in dynamic scene +reconstruction. Thanks to the expressiveness of neural representations, prior +works can accurately capture the motion and achieve high-fidelity +reconstruction of the target object. Despite this, real-world video scenarios +often feature large unobserved regions where neural representations struggle to +achieve realistic completion. To tackle this challenge, we introduce MorpheuS, +a framework for dynamic 360° surface reconstruction from a casually +captured RGB-D video. Our approach models the target scene as a canonical field +that encodes its geometry and appearance, in conjunction with a deformation +field that warps points from the current frame to the canonical space. We +leverage a view-dependent diffusion prior and distill knowledge from it to +achieve realistic completion of unobserved regions. Experimental results on +various real-world and synthetic datasets show that our method can achieve +high-fidelity 360° surface reconstruction of a deformable object from a +monocular RGB-D video. +
+
+ comment: CVPR2024. Project page: + https://hengyiwang.github.io/projects/morpheus +
+
+
+
+
+ + ♻ ☆ Calibrating Bayesian UNet++ for Sub-Seasonal Forecasting ICLR 2024 + + +
+ Seasonal forecasting is a crucial task when it comes to detecting the extreme +heat and cold events that occur due to climate change. Confidence in the predictions +should be reliable, since even a small increase in yearly temperatures has a +big impact on the world. Calibration of neural networks provides a way to +ensure our confidence in the predictions. However, calibrating regression +models is an under-researched topic, especially for forecasting models. We calibrate a +UNet++ based architecture, which was shown to outperform physics-based models +at predicting temperature anomalies. We show that with a slight trade-off between +prediction error and calibration error, it is possible to get more reliable and +sharper forecasts. We believe that calibration should be an important part of +safety-critical machine learning applications such as weather forecasting. +
+
+ comment: Accepted as a workshop paper at "ICLR 2024 Tackling Climate Change + with Machine Learning" +
+
+
+
+
+ + ♻ ☆ Roadside Monocular 3D Detection via 2D Detection Prompting + + +
+ The problem of roadside monocular 3D detection requires detecting objects of +the classes of interest in a 2D RGB frame and predicting their 3D information, such +as locations in bird's-eye-view (BEV). It has broad applications in traffic +control, vehicle-vehicle communication, and vehicle-infrastructure cooperative +perception. To approach this problem, we present a novel and simple method that +prompts the 3D detector with 2D detections. Our method builds on a key +insight that, compared with 3D detectors, a 2D detector is much easier to train +and performs significantly better w.r.t detections on the 2D image plane. Therefore, +one can exploit the 2D detections of a well-trained 2D detector as prompts to +a 3D detector, which is trained to inflate such 2D detections into 3D +for 3D detection. To construct better prompts using the 2D detector, we +explore three techniques: (a) concatenating both 2D and 3D detectors' features, +(b) attentively fusing 2D and 3D detectors' features, and (c) encoding the +predicted 2D boxes (x, y, width, height, label) and attentively fusing them with +the 3D detector's features. Surprisingly, the third performs the best. +Moreover, we present a yaw tuning tactic and a class-grouping strategy that +merges classes based on their functionality; these techniques improve 3D +detection performance further. Comprehensive ablation studies and extensive +experiments demonstrate that our method resoundingly outperforms prior works, +achieving the state-of-the-art on two large-scale roadside 3D detection +benchmarks. +
+
+
+
+
+ + ♻ ☆ Scene-aware Human Motion Forecasting via Mutual Distance Prediction + + +
+ In this paper, we tackle the problem of scene-aware 3D human motion +forecasting. A key challenge of this task is to predict future human motions +that are consistent with the scene by modeling the human-scene interactions. +While recent works have demonstrated that explicit constraints on human-scene +interactions can prevent the occurrence of ghost motion, they only provide +constraints on partial human motion e.g., the global motion of the human or a +few joints contacting the scene, leaving the rest of the motion unconstrained. +To address this limitation, we propose to model the human-scene interaction +with the mutual distance between the human body and the scene. Such mutual +distances constrain both the local and global human motion, resulting in a +whole-body motion constrained prediction. In particular, mutual distance +constraints consist of two components, the signed distance of each vertex on +the human mesh to the scene surface and the distance of basis scene points to +the human mesh. We further introduce a global scene representation learned from +a signed distance function (SDF) volume to ensure coherence between the global +scene representation and the explicit constraint from the mutual distance. We +develop a pipeline with two sequential steps: predicting the future mutual +distances first, followed by forecasting future human motion. During training, +we explicitly encourage consistency between predicted poses and mutual +distances. Extensive evaluations on the existing synthetic and real datasets +demonstrate that our approach consistently outperforms the state-of-the-art +methods. + +
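A minimal sketch of the two mutual-distance components described above, assuming the scene is available as a callable signed distance function and approximating point-to-mesh distance by the nearest human vertex; names and the approximation are ours, not the paper's implementation.

import torch

def mutual_distances(human_verts, scene_points, scene_sdf):
    """Illustrative mutual-distance computation:
      d1: signed distance of every human vertex to the scene surface,
      d2: distance of basis scene points to the human mesh, approximated
          here by the nearest human vertex."""
    d1 = scene_sdf(human_verts)                                    # (V,)
    d2 = torch.cdist(scene_points, human_verts).min(dim=1).values  # (P,)
    return d1, d2

# Toy usage: a unit sphere as the "scene" SDF
sphere_sdf = lambda p: p.norm(dim=-1) - 1.0
verts = torch.rand(100, 3)
basis = torch.rand(32, 3) * 2 - 1
d1, d2 = mutual_distances(verts, basis, sphere_sdf)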
+
+
+
+
+ + ♻ ☆ ShapeFusion: A 3D diffusion model for localized shape editing + + +
+ In the realm of 3D computer vision, parametric models have emerged as a +ground-breaking methodology for the creation of realistic and expressive 3D +avatars. Traditionally, they rely on Principal Component Analysis (PCA), given +its ability to decompose data to an orthonormal space that maximally captures +shape variations. However, due to the orthogonality constraints and the global +nature of PCA's decomposition, these models struggle to perform localized and +disentangled editing of 3D shapes, which severely affects their use in +applications requiring fine control such as face sculpting. In this paper, we +leverage diffusion models to enable diverse and fully localized edits on 3D +meshes, while completely preserving the un-edited regions. We propose an +effective diffusion masking training strategy that, by design, facilitates +localized manipulation of any shape region, without being limited to predefined +regions or to sparse sets of predefined control vertices. Following our +framework, a user can explicitly set their manipulation region of choice and +define an arbitrary set of vertices as handles to edit a 3D mesh. Compared to +the current state-of-the-art our method leads to more interpretable shape +manipulations than methods relying on latent code state, greater localization +and generation diversity while offering faster inference than optimization +based approaches. Project page: https://rolpotamias.github.io/Shapefusion/ + +
+
+ comment: Project Page: https://rolpotamias.github.io/Shapefusion/ +
+
+
+
+
+ + ♻ ☆ Vestibular schwannoma growth prediction from longitudinal MRI by time + conditioned neural fields + + +
+ Vestibular schwannomas (VS) are benign tumors that are generally managed by +active surveillance with MRI examination. To further assist clinical +decision-making and avoid overtreatment, an accurate prediction of tumor growth +based on longitudinal imaging is highly desirable. In this paper, we introduce +DeepGrowth, a deep learning method that incorporates neural fields and +recurrent neural networks for prospective tumor growth prediction. In the +proposed method, each tumor is represented as a signed distance function (SDF) +conditioned on a low-dimensional latent code. Unlike previous studies that +perform tumor shape prediction directly in the image space, we predict the +latent codes instead and then reconstruct future shapes from them. To deal with +irregular time intervals, we introduce a time-conditioned recurrent module +based on a ConvLSTM and a novel temporal encoding strategy, which enables the +proposed model to output varying tumor shapes over time. The experiments on an +in-house longitudinal VS dataset showed that the proposed model significantly +improved the performance ($\ge 1.6\%$ Dice score and $\ge0.20$ mm 95\% +Hausdorff distance), in particular for the top 20\% of tumors that grow or shrink the +most ($\ge 4.6\%$ Dice score and $\ge 0.73$ mm 95\% Hausdorff distance). Our +code is available at https://github.com/cyjdswx/DeepGrowth. +
+
+
+
+
+ + ♻ ☆ Smooth Deep Saliency + + +
+ In this work, we investigate methods to reduce the noise in deep saliency +maps coming from convolutional downsampling, with the purpose of explaining how +a deep learning model detects tumors in scanned histological tissue samples. +Those methods make the investigated models more interpretable for +gradient-based saliency maps, computed in hidden layers. We test our approach +on different models trained for image classification on ImageNet1K, and models +trained for tumor detection on Camelyon16 and in-house real-world digital +pathology scans of stained tissue samples. Our results show that the +checkerboard noise in the gradient gets reduced, resulting in smoother and +therefore easier to interpret saliency maps. + +
+
+
+
+
+ + ♻ ☆ Self-Aligning Depth-regularized Radiance Fields for Asynchronous RGB-D + Sequences + + +
+ It has been shown that learning radiance fields with depth rendering and +depth supervision can effectively promote the quality and convergence of view +synthesis. However, this paradigm requires input RGB-D sequences to be +synchronized, hindering its usage in the UAV city modeling scenario. As there +exists asynchrony between RGB images and depth images due to high-speed flight, +we propose a novel time-pose function, which is an implicit network that maps +timestamps to $\rm SE(3)$ elements. To simplify the training process, we also +design a joint optimization scheme to jointly learn the large-scale +depth-regularized radiance fields and the time-pose function. Our algorithm +consists of three steps: (1) time-pose function fitting, (2) radiance field +bootstrapping, (3) joint pose error compensation and radiance field refinement. +In addition, we propose a large synthetic dataset with diverse controlled +mismatches and ground truth to evaluate this new problem setting +systematically. Through extensive experiments, we demonstrate that our method +outperforms baselines without regularization. We also show qualitatively +improved results on a real-world asynchronous RGB-D sequence captured by drone. +Codes, data, and models will be made publicly available. + +
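A small sketch of what an implicit time-pose function could look like: an MLP that maps a scalar timestamp to an SE(3) element (translation plus an axis-angle rotation), with a sinusoidal time encoding. The encoding and output parametrization are assumptions for illustration, not the paper's exact network.

import torch
import torch.nn as nn

class TimePoseFunction(nn.Module):
    """Maps a timestamp in [0, 1] to a 3-vector translation and a 3-vector
    axis-angle rotation, i.e. an SE(3) element."""
    def __init__(self, hidden=128, n_freqs=6):
        super().__init__()
        self.n_freqs = n_freqs
        self.mlp = nn.Sequential(
            nn.Linear(2 * n_freqs, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 6))            # 3 translation + 3 axis-angle

    def encode(self, t):                     # sinusoidal encoding of scalar time
        freqs = 2.0 ** torch.arange(self.n_freqs, device=t.device) * torch.pi
        ang = t[:, None] * freqs[None, :]
        return torch.cat([torch.sin(ang), torch.cos(ang)], dim=-1)

    def forward(self, t):                    # t: (B,) timestamps
        out = self.mlp(self.encode(t))
        return out[:, :3], out[:, 3:]        # translation, axis-angle rotation

pose_fn = TimePoseFunction()
trans, rotvec = pose_fn(torch.linspace(0, 1, 5))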
+
+
+
+
+ + ♻ ☆ Beyond Image Super-Resolution for Image Recognition with Task-Driven + Perceptual Loss CVPR 2024 + + +
+ In real-world scenarios, image recognition tasks, such as semantic +segmentation and object detection, often pose greater challenges due to the +lack of information available within low-resolution (LR) content. Image +super-resolution (SR) is one of the promising solutions for addressing the +challenges. However, due to the ill-posed property of SR, it is challenging for +typical SR methods to restore task-relevant high-frequency contents, which may +dilute the advantage of utilizing the SR method. Therefore, in this paper, we +propose Super-Resolution for Image Recognition (SR4IR) that effectively guides +the generation of SR images beneficial to achieving satisfactory image +recognition performance when processing LR images. The critical component of +our SR4IR is the task-driven perceptual (TDP) loss that enables the SR network +to acquire task-specific knowledge from a network tailored for a specific task. +Moreover, we propose a cross-quality patch mix and an alternate training +framework that significantly enhances the efficacy of the TDP loss by +addressing potential problems when employing the TDP loss. Through extensive +experiments, we demonstrate that our SR4IR achieves outstanding task +performance by generating SR images useful for a specific image recognition +task, including semantic segmentation, object detection, and image +classification. The implementation code is available at +https://github.com/JaehaKim97/SR4IR. + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ CoDA: Instructive Chain-of-Domain Adaptation with Severity-Aware Visual + Prompt Tuning + + +
+ Unsupervised Domain Adaptation (UDA) aims to adapt models from labeled source +domains to unlabeled target domains. When adapting to adverse scenes, existing +UDA methods fail to perform well due to the lack of instructions, leading their +models to overlook discrepancies within all adverse scenes. To tackle this, we +propose CoDA which instructs models to distinguish, focus, and learn from these +discrepancies at scene and image levels. Specifically, CoDA consists of a +Chain-of-Domain (CoD) strategy and a Severity-Aware Visual Prompt Tuning +(SAVPT) mechanism. CoD focuses on scene-level instructions to divide all +adverse scenes into easy and hard scenes, guiding models to adapt from source +to easy domains with easy scene images, and then to hard domains with hard +scene images, thereby laying a solid foundation for whole adaptations. Building +upon this foundation, we employ SAVPT to dive into more detailed image-level +instructions to boost performance. SAVPT features a novel metric Severity that +divides all adverse scene images into low-severity and high-severity images. +Then Severity directs visual prompts and adapters, instructing models to +concentrate on unified severity features instead of scene-specific features, +without adding complexity to the model architecture. CoDA achieves SOTA +performances on widely-used benchmarks under all adverse scenes. Notably, CoDA +outperforms the existing ones by 4.6%, and 10.3% mIoU on the Foggy Driving, and +Foggy Zurich benchmarks, respectively. Our code is available at +https://github.com/Cuzyoung/CoDA + +
+
+
+
+
+ + ♻ ☆ GEARS: Local Geometry-aware Hand-object Interaction Synthesis + + +
+ Generating realistic hand motion sequences in interaction with objects has +gained increasing attention with the growing interest in digital humans. Prior +work has illustrated the effectiveness of employing occupancy-based or +distance-based virtual sensors to extract hand-object interaction features. +Nonetheless, these methods show limited generalizability across object +categories, shapes and sizes. We hypothesize that this is due to two reasons: +1) the limited expressiveness of employed virtual sensors, and 2) scarcity of +available training data. To tackle this challenge, we introduce a novel +joint-centered sensor designed to reason about local object geometry near +potential interaction regions. The sensor queries for object surface points in +the neighbourhood of each hand joint. As an important step towards mitigating +the learning complexity, we transform the points from global frame to hand +template frame and use a shared module to process sensor features of each +individual joint. This is followed by a spatio-temporal transformer network +aimed at capturing correlation among the joints in different dimensions. +Moreover, we devise simple heuristic rules to augment the limited training +sequences with vast static hand grasping samples. This leads to a broader +spectrum of grasping types observed during training, in turn enhancing our +model's generalization capability. We evaluate on two public datasets, GRAB and +InterCap, where our method shows superiority over baselines both quantitatively +and perceptually. + +
+
+
+
+
+ + ♻ ☆ Bias Behind the Wheel: Fairness Analysis of Autonomous Driving Systems + + +
+ This paper analyzes fairness in automated pedestrian detection, a crucial but +under-explored issue in autonomous driving systems. We evaluate eight +state-of-the-art deep learning-based pedestrian detectors across demographic +groups on large-scale real-world datasets. To enable thorough fairness testing, +we provide extensive annotations for the datasets, resulting in 8,311 images +with 16,070 gender labels, 20,115 age labels, and 3,513 skin tone labels. Our +findings reveal significant fairness issues, particularly related to age: the +proportion of undetected children is 20.14% higher than that of adults. +Furthermore, we explore how various driving scenarios affect the fairness of +pedestrian detectors. We find that pedestrian detectors demonstrate significant +gender biases at night, potentially exacerbating prevalent societal +concerns about women's safety after dark. Moreover, we +observe that pedestrian detectors can demonstrate both enhanced fairness and +superior performance under specific driving conditions, which challenges the +fairness-performance trade-off theory widely acknowledged in the fairness +literature. We publicly release the code, data, and results to support future +research on fairness in autonomous driving. +
+
+
+
+
+ + ♻ ☆ Weighted structure tensor total variation for image denoising + + +
+ For image denoising problems, the structure tensor total variation +(STV)-based models show good performances when compared with other competing +regularization approaches. However, the STV regularizer does not couple the +local information of the image and may not maintain the image details. +Therefore, we employ the anisotropic weighted matrix introduced in the +anisotropic total variation (ATV) model to improve the STV model. By applying +the weighted matrix to the discrete gradient of the patch-based Jacobian +operator in STV, our proposed weighted STV (WSTV) model can effectively capture +local information from images and maintain their details during the denoising +process. The optimization problem in the model is solved by a fast first-order +gradient projection algorithm with a complexity result of $O(1 / i^2)$. For +images with different Gaussian noise levels, the experimental results +demonstrate that the WSTV model can effectively improve the quality of restored +images compared to other TV and STV-based models. + +
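The $O(1 / i^2)$ rate quoted above is characteristic of accelerated (FISTA-style) gradient projection. The generic loop below shows that acceleration scheme with the WSTV-specific dual gradient and feasible-set projection abstracted behind placeholder callables; it is a sketch of the optimization style, not the paper's solver.

import numpy as np

def accelerated_gradient_projection(grad, project, x0, step, n_iters=100):
    """FISTA-style momentum schedule behind the O(1/i^2) convergence rate.
    `grad` and `project` stand in for the problem-specific gradient and
    projection onto the feasible set."""
    x_prev = x = project(x0)
    t = 1.0
    for _ in range(n_iters):
        t_next = (1.0 + np.sqrt(1.0 + 4.0 * t * t)) / 2.0
        y = x + ((t - 1.0) / t_next) * (x - x_prev)    # extrapolation point
        x_prev, x = x, project(y - step * grad(y))      # projected gradient step
        t = t_next
    return x

# Toy usage: minimize ||x - b||^2 over the non-negative orthant
b = np.array([1.0, -2.0, 0.5])
sol = accelerated_gradient_projection(grad=lambda x: 2 * (x - b),
                                      project=lambda x: np.maximum(x, 0.0),
                                      x0=np.zeros(3), step=0.4)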
+
+
+
+
+ + ♻ ☆ A Novel Garment Transfer Method Supervised by Distilled Knowledge of + Virtual Try-on Model + + +
+ This paper proposes a novel garment transfer method supervised with knowledge +distillation from virtual try-on. Our method first reasons the transfer parsing +to provide shape prior to downstream tasks. We employ a multi-phase teaching +strategy to supervise the training of the transfer parsing reasoning model, +learning the response and feature knowledge from the try-on parsing reasoning +model. To correct the teaching error, it transfers the garment back to its +owner to absorb the hard knowledge in the self-study phase. Guided by the +transfer parsing, we adjust the position of the transferred garment via STN to +prevent distortion. Afterward, we estimate a progressive flow to precisely warp +the garment with shape and content correspondences. To ensure warping +rationality, we supervise the training of the garment warping model using +target shape and warping knowledge from virtual try-on. To better preserve body +features in the transfer result, we propose a well-designed training strategy +for the arm regrowth task to infer new exposure skin. Experiments demonstrate +that our method has state-of-the-art performance compared with other virtual +try-on and garment transfer methods in garment transfer, especially for +preserving garment texture and body features. + +
+
+
+
+
+ + ♻ ☆ ModaVerse: Efficiently Transforming Modalities with LLMs CVPR2024 + + +
+ Humans possess the capability to comprehend diverse modalities and seamlessly +transfer information between them. In this work, we introduce ModaVerse, a +Multi-modal Large Language Model (MLLM) capable of comprehending and +transforming content across various modalities including images, videos, and +audio. Predominant MLLM frameworks have largely relied on the alignment of +latent spaces of textual and non-textual features. This alignment process, +which synchronizes a language model trained on textual data with encoders and +decoders trained on multi-modal data, often necessitates extensive training of +several projection layers in multiple stages. Inspired by LLM-as-agent +methodologies, we propose a novel Input/Output (I/O) alignment mechanism that +operates directly at the level of natural language. It aligns the LLM's output +with the input of generative models, avoiding the complexities associated with +latent feature alignments, and simplifying the multiple training stages of +existing MLLMs into a single, efficient process. This conceptual advancement +leads to significant reductions in both data and computational costs. By +conducting experiments on several benchmarks, we demonstrate that our approach +attains comparable performance with the state of the art while achieving +considerable efficiencies in data usage and training duration. + +
+
+ comment: CVPR2024 +
+
+
+
+
+ + ♻ ☆ DeepIPC: Deeply Integrated Perception and Control for an Autonomous + Vehicle in Real Environments + + +
+ In this work, we introduce DeepIPC, a novel end-to-end model tailored for +autonomous driving, which seamlessly integrates perception and control tasks. +Unlike traditional models that handle these tasks separately, DeepIPC +innovatively combines a perception module, which processes RGBD images for +semantic segmentation and generates bird's eye view (BEV) mappings, with a +controller module that utilizes these insights along with GNSS and angular +speed measurements to accurately predict navigational waypoints. This +integration allows DeepIPC to efficiently translate complex environmental data +into actionable driving commands. Our comprehensive evaluation demonstrates +DeepIPC's superior performance in terms of drivability and multi-task +efficiency across diverse real-world scenarios, setting a new benchmark for +end-to-end autonomous driving systems with a leaner model architecture. The +experimental results underscore DeepIPC's potential to significantly enhance +autonomous vehicular navigation, promising a step forward in the development of +autonomous driving technologies. For further insights and replication, we will +make our code and datasets available at https://github.com/oskarnatan/DeepIPC. + +
+
+ comment: Accepted for Publication in IEEE Access +
+
+
+
+
+ + ♻ ☆ CAPE: CAM as a Probabilistic Ensemble for Enhanced DNN Interpretation + + +
+ Deep Neural Networks (DNNs) are widely used for visual classification tasks, +but their complex computation process and black-box nature hinder decision +transparency and interpretability. Class activation maps (CAMs) and recent +variants provide ways to visually explain the DNN decision-making process by +displaying 'attention' heatmaps of the DNNs. Nevertheless, the CAM explanation +only offers relative attention information, that is, on an attention heatmap, +we can interpret which image region is more or less important than the others. +However, these regions cannot be meaningfully compared across classes, and the +contribution of each region to the model's class prediction is not revealed. To +address these challenges that ultimately lead to better DNN Interpretation, in +this paper, we propose CAPE, a novel reformulation of CAM that provides a +unified and probabilistically meaningful assessment of the contributions of +image regions. We quantitatively and qualitatively compare CAPE with +state-of-the-art CAM methods on CUB and ImageNet benchmark datasets to +demonstrate enhanced interpretability. We also test on a cytology imaging +dataset depicting a challenging Chronic Myelomonocytic Leukemia (CMML) +diagnosis problem. Code is available at: https://github.com/AIML-MED/CAPE. + +
+
+
+
+
+ + ♻ ☆ Image Outlier Detection Without Training using RANSAC + + +
+ Image outlier detection (OD) is an essential tool to ensure the quality of +images used in computer vision tasks. Existing algorithms often involve +training a model to represent the inlier distribution, and outliers are +determined by some deviation measure. Although existing methods proved +effective when trained on strictly inlier samples, their performance remains +questionable when undesired outliers are included during training. As a result +of this limitation, it is necessary to carefully examine the data when +developing OD models for new domains. In this work, we present a novel image OD +algorithm called RANSAC-NN that eliminates the need of data examination and +model training altogether. Unlike existing approaches, RANSAC-NN can be +directly applied on datasets containing outliers by sampling and comparing +subsets of the data. Our algorithm maintains favorable performance compared to +existing methods on a range of benchmarks. Furthermore, we show that RANSAC-NN +can enhance the robustness of existing methods by incorporating our algorithm +as part of the data preparation process. + +
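An illustrative sample-and-compare loop in the spirit of the description above: each image's feature vector is scored by its agreement with randomly drawn subsets of the (possibly contaminated) dataset, and low-scoring samples are flagged as likely outliers. This is not the exact RANSAC-NN algorithm, only a toy version of the idea.

import numpy as np

def sample_and_compare_scores(feats, n_rounds=50, subset_size=16, rng=None):
    """Score each sample by its nearest-neighbour agreement with random
    subsets of the dataset; lower scores suggest outliers."""
    rng = rng or np.random.default_rng(0)
    feats = feats / np.linalg.norm(feats, axis=1, keepdims=True)
    scores = np.zeros(len(feats))
    for _ in range(n_rounds):
        subset = rng.choice(len(feats), size=subset_size, replace=False)
        sims = feats @ feats[subset].T                   # cosine similarity to the subset
        sims[subset, np.arange(subset_size)] = -np.inf   # ignore self-matches
        scores += sims.max(axis=1)
    return scores / n_rounds

# Toy usage: 95 clustered inliers plus 5 scattered outliers
rng = np.random.default_rng(0)
inliers = rng.normal([5, 0, 0], 0.3, size=(95, 3))
outliers = rng.normal(0, 3.0, size=(5, 3))
scores = sample_and_compare_scores(np.vstack([inliers, outliers]))
print("lowest-scoring indices:", np.argsort(scores)[:5])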
+
+
+
+
+ + ♻ ☆ DeepIPCv2: LiDAR-powered Robust Environmental Perception and + Navigational Control for Autonomous Vehicle + + +
+ We present DeepIPCv2, an autonomous driving model that perceives the +environment using a LiDAR sensor for more robust drivability, especially when +driving under poor illumination conditions where everything is not clearly +visible. DeepIPCv2 takes a set of LiDAR point clouds as the main perception +input. Since point clouds are not affected by illumination changes, they can +provide a clear observation of the surroundings no matter what the condition +is. This results in a better scene understanding and stable features provided +by the perception module to support the controller module in estimating +navigational control properly. To evaluate its performance, we conduct several +tests by deploying the model to predict a set of driving records and perform +real automated driving under three different conditions. We also conduct +ablation and comparative studies with some recent models to justify its +performance. Based on the experimental results, DeepIPCv2 shows a robust +performance by achieving the best drivability in all driving scenarios. +Furthermore, to support future research, we will upload the codes and data to +https://github.com/oskarnatan/DeepIPCv2. + +
+
+
+
+
+ + ♻ ☆ HumanNeRF-SE: A Simple yet Effective Approach to Animate HumanNeRF with + Diverse Poses + + +
+ We present HumanNeRF-SE, a simple yet effective method that synthesizes +diverse novel-pose images from simple input. Previous HumanNeRF works require a +large number of optimizable parameters to fit the human images. Instead, we +rework these approaches by combining explicit and implicit human +representations to design both a generalized rigid deformation and a specific +non-rigid deformation. Our key insight is that an explicit shape can reduce the +number of sampling points used to fit the implicit representation, and that a +generalized rigid deformation constructed from frozen SMPL blending weights can +effectively avoid overfitting and improve pose generalization. Our architecture, +involving both explicit and implicit representations, is simple yet effective. +Experiments demonstrate that our model can synthesize images under arbitrary poses +with few-shot input and can synthesize images 15 times faster +through a reduction in computational complexity, without using any existing +acceleration modules. Compared to state-of-the-art HumanNeRF studies, +HumanNeRF-SE achieves better performance with fewer learnable parameters and +less training time. +
+
+ comment: 16pages, 17 figures, 10 tables +
+
+
+
+
+ + ♻ ☆ TE-TAD: Towards Full End-to-End Temporal Action Detection via + Time-Aligned Coordinate Expression + + +
+ In this paper, we show that the normalized coordinate expression is a +key source of the reliance on hand-crafted components in query-based detectors for +temporal action detection (TAD). Despite significant advancements towards an +end-to-end framework in object detection, query-based detectors have been +limited in achieving full end-to-end modeling in TAD. To address this issue, we +propose TE-TAD, a full end-to-end temporal action detection transformer +that integrates a time-aligned coordinate expression. We reformulate the coordinate +expression using actual timeline values, ensuring length-invariant +representations across extremely diverse video durations. +Furthermore, our proposed adaptive query selection dynamically adjusts the +number of queries based on video length, providing a suitable solution for +varying video durations compared to a fixed query set. Our approach not only +simplifies the TAD process by eliminating the need for hand-crafted components +but also significantly improves the performance of query-based detectors. Our +TE-TAD outperforms previous query-based detectors and achieves competitive +performance compared to state-of-the-art methods on popular benchmark datasets. +Code is available at: https://github.com/Dotori-HJ/TE-TAD +
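A tiny illustration of why time-aligned coordinates help: the same normalized (center, width) segment maps to very different absolute extents depending on video duration, so expressing segments in seconds keeps the representation length-invariant. The helper below is purely illustrative and is not the paper's code.

def to_time_aligned(center_norm, width_norm, duration_s):
    """Convert a normalized (center, width) action segment into absolute
    start/end times in seconds."""
    start = (center_norm - width_norm / 2.0) * duration_s
    end = (center_norm + width_norm / 2.0) * duration_s
    return start, end

# The same normalized prediction means very different things for a
# 30 s clip and a 20 min video:
print(to_time_aligned(0.5, 0.1, 30.0))     # (13.5, 16.5) seconds
print(to_time_aligned(0.5, 0.1, 1200.0))   # (540.0, 660.0) seconds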
+
+
+
+
+ + ♻ ☆ Improving the Reconstruction of Disentangled Representation Learners via + Multi-Stage Modeling + + +
+ Current autoencoder-based disentangled representation learning methods +achieve disentanglement by penalizing the (aggregate) posterior to encourage +statistical independence of the latent factors. This approach introduces a +trade-off between disentangled representation learning and reconstruction +quality, since the model does not have enough capacity to learn correlated +latent variables that capture the detail information present in most image data. To +overcome this trade-off, we present a novel multi-stage modeling approach where +the disentangled factors are first learned using a penalty-based disentangled +representation learning method; then, the low-quality reconstruction is +improved with another deep generative model that is trained to model the +missing correlated latent variables, adding detail information while +maintaining conditioning on the previously learned disentangled factors. Taken +together, our multi-stage modeling approach results in a single, coherent +probabilistic model that is theoretically justified by the principle of +d-separation and can be realized with a variety of model classes including +likelihood-based models such as variational autoencoders, implicit models such +as generative adversarial networks, and tractable models like normalizing flows +or mixtures of Gaussians. We demonstrate that our multi-stage model has higher +reconstruction quality than current state-of-the-art methods with equivalent +disentanglement performance across multiple standard benchmarks. In addition, +we apply the multi-stage model to generate synthetic tabular datasets, +showcasing an enhanced performance over benchmark models across a variety of +metrics. The interpretability analysis further indicates that the multi-stage +model can effectively uncover distinct and meaningful features of variations +from which the original distribution can be recovered. +
+
+
+
+
+ + ♻ ☆ WM-MoE: Weather-aware Multi-scale Mixture-of-Experts for Blind Adverse + Weather Removal + + +
+ Adverse weather removal tasks like deraining, desnowing, and dehazing are +usually treated as separate tasks. However, in practical autonomous driving +scenarios, the type, intensity, and mixing degree of weather are unknown, so +handling each task separately cannot deal with these complex practical scenarios. +In this paper, we study the blind adverse weather removal problem. +Mixture-of-Experts (MoE) is a popular model that adopts a learnable gate to +route the input to different expert networks. The principle of MoE involves +using adaptive networks to process different types of unknown inputs. +Therefore, MoE has great potential for blind adverse weather removal. However, +the original MoE module is inadequate for coupled multiple weather types and +fails to utilize multi-scale features for better performance. To this end, we +propose a method called Weather-aware Multi-scale MoE (WM-MoE) based on +Transformer for blind weather removal. WM-MoE includes two key designs: +WEather-Aware Router (WEAR) and Multi-Scale Experts (MSE). WEAR assigns experts +for each image token based on decoupled content and weather features, which +enhances the model's capability to process multiple types of adverse weather. To obtain +discriminative weather features from images, we propose Weather Guidance +Fine-grained Contrastive Learning (WGF-CL), which utilizes weather cluster +information to guide the assignment of positive and negative samples for each +image token. Since processing different weather types requires different +receptive fields, MSE leverages multi-scale features to enhance the spatial +relationship modeling capability, facilitating the high-quality restoration of +diverse weather types and intensities. Our method achieves state-of-the-art +performance in blind adverse weather removal on two public datasets and our +dataset. We also demonstrate the advantage of our method on downstream +segmentation tasks. +
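For readers unfamiliar with MoE routing, the sketch below shows a minimal token-level mixture-of-experts layer with a learnable router; the weather-aware feature decoupling (WEAR) and multi-scale experts (MSE) described above are not reproduced, so treat this only as background for the gating mechanism.

import torch
import torch.nn as nn

class TinyMoE(nn.Module):
    """Minimal mixture-of-experts layer: a router softly assigns each token
    to a set of small MLP experts and mixes their outputs."""
    def __init__(self, dim=64, n_experts=4):
        super().__init__()
        self.router = nn.Linear(dim, n_experts)
        self.experts = nn.ModuleList(nn.Sequential(nn.Linear(dim, dim), nn.GELU(),
                                                   nn.Linear(dim, dim))
                                     for _ in range(n_experts))

    def forward(self, tokens):                       # tokens: (B, N, dim)
        gates = self.router(tokens).softmax(dim=-1)  # soft assignment per token
        expert_out = torch.stack([e(tokens) for e in self.experts], dim=-1)
        return (expert_out * gates.unsqueeze(-2)).sum(dim=-1)

moe = TinyMoE()
out = moe(torch.rand(2, 16, 64))   # (2, 16, 64)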
+
+
+
+
+ + ♻ ☆ Temporally Consistent Unbalanced Optimal Transport for Unsupervised + Action Segmentation CVPR 2024 + + +
+ We propose a novel approach to the action segmentation task for long, +untrimmed videos, based on solving an optimal transport problem. By encoding a +temporal consistency prior into a Gromov-Wasserstein problem, we are able to +decode a temporally consistent segmentation from a noisy affinity/matching cost +matrix between video frames and action classes. Unlike previous approaches, our +method does not require knowing the action order for a video to attain temporal +consistency. Furthermore, our resulting (fused) Gromov-Wasserstein problem can +be efficiently solved on GPUs using a few iterations of projected mirror +descent. We demonstrate the effectiveness of our method in an unsupervised +learning setting, where our method is used to generate pseudo-labels for +self-training. We evaluate our segmentation approach and unsupervised learning +pipeline on the Breakfast, 50-Salads, YouTube Instructions and Desktop Assembly +datasets, yielding state-of-the-art results for the unsupervised video action +segmentation task. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ EGTR: Extracting Graph from Transformer for Scene Graph Generation CVPR 2024 + + +
+ Scene Graph Generation (SGG) is a challenging task of detecting objects and +predicting relationships between objects. After DETR was developed, one-stage +SGG models based on a one-stage object detector have been actively studied. +However, complex modeling is used to predict the relationship between objects, +and the inherent relationship between object queries learned in the multi-head +self-attention of the object detector has been neglected. We propose a +lightweight one-stage SGG model that extracts the relation graph from the +various relationships learned in the multi-head self-attention layers of the +DETR decoder. By fully utilizing the self-attention by-products, the relation +graph can be extracted effectively with a shallow relation extraction head. +Considering the dependency of the relation extraction task on the object +detection task, we propose a novel relation smoothing technique that adjusts +the relation label adaptively according to the quality of the detected objects. +By the relation smoothing, the model is trained according to the continuous +curriculum that focuses on object detection task at the beginning of training +and performs multi-task learning as the object detection performance gradually +improves. Furthermore, we propose a connectivity prediction task that predicts +whether a relation exists between object pairs as an auxiliary task of the +relation extraction. We demonstrate the effectiveness and efficiency of our +method for the Visual Genome and Open Image V6 datasets. Our code is publicly +available at https://github.com/naver-ai/egtr. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Robust deep learning for eye fundus images: Bridging real and synthetic + data for enhancing generalization + + +
+ Deep learning applications for assessing medical images are limited because +the datasets are often small and imbalanced. The use of synthetic data has been +proposed in the literature, but neither a robust comparison of the different +methods nor generalizability has been reported. Our approach integrates a +retinal image quality assessment model and StyleGAN2 architecture to enhance +Age-related Macular Degeneration (AMD) detection capabilities and improve +generalizability. This work compares ten different Generative Adversarial +Network (GAN) architectures to generate synthetic eye-fundus images with and +without AMD. We combined subsets of three public databases (iChallenge-AMD, +ODIR-2019, and RIADD) to form a single training and test set. We employed the +STARE dataset for external validation, ensuring a comprehensive assessment of +the proposed approach. The results show that StyleGAN2 reached the lowest +Frechet Inception Distance (166.17), and clinicians could not accurately +differentiate between real and synthetic images. ResNet-18 architecture +obtained the best performance with 85% accuracy and outperformed the two human +experts (80% and 75%) in detecting AMD fundus images. The accuracy rates were +82.8% for the test set and 81.3% for the STARE dataset, demonstrating the +model's generalizability. The proposed methodology for synthetic medical image +generation has been validated for robustness and accuracy, with free access to +its code for further research and development in this field. + +
+
+ comment: Accepted by the Biomedical Signal Processing and Control +
+
+
+
+
+ + ♻ ☆ Towards Fine-grained Large Object Segmentation 1st Place Solution to 3D + AI Challenge 2020 -- Instance Segmentation Track + + +
+ This technical report introduces our solutions of Team 'FineGrainedSeg' for +Instance Segmentation track in 3D AI Challenge 2020. In order to handle +extremely large objects in 3D-FUTURE, we adopt PointRend as our basic +framework, which outputs more fine-grained masks compared to HTC and SOLOv2. +Our final submission is an ensemble of 5 PointRend models, which achieves the +1st place on both validation and test leaderboards. The code is available at +https://github.com/zehuichen123/3DFuture_ins_seg. + +
+
+ comment: Technical Report +
+
+
+
+
+ + ♻ ☆ Deep Learning in Cardiology + + +
+ The medical field is creating large amounts of data that physicians are unable +to decipher and use efficiently. Moreover, rule-based expert systems are +inefficient at solving complicated medical tasks or at creating insights from +big data. Deep learning has emerged as a more accurate and effective technology +in a wide range of medical problems such as diagnosis, prediction and +intervention. Deep learning is a representation learning method that consists +of layers that transform the data non-linearly, thus revealing hierarchical +relationships and structures. In this review we survey deep learning +application papers that use structured data, signal and imaging modalities from +cardiology. We discuss the advantages and limitations of applying deep learning +in cardiology, which also apply to medicine in general, while proposing certain +directions as the most viable for clinical use. +
+
+ comment: 27 pages, 2 figures, 10 tables +
+
+
+
+
+ + ♻ ☆ Synthesis of Annotated Colorectal Cancer Tissue Images from Gland Layout + + +
+ Generating realistic tissue images with annotations is a challenging task +that is important in many computational histopathology applications. +Synthetically generated images and annotations are valuable for training and +evaluating algorithms in this domain. To address this, we propose an +interactive framework generating pairs of realistic colorectal cancer histology +images with corresponding glandular masks from glandular structure layouts. The +framework accurately captures vital features like stroma, goblet cells, and +glandular lumen. Users can control gland appearance by adjusting parameters +such as the number of glands, their locations, and sizes. The generated images +exhibit good Frechet Inception Distance (FID) scores compared to the +state-of-the-art image-to-image translation model. Additionally, we demonstrate +the utility of our synthetic annotations for evaluating gland segmentation +algorithms. Furthermore, we present a methodology for constructing glandular +masks using advanced deep generative models, such as latent diffusion models. +These masks enable tissue image generation through a residual encoder-decoder +network. + +
+
+
+
+
+ + ♻ ☆ Spacetime Gaussian Feature Splatting for Real-Time Dynamic View + Synthesis CVPR 2024 + + +
+ Novel view synthesis of dynamic scenes has been an intriguing yet challenging +problem. Despite recent advancements, simultaneously achieving high-resolution +photorealistic results, real-time rendering, and compact storage remains a +formidable task. To address these challenges, we propose Spacetime Gaussian +Feature Splatting as a novel dynamic scene representation, composed of three +pivotal components. First, we formulate expressive Spacetime Gaussians by +enhancing 3D Gaussians with temporal opacity and parametric motion/rotation. +This enables Spacetime Gaussians to capture static, dynamic, as well as +transient content within a scene. Second, we introduce splatted feature +rendering, which replaces spherical harmonics with neural features. These +features facilitate the modeling of view- and time-dependent appearance while +maintaining small size. Third, we leverage the guidance of training error and +coarse depth to sample new Gaussians in areas that are challenging to converge +with existing pipelines. Experiments on several established real-world datasets +demonstrate that our method achieves state-of-the-art rendering quality and +speed, while retaining compact storage. At 8K resolution, our lite-version +model can render at 60 FPS on an Nvidia RTX 4090 GPU. Our code is available at +https://github.com/oppo-us-research/SpacetimeGaussians. + +
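A simplified reading of the temporal-opacity and parametric-motion ideas above: each primitive's spatial opacity is modulated by a 1D Gaussian in time, and its mean follows a small polynomial in t. The exact functional forms used by the paper may differ; this is only a sketch.

import numpy as np

def temporal_opacity(t, sigma_s, mu_t, s_t):
    """Opacity at time t: spatial opacity sigma_s modulated by a temporal
    Gaussian centred at mu_t with inverse-width s_t."""
    return sigma_s * np.exp(-s_t * (t - mu_t) ** 2)

def polynomial_motion(t, coeffs):
    """Time-dependent mean as a polynomial in t; coeffs has shape (K, 3)."""
    powers = np.array([t ** k for k in range(len(coeffs))])   # (K,)
    return powers @ coeffs                                     # (3,)

# A primitive that is most visible around t = 0.4 and drifts along +x
ts = np.linspace(0, 1, 5)
print([round(float(temporal_opacity(t, 0.9, 0.4, 50.0)), 3) for t in ts])
print(polynomial_motion(0.5, np.array([[0., 0., 0.], [1., 0., 0.]])))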
+
+ comment: Accepted to CVPR 2024. Project page: + https://oppo-us-research.github.io/SpacetimeGaussians-website/ +
+
+
+
+
+ + ♻ ☆ InstantStyle: Free Lunch towards Style-Preserving in Text-to-Image + Generation + + +
+ Tuning-free diffusion-based models have demonstrated significant potential in +the realm of image personalization and customization. However, despite this +notable progress, current models continue to grapple with several complex +challenges in producing style-consistent images. Firstly, the concept +of style is inherently underdetermined, encompassing a multitude of elements +such as color, material, atmosphere, design, and structure, among others. +Secondly, inversion-based methods are prone to style degradation, often +resulting in the loss of fine-grained details. Lastly, adapter-based approaches +frequently require meticulous weight tuning for each reference image to achieve +a balance between style intensity and text controllability. In this paper, we +commence by examining several compelling yet frequently overlooked +observations. We then proceed to introduce InstantStyle, a framework designed +to address these issues through the implementation of two key strategies: 1) A +straightforward mechanism that decouples style and content from reference +images within the feature space, predicated on the assumption that features +within the same space can be either added to or subtracted from one another. 2) +The injection of reference image features exclusively into style-specific +blocks, thereby preventing style leaks and eschewing the need for cumbersome +weight tuning, which often characterizes more parameter-heavy designs. Our work +demonstrates superior visual stylization outcomes, striking an optimal balance +between the intensity of style and the controllability of textual elements. Our +code will be available at https://github.com/InstantStyle/InstantStyle. +
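Strategy 1 above relies on features in a shared space being addable and subtractable. The toy snippet below illustrates that operation with stand-in (CLIP-like) embeddings, subtracting a content embedding from a style-reference embedding; the actual InstantStyle pipeline operates on diffusion-model conditioning features and is not reproduced here.

import torch
import torch.nn.functional as F

def decouple_style(ref_image_emb, content_text_emb):
    """Subtract the content description's embedding from the style reference's
    image embedding so that mostly style information remains (illustrative)."""
    style = ref_image_emb - content_text_emb
    return F.normalize(style, dim=-1)

# Stand-in embeddings (in practice these would come from an image/text encoder)
ref_image_emb = F.normalize(torch.randn(1, 512), dim=-1)
content_text_emb = F.normalize(torch.randn(1, 512), dim=-1)
style_emb = decouple_style(ref_image_emb, content_text_emb)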
+
+ comment: Technical Report +
+
+
+
+
+ + ♻ ☆ DisCo: Disentangled Control for Realistic Human Dance Generation CVPR24 + + +
+ Generative AI has made significant strides in computer vision, particularly +in text-driven image/video synthesis (T2I/T2V). Despite the notable +advancements, it remains challenging in human-centric content synthesis such as +realistic dance generation. Current methodologies, primarily tailored for human +motion transfer, encounter difficulties when confronted with real-world dance +scenarios (e.g., social media dance), which require generalizing across a wide +spectrum of poses and intricate human details. In this paper, we depart from +the traditional paradigm of human motion transfer and emphasize two additional +critical attributes for the synthesis of human dance content in social media +contexts: (i) Generalizability: the model should be able to generalize beyond +generic human viewpoints as well as unseen human subjects, backgrounds, and +poses; (ii) Compositionality: it should allow for the seamless composition of +seen/unseen subjects, backgrounds, and poses from different sources. To address +these challenges, we introduce DISCO, which includes a novel model architecture +with disentangled control to improve the compositionality of dance synthesis, +and an effective human attribute pre-training for better generalizability to +unseen humans. Extensive qualitative and quantitative results demonstrate that +DisCo can generate high-quality human dance images and videos with diverse +appearances and flexible motions. Code is available at +https://disco-dance.github.io/. +
+
+ comment: Accepted by CVPR24 +
+
+
+
+
+ + ♻ ☆ pixelSplat: 3D Gaussian Splats from Image Pairs for Scalable + Generalizable 3D Reconstruction + + +
+ We introduce pixelSplat, a feed-forward model that learns to reconstruct 3D +radiance fields parameterized by 3D Gaussian primitives from pairs of images. +Our model features real-time and memory-efficient rendering for scalable +training as well as fast 3D reconstruction at inference time. To overcome local +minima inherent to sparse and locally supported representations, we predict a +dense probability distribution over 3D and sample Gaussian means from that +probability distribution. We make this sampling operation differentiable via a +reparameterization trick, allowing us to back-propagate gradients through the +Gaussian splatting representation. We benchmark our method on wide-baseline +novel view synthesis on the real-world RealEstate10k and ACID datasets, where +we outperform state-of-the-art light field transformers and accelerate +rendering by 2.5 orders of magnitude while reconstructing an interpretable and +editable 3D radiance field. + +
+
+ comment: Project page: https://dcharatan.github.io/pixelsplat +
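+ As a rough illustration of the differentiable prediction of Gaussian means described in the pixelSplat abstract above, one can predict a categorical distribution over discrete depth bins per ray and take a soft expectation, which keeps gradients flowing into the depth logits. This soft-expectation stand-in, the bin layout, and the tensor names are assumptions for illustration, not the released implementation (which uses a reparameterized sampling scheme).
+ import torch
+
+ def gaussian_means_from_depth(depth_logits, depth_bins, ray_origins, ray_dirs):
+     """depth_logits: (N, B) per-ray scores over B depth bins; depth_bins: (B,);
+     ray_origins, ray_dirs: (N, 3). Returns differentiable 3D Gaussian means, one per ray."""
+     probs = torch.softmax(depth_logits, dim=-1)                # dense probability over depth
+     depth = (probs * depth_bins).sum(dim=-1, keepdim=True)     # soft, differentiable depth
+     return ray_origins + depth * ray_dirs                      # lift each ray to a 3D mean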
+
+
+
+
+ + ♻ ☆ ArtGPT-4: Towards Artistic-understanding Large Vision-Language Models + with Enhanced Adapter + + +
+ The success of large language models (LLMs) has inspired an emerging research +field of multimodal learning. However, a grand challenge of exploiting LLMs for +multimodal learning is the size of pre-trained LLMs, which typically have +billions of parameters. To tackle this challenge, models such as MiniGPT-4 and +LLaVA have been developed to fine-tune the pre-trained models using fewer +parameters. Despite their promising performance, these models remain limited in +their understanding of artistic imagery. To facilitate better +artistic-understanding, in this paper, we propose ArtGPT-4, a pioneering large +vision-language model tailored to address the limitations of existing models in +artistic comprehension. The key innovation of ArtGPT-4 lies in its design for +the sophisticated challenge of artistic image comprehension, setting it apart +from other models that overlook fine details for broader themes. Specifically, +it works by integrating specialized adapter layers into the LLM, enabling +the model to more efficiently and effectively parse and interpret complex +visual tokens, instead of fine-tuning the whole LLM as in existing methods. +ArtGPT-4 has demonstrated outstanding efficiency: +utilizing a Tesla A100 device, its training can be completed in a mere 2 hours +with an image-text pair dataset comprising approximately 0.52M entries. +Additionally, ArtGPT-4 has also achieved state-of-the-art performance on the +ArtEmis and ArtEmis-v2.0 datasets as well as the benchmarks established in this +work, lagging behind professional artists' descriptions by a negligible 0.15 +points on a 6-point scale. The outstanding performance of ArtGPT-4 shows that +it can interpret images with artistic understanding and convey the emotions +they inspire, mirroring human interpretation. The code and the pre-trained +model are accessible at \url{https://github.com/DLYuanGod/ArtGPT-4}. + +
+
+
+
+
+ + ♻ ☆ TinyGPT-V: Efficient Multimodal Large Language Model via Small Backbones + + +
+ In recent years, multimodal large language models (MLLMs) such as GPT-4V have +demonstrated remarkable advancements, excelling in a variety of vision-language +tasks. Despite their prowess, the closed-source nature and computational +demands of such models limit their accessibility and applicability. This study +introduces TinyGPT-V, a novel open-source MLLM, designed for efficient training +and inference across various vision-language tasks, including image captioning +(IC) and visual question answering (VQA). Leveraging a compact yet powerful +architecture, TinyGPT-V integrates the Phi-2 language model with pre-trained +vision encoders, utilizing a unique mapping module for visual and linguistic +information fusion. With a training regimen optimized for small backbones and +employing a diverse dataset amalgam, TinyGPT-V requires significantly lower +computational resources (24GB for training and as little as 8GB for inference) +without compromising on performance. Our experiments demonstrate that +TinyGPT-V, with its 2.8 billion-parameter language model, achieves comparable +results in VQA and image inference tasks to its larger counterparts while being +uniquely suited for deployment on resource-constrained devices through +innovative quantization techniques. This work not only paves the way for more +accessible and efficient MLLMs but also underscores the potential of smaller, +optimized models in bridging the gap between high performance and computational +efficiency in real-world applications. Additionally, this paper introduces a +new approach to multimodal large language models using smaller backbones. Our +code and training weights are available at +\url{https://github.com/DLYuanGod/TinyGPT-V}. + +
+
+
+
+
+ + ♻ ☆ Few-shot point cloud reconstruction and denoising via learned Gaussian + splats renderings and fine-tuned diffusion features + + +
+ Existing deep learning methods for the reconstruction and denoising of point +clouds rely on small datasets of 3D shapes. We circumvent the problem by +leveraging deep learning methods trained on billions of images. We propose a +method to reconstruct point clouds from few images and to denoise point clouds +from their rendering by exploiting prior knowledge distilled from image-based +deep learning models. To improve reconstruction in constrained settings, we +regularize the training of a differentiable renderer with hybrid surface and +appearance by introducing semantic consistency supervision. In addition, we +propose a pipeline to finetune Stable Diffusion to denoise renderings of noisy +point clouds and we demonstrate how these learned filters can be used to remove +point cloud noise without 3D supervision. We compare our method with DSS +and PointRadiance and achieve higher-quality 3D reconstruction on the +Sketchfab Testset and SCUT Dataset. + +
+
+
+
+
+ + ♻ ☆ 3D scene generation from scene graphs and self-attention + + +
+ Synthesizing realistic and diverse indoor 3D scene layouts in a controllable +fashion opens up applications in simulated navigation and virtual reality. As +concise and robust representations of a scene, scene graphs have proven to be +well-suited for semantic control of the generated layout. We present a +variant of the conditional variational autoencoder (cVAE) model to synthesize +3D scenes from scene graphs and floor plans. We exploit the properties of +self-attention layers to capture high-level relationships between objects in a +scene, and use these as the building blocks of our model. Our model leverages +graph transformers to estimate the size, dimension and orientation of the +objects in a room while satisfying relationships in the given scene graph. Our +experiments show that self-attention layers lead to sparser (7.9x compared to +Graphto3D) and more diverse scenes (16%). + +
+
+
+
+
+ + ♻ ☆ Neural Field Convolutions by Repeated Differentiation + + +
+ Neural fields are evolving towards a general-purpose continuous +representation for visual computing. Yet, despite their numerous appealing +properties, they are hardly amenable to signal processing. As a remedy, we +present a method to perform general continuous convolutions with general +continuous signals such as neural fields. Observing that piecewise polynomial +kernels reduce to a sparse set of Dirac deltas after repeated differentiation, +we leverage convolution identities and train a repeated integral field to +efficiently execute large-scale convolutions. We demonstrate our approach on a +variety of data modalities and spatially-varying kernels. + +
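+ The identity exploited above can be stated compactly for the 1D case; this is a standard convolution identity and only a sketch of the paper's construction, with \(I^{n}\) denoting n-fold integration and \(D^{n}\) the n-th derivative:
+ \[
+   (f * g)(x) \;=\; \big( I^{n} f \,*\, D^{n} g \big)(x),
+   \qquad
+   D^{n} g \;=\; \sum_i w_i\, \delta(\,\cdot\, - x_i) \ \text{ for a piecewise-polynomial kernel } g,
+ \]
+ so the continuous convolution collapses to a sparse weighted sum of samples of the repeated integral field \(I^{n} f\), which is the quantity the method trains.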
+
+
+
+
+
+
+
+
+ +
+
+
+ + Robotics 31 + +
+
+
+ + ☆ LidarDM: Generative LiDAR Simulation in a Generated World + + +
+ We present LidarDM, a novel LiDAR generative model capable of producing +realistic, layout-aware, physically plausible, and temporally coherent LiDAR +videos. LidarDM stands out with two unprecedented capabilities in LiDAR +generative modeling: (i) LiDAR generation guided by driving scenarios, offering +significant potential for autonomous driving simulations, and (ii) 4D LiDAR +point cloud generation, enabling the creation of realistic and temporally +coherent sequences. At the heart of our model is a novel integrated 4D world +generation framework. Specifically, we employ latent diffusion models to +generate the 3D scene, combine it with dynamic actors to form the underlying 4D +world, and subsequently produce realistic sensory observations within this +virtual environment. Our experiments indicate that our approach outperforms +competing algorithms in realism, temporal coherency, and layout consistency. We +additionally show that LidarDM can be used as a generative world model +simulator for training and testing perception models. + +
+
+
+
+
+ + ☆ Learning Quadrupedal Locomotion via Differentiable Simulation + + +
+ The emergence of differentiable simulators enabling analytic gradient +computation has motivated a new wave of learning algorithms that hold the +potential to significantly increase sample efficiency over traditional +Reinforcement Learning (RL) methods. While recent research has demonstrated +performance gains in scenarios with comparatively smooth dynamics and, thus, +smooth optimization landscapes, research on leveraging differentiable +simulators for contact-rich scenarios, such as legged locomotion, is scarce. +This may be attributed to the discontinuous nature of contact, which introduces +several challenges to optimizing with analytic gradients. The purpose of this +paper is to determine if analytic gradients can be beneficial even in the face +of contact. Our investigation focuses on the effects of different soft and hard +contact models on the learning process, examining optimization challenges +through the lens of contact simulation. We demonstrate the viability of +employing analytic gradients to learn physically plausible locomotion skills +with a quadrupedal robot using Short-Horizon Actor-Critic (SHAC), a learning +algorithm leveraging analytic gradients, and draw a comparison to a +state-of-the-art RL algorithm, Proximal Policy Optimization (PPO), to +understand the benefits of analytic gradients. + +
+
+
+
+
+ + ☆ A Survey of Optimization-based Task and Motion Planning: From Classical + To Learning Approaches + + +
+ Task and Motion Planning (TAMP) integrates high-level task planning and +low-level motion planning to equip robots with the autonomy to effectively +reason over long-horizon, dynamic tasks. Optimization-based TAMP focuses on +hybrid optimization approaches that define goal conditions via objective +functions and are capable of handling open-ended goals, robotic dynamics, and +physical interaction between the robot and the environment. Therefore, +optimization-based TAMP is particularly suited to solve highly complex, +contact-rich locomotion and manipulation problems. This survey provides a +comprehensive review of optimization-based TAMP, covering (i) planning domain +representations, including action description languages and temporal logic, +(ii) individual solution strategies for components of TAMP, including AI +planning and trajectory optimization (TO), and (iii) the dynamic interplay +between logic-based task planning and model-based TO. A particular focus of +this survey is to highlight the algorithm structures to efficiently solve TAMP, +especially hierarchical and distributed approaches. Additionally, the survey +emphasizes the synergy between the classical methods and contemporary +learning-based innovations such as large language models. Furthermore, +future research directions for TAMP are discussed, highlighting +both algorithmic and application-specific challenges. + +
+
+ comment: 24 pages, 12 figures, submitted for review +
+
+
+
+
+ + ☆ Planning for Robust Open-loop Pushing: Exploiting Quasi-static Belief + Dynamics and Contact-informed Optimization IJRR + + +
+ Non-prehensile manipulation such as pushing is typically subject to +uncertain, non-smooth dynamics. However, modeling the uncertainty of the +dynamics typically results in intractable belief dynamics, making +data-efficient planning under uncertainty difficult. This article focuses on +the problem of efficiently generating robust open-loop pushing plans. First, we +investigate how the belief over object configurations propagates through +quasi-static contact dynamics. We exploit the simplified dynamics to predict +the variance of the object configuration without sampling from a perturbation +distribution. In a sampling-based trajectory optimization algorithm, the gain +of the variance is constrained in order to enforce robustness of the plan. +Second, we propose an informed trajectory sampling mechanism for drawing robot +trajectories that are likely to make contact with the object. This sampling +mechanism is shown to significantly improve chances of finding robust +solutions, especially when making-and-breaking contacts is required. We +demonstrate that the proposed approach is able to synthesize bi-manual pushing +trajectories, resulting in successful long-horizon pushing maneuvers without +exteroceptive feedback such as vision or tactile feedback. + +
+
+ comment: submitted to the International Journal of Robotics Research (IJRR) +
+
+
+
+
+ + ☆ Forming Large Patterns with Local Robots in the OBLOT Model + + +
+ In the arbitrary pattern formation problem, $n$ autonomous, mobile robots +must form an arbitrary pattern $P \subseteq \mathbb{R}^2$. The (deterministic) +robots are typically assumed to be indistinguishable, disoriented, and unable +to communicate. An important distinction is whether robots have memory and/or a +limited viewing range. Previous work managed to form $P$ under a natural +symmetry condition if robots have no memory but an unlimited viewing range [22] +or if robots have a limited viewing range but memory [25]. In the latter case, +$P$ is only formed in a shrunk version that has constant diameter. + Without memory and with limited viewing range, forming arbitrary patterns +remains an open problem. We provide a partial solution by showing that $P$ can +be formed under the same symmetry condition if the robots' initial diameter is +$\leq 1$. Our protocol partitions $P$ into rotation-symmetric components and +exploits the initial mutual visibility to form one cluster per component. Using +a careful placement of the clusters and their robots, we show that a cluster +can move in a coordinated way through its component while drawing $P$ by +dropping one robot per pattern coordinate. + +
+
+ comment: 24 pages, 3 figures, submitted for SAND 2024, version with extended + appendix +
+
+
+
+
+ + ☆ Unsupervised Learning of Effective Actions in Robotics + + +
+ Learning actions that are relevant to decision-making and can be executed +effectively is a key problem in autonomous robotics. Current state-of-the-art +action representations in robotics lack proper effect-driven learning of the +robot's actions. Although successful in solving manipulation tasks, deep +learning methods also lack this ability, in addition to their high cost in +terms of memory or training data. In this paper, we propose an unsupervised +algorithm to discretize a continuous motion space and generate "action +prototypes", each producing different effects in the environment. After an +exploration phase, the algorithm automatically builds a representation of the +effects and groups motions into action prototypes, where motions more likely to +produce an effect are represented more than those that lead to negligible +changes. We evaluate our method on a simulated stair-climbing reinforcement +learning task, and the preliminary results show that our effect driven +discretization outperforms uniformly and randomly sampled discretizations in +convergence speed and maximum reward. + +
+
+ comment: Accepted at The First Austrian Symposium on AI, Robotics, and Vision + (AIROV24) +
+
+
+
+
+ + ☆ One Stack to Rule them All: To Drive Automated Vehicles, and Reach for + the 4th level + + +
+ Most automated driving functions are designed for a specific task or vehicle. +Most often, the underlying architecture is fixed to specific algorithms to +increase performance. Therefore, it is not possible to deploy new modules and +algorithms easily. In this paper, we present our automated driving stack which +combines both scalability and adaptability. Due to the modular design, our +stack allows for a fast integration and testing of novel and state-of-the-art +research approaches. Furthermore, it is flexible to be used for our different +testing vehicles, including modified EasyMile EZ10 shuttles and different +passenger cars. These vehicles differ in multiple ways, e.g. sensor setups, +control systems, maximum speed, or steering angle limitations. Finally, our +stack is deployed in real world environments, including passenger transport in +urban areas. Our stack includes all components needed for operating an +autonomous vehicle, including localization, perception, planning, controller, +and additional safety modules. Our stack is developed, tested, and evaluated in +real world traffic in multiple test sites, including the Test Area Autonomous +Driving Baden-W\"urttemberg. + +
+
+
+
+
+ + ☆ Leveraging Swarm Intelligence to Drive Autonomously: A Particle Swarm + Optimization based Approach to Motion Planning + + +
+ Motion planning is an essential part of autonomous mobile platforms. A good +pipeline should be modular enough to handle different vehicles, environments, +and perception modules. The planning process has to cope with all the different +modalities and has to have a modular and flexible design. But most importantly, +it has to be safe and robust. In this paper, we present our motion +planning pipeline with particle swarm optimization (PSO) at its core. This +solution is independent of the vehicle type and has a clear and +simple-to-implement interface for perception modules. Moreover, the approach +stands out for being easily adaptable to new scenarios. Parallel calculation +allows for fast planning cycles. Following the principles of PSO, the +trajectory planner first generates a swarm of initial trajectories that are +optimized afterward. We present the underlying control space and inner +workings. Finally, the application to real-world automated driving is shown in +the evaluation with a deeper look at the modeling of the cost function. The +approach is used in our automated shuttles that have already driven more than +3,500 km safely and entirely autonomously in sub-urban everyday traffic. + +
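+ A minimal particle swarm optimization loop over trajectory parameters, in the spirit of the pipeline described above; the cost callable, parameter bounds, and dimensions are placeholders for illustration, not the authors' implementation.
+ import numpy as np
+
+ def pso_plan(cost, dim=10, n_particles=64, iters=50, w=0.7, c1=1.5, c2=1.5, seed=0):
+     """Minimize `cost` over trajectory parameters in [-1, 1]^dim (illustrative PSO sketch)."""
+     rng = np.random.default_rng(seed)
+     x = rng.uniform(-1.0, 1.0, (n_particles, dim))   # swarm of candidate trajectories
+     v = np.zeros_like(x)
+     pbest, pbest_cost = x.copy(), np.array([cost(p) for p in x])
+     gbest = pbest[pbest_cost.argmin()]
+     for _ in range(iters):
+         r1, r2 = rng.random(x.shape), rng.random(x.shape)
+         v = w * v + c1 * r1 * (pbest - x) + c2 * r2 * (gbest - x)
+         x = np.clip(x + v, -1.0, 1.0)
+         c = np.array([cost(p) for p in x])
+         improved = c < pbest_cost
+         pbest[improved], pbest_cost[improved] = x[improved], c[improved]
+         gbest = pbest[pbest_cost.argmin()]
+     return gbest                                      # best trajectory parameters found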
+
+
+
+
+ + ☆ Determining the Tactical Challenge of Scenarios to Efficiently Test + Automated Driving Systems + + +
+ The selection of relevant test scenarios for the scenario-based testing and +safety validation of automated driving systems (ADSs) remains challenging. An +important aspect of the relevance of a scenario is the challenge it poses for +an ADS. Existing methods for calculating the challenge of a scenario aim to +express the challenge in terms of a metric value. Metric values are useful to +select the least or most challenging scenario. However, they fail to provide +human-interpretable information on the cause of the challenge which is critical +information for the efficient selection of relevant test scenarios. Therefore, +this paper presents the Challenge Description Method that mitigates this issue +by analyzing scenarios and providing a description of their challenge in terms +of the minimum required lane changes and their difficulty. Applying the method +to different highway scenarios showed that it is capable of analyzing complex +scenarios and providing easy-to-understand descriptions that can be used to +select relevant test scenarios. + +
+
+ comment: 6 pages, 3 figures, 2 tables; Accepted to be published as part of the + 35th IEEE Intelligent Vehicles Symposium (IV), Jeju Shinhwa World, Jeju + Island, Korea, June 2-5, 2024 +
+
+
+
+
+ + ☆ SliceIt! -- A Dual Simulator Framework for Learning Robot Food Slicing ICRA 2024 + + +
+ Cooking robots can enhance the home experience by reducing the burden of +daily chores. However, these robots must perform their tasks dexterously and +safely in shared human environments, especially when handling dangerous tools +such as kitchen knives. This study focuses on enabling a robot to autonomously +and safely learn food-cutting tasks. More specifically, our goal is to enable a +collaborative robot or industrial robot arm to perform food-slicing tasks by +adapting to varying material properties using compliance control. Our approach +involves using Reinforcement Learning (RL) to train a robot to compliantly +manipulate a knife, by reducing the contact forces exerted by the food items +and by the cutting board. However, training the robot in the real world can be +inefficient and dangerous, and can result in a lot of food waste. Therefore, we +propose SliceIt!, a framework for safely and efficiently learning robot +food-slicing tasks in simulation. Following a real2sim2real approach, our +framework consists of collecting a small amount of real food-slicing data, calibrating our +dual simulation environment (a high-fidelity cutting simulator and a robotic +simulator), learning compliant control policies on the calibrated simulation +environment, and finally, deploying the policies on the real robot. + +
+
+ comment: Accepted to ICRA 2024 +
+
+
+
+
+ + ☆ Fusing Multi-sensor Input with State Information on TinyML Brains for + Autonomous Nano-drones + + +
+ Autonomous nano-drones (~10 cm in diameter), thanks to their ultra-low power +TinyML-based brains, are capable of coping with real-world environments. +However, due to their simplified sensors and compute units, they are still far +from the sense-and-act capabilities shown in their bigger counterparts. This +system paper presents a novel deep learning-based pipeline that fuses +multi-sensorial input (i.e., low-resolution images and 8x8 depth map) with the +robot's state information to tackle a human pose estimation task. Thanks to our +design, the proposed system -- trained in simulation and tested on a real-world +dataset -- improves a state-unaware State-of-the-Art baseline by increasing the +R^2 regression metric up to 0.10 on the distance's prediction. + +
+
+
+
+
+ + ☆ Versatile Scene-Consistent Traffic Scenario Generation as Optimization + with Diffusion + + +
+ Generating realistic and controllable agent behaviors in traffic simulation +is crucial for the development of autonomous vehicles. This problem is often +formulated as imitation learning (IL) from real-world driving data by either +directly predicting future trajectories or inferring cost functions with +inverse optimal control. In this paper, we draw a conceptual connection between +IL and diffusion-based generative modeling and introduce a novel framework, +Versatile Behavior Diffusion (VBD), to simulate interactive scenarios with +multiple traffic participants. Our model not only generates scene-consistent +multi-agent interactions but also enables scenario editing through multi-step +guidance and refinement. Experimental evaluations show that VBD achieves +state-of-the-art performance on the Waymo Sim Agents benchmark. In addition, we +illustrate the versatility of our model by adapting it to various applications. +VBD is capable of producing scenarios conditioned on priors, integrating with +model-based optimization, sampling multi-modal scene-consistent scenarios by +fusing marginal predictions, and generating safety-critical scenarios when +combined with a game-theoretic solver. + +
+
+
+
+
+ + ☆ On-the-Go Tree Detection and Geometric Traits Estimation with Ground + Mobile Robots in Fruit Tree Groves + + +
+ By-tree information gathering is an essential task in precision agriculture +achieved by ground mobile sensors, but it can be time- and labor-intensive. In +this paper we present an algorithmic framework to perform real-time and +on-the-go detection of trees and key geometric characteristics (namely, width +and height) with wheeled mobile robots in the field. Our method is based on the +fusion of 2D domain-specific data (normalized difference vegetation index +[NDVI] acquired via a red-green-near-infrared [RGN] camera) and 3D LiDAR point +clouds, via a customized tree landmark association and parameter estimation +algorithm. The proposed system features a multi-modal and entropy-based +landmark correspondences approach, integrated into an underlying Kalman filter +system to recognize the surrounding trees and jointly estimate their spatial +and vegetation-based characteristics. Realistic simulated tests are used to +evaluate our proposed algorithm's behavior in a variety of settings. Physical +experiments in agricultural fields help validate our method's efficacy in +acquiring accurate by-tree information on-the-go and in real-time by employing +only onboard computational and sensing resources. + +
+
+ comment: 7 pages, 6 figures +
+
+
+
+
+ + ☆ Tightly-Coupled LiDAR-IMU-Wheel Odometry with Online Calibration of a + Kinematic Model for Skid-Steering Robots + + +
+ Tunnels and long corridors are challenging environments for mobile robots +because LiDAR point clouds tend to degenerate in these environments. To tackle +point cloud degeneration, this study presents a tightly-coupled LiDAR-IMU-wheel +odometry algorithm with online calibration for skid-steering robots. We +propose a full linear wheel odometry factor, which not only serves as a motion +constraint but also performs the online calibration of kinematic models for +skid-steering robots. Despite the dynamically changing kinematic model (e.g., +wheel radii changes caused by tire pressures) and terrain conditions, our +method can address the model error via online calibration. Moreover, our method +enables accurate localization in degenerate environments, such as +long and straight corridors, by performing calibration while the LiDAR-IMU fusion +operates sufficiently well. Furthermore, we estimate the uncertainty (i.e., +covariance matrix) of the wheel odometry online for creating a reasonable +constraint. The proposed method is validated through three experiments. The +first indoor experiment shows that the proposed method is robust in severe +degeneracy cases (long corridors) and changes in the wheel radii. The second +outdoor experiment demonstrates that our method accurately estimates the sensor +trajectory despite being in rough outdoor terrain owing to online uncertainty +estimation of wheel odometry. The third experiment shows that the proposed online +calibration enables robust odometry estimation in changing terrains. + +
+
+
+
+
+ + ☆ Safe Returning FaSTrack with Robust Control Lyapunov-Value Functions + + +
+ Real-time navigation in a priori unknown environment remains a challenging +task, especially when an unexpected (unmodeled) disturbance occurs. In this +paper, we propose the framework Safe Returning Fast and Safe Tracking (SR-F) +that merges concepts from 1) Robust Control Lyapunov-Value Functions (R-CLVF), +and 2) the Fast and Safe Tracking (FaSTrack) framework. The SR-F computes an +R-CLVF offline between a model of the true system and a simplified planning +model. Online, a planning algorithm is used to generate a trajectory in the +simplified planning space, and the R-CLVF is used to provide a tracking +controller that exponentially stabilizes to the planning model. When an +unexpected disturbance occurs, the proposed SR-F algorithm provides a means for +the true system to recover to the planning model. We take advantage of this +mechanism to induce an artificial disturbance by ``jumping'' the planning model +in open environments, forcing faster navigation. Therefore, this algorithm can +both reject unexpected true disturbances and accelerate navigation speed. We +validate our framework using a 10D quadrotor system and show that SR-F is +empirically 20\% faster than the original FaSTrack while maintaining safety. + +
+
+ comment: 6 pages, 4 figures, 1 table, 2 algorithms. Submitted to LCSS on 03/06 +
+
+
+
+
+ + ☆ Decision Transformer as a Foundation Model for Partially Observable + Continuous Control + + +
+ Closed-loop control of nonlinear dynamical systems with partial-state +observability demands expert knowledge of a diverse, less standardized set of +theoretical tools. Moreover, it requires a delicate integration of controller +and estimator designs to achieve the desired system behavior. To establish a +general controller synthesis framework, we explore the Decision Transformer +(DT) architecture. Specifically, we first frame the control task as predicting +the current optimal action based on past observations, actions, and rewards, +eliminating the need for a separate estimator design. Then, we leverage the +pre-trained language models, i.e., the Generative Pre-trained Transformer (GPT) +series, to initialize DT and subsequently train it for control tasks using +low-rank adaptation (LoRA). Our comprehensive experiments across five distinct +control tasks, ranging from maneuvering aerospace systems to controlling +partial differential equations (PDEs), demonstrate DT's capability to capture +the parameter-agnostic structures intrinsic to control tasks. DT exhibits +remarkable zero-shot generalization abilities for completely new tasks and +rapidly surpasses expert performance levels with a minimal amount of +demonstration data. These findings highlight the potential of DT as a +foundational controller for general control applications. + +
+
+ comment: Submitted to CDC 2024 +
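+ To make the control-as-sequence-prediction framing above concrete, here is a minimal sketch of packing past returns, observations, and actions into one token sequence for a GPT-style backbone; the class name, layer choices, and dimensions are illustrative assumptions rather than the paper's code.
+ import torch
+ import torch.nn as nn
+
+ class DTTokenizer(nn.Module):
+     """Embed (return, observation, action) triplets to a shared width for a GPT backbone."""
+     def __init__(self, obs_dim: int, act_dim: int, d_model: int = 128):
+         super().__init__()
+         self.embed_r = nn.Linear(1, d_model)
+         self.embed_o = nn.Linear(obs_dim, d_model)
+         self.embed_a = nn.Linear(act_dim, d_model)
+
+     def forward(self, returns, obs, acts):
+         # returns: (B, T, 1), obs: (B, T, obs_dim), acts: (B, T, act_dim)
+         r, o, a = self.embed_r(returns), self.embed_o(obs), self.embed_a(acts)
+         seq = torch.stack([r, o, a], dim=2).flatten(1, 2)   # (B, 3T, d_model)
+         return seq   # fed to the (LoRA-adapted) GPT, which predicts the next action token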
+
+
+
+
+ + ☆ Multi-Robot Planning for Filming Groups of Moving Actors Leveraging + Submodularity and Pixel Density + + +
+ Observing and filming a group of moving actors with a team of aerial robots +is a challenging problem that combines elements of multi-robot coordination, +coverage, and view planning. A single camera may observe multiple actors at +once, and the robot team may observe individual actors from multiple views. As +actors move about, groups may split, merge, and reform, and robots filming +these actors should be able to adapt smoothly to such changes in actor +formations. Rather than adopt an approach based on explicit formations or +assignments, we propose an approach based on optimizing views directly. We +model actors as moving polyhedra and compute approximate pixel densities for +each face and camera view. Then, we propose an objective that exhibits +diminishing returns as pixel densities increase from repeated observation. This +gives rise to a multi-robot perception planning problem which we solve via a +combination of value iteration and greedy submodular maximization. We evaluate +our approach on challenging scenarios modeled after various kinds of social +behaviors and featuring different numbers of robots and actors and observe that +robot assignments and formations arise implicitly based on the movements of +groups of actors. Simulation results demonstrate that our approach consistently +outperforms baselines, and in addition to performing well with the planner's +approximation of pixel densities our approach also performs comparably for +evaluation based on rendered views. Overall, the multi-round variant of the +sequential planner we propose meets (within 1%) or exceeds the formation and +assignment baselines in all scenarios we consider. + +
+
+ comment: 10 pages, 5 figures, submitted to CDC 2024 +
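+ The diminishing-returns objective and greedy selection mentioned above can be sketched as follows; the concave square-root saturation and the data layout are assumptions for illustration, not the paper's exact objective or planner.
+ import numpy as np
+
+ def greedy_view_selection(pixel_density, budget):
+     """pixel_density[v, f]: approximate pixel density view v contributes to actor face f.
+     Greedily pick `budget` views maximizing a concave (diminishing-returns) coverage score."""
+     n_views, n_faces = pixel_density.shape
+     covered = np.zeros(n_faces)
+     chosen = []
+     score = lambda cov: np.sqrt(cov).sum()   # concave, so marginal gains shrink (submodular)
+     for _ in range(budget):
+         cands = [v for v in range(n_views) if v not in chosen]
+         gains = [score(covered + pixel_density[v]) - score(covered) for v in cands]
+         best = cands[int(np.argmax(gains))]
+         chosen.append(best)
+         covered += pixel_density[best]
+     return chosen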
+
+
+
+
+ + ☆ Low Frequency Sampling in Model Predictive Path Integral Control RA-L + + +
+ Sampling-based model-predictive controllers have become a powerful +optimization tool for planning and control problems in various challenging +environments. In this paper, we show how the default choice of uncorrelated +Gaussian distributions can be improved upon with the use of a colored noise +distribution. Our choice of distribution allows for the emphasis on low +frequency control signals, which can result in smoother and more exploratory +samples. We use this frequency-based sampling distribution with Model +Predictive Path Integral (MPPI) in both hardware and simulation experiments to +show better or equal performance on systems with various speeds of input +response. + +
+
+ comment: Accepted to RA-L +
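+ One simple way to draw the low-frequency (colored) noise samples referred to above is to shape white noise in the frequency domain; the spectral exponent, normalization, and horizon below are placeholders, not the paper's configuration.
+ import numpy as np
+
+ def colored_noise(n_samples, horizon, beta=1.0, seed=0):
+     """Sample control perturbations whose power spectrum falls off roughly as 1/f^beta
+     (beta = 0 recovers white noise; larger beta emphasizes low frequencies)."""
+     rng = np.random.default_rng(seed)
+     freqs = np.fft.rfftfreq(horizon)
+     scale = np.ones_like(freqs)
+     scale[1:] = freqs[1:] ** (-beta / 2.0)   # amplitude shaping; DC bin left untouched
+     spectrum = (rng.standard_normal((n_samples, freqs.size))
+                 + 1j * rng.standard_normal((n_samples, freqs.size))) * scale
+     noise = np.fft.irfft(spectrum, n=horizon, axis=-1)
+     return noise / noise.std(axis=-1, keepdims=True)   # unit variance per sample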
+
+
+
+
+ + ☆ Unsupervised, Bottom-up Category Discovery for Symbol Grounding with a + Curious Robot + + +
+ Towards addressing the Symbol Grounding Problem and motivated by early +childhood language development, we leverage a robot which has been equipped +with an approximate model of curiosity with particular focus on bottom-up +building of unsupervised categories grounded in the physical world. That is, +rather than starting with a top-down symbol (e.g., a word referring to an +object) and providing meaning through the application of predetermined samples, +the robot autonomously and gradually breaks up its exploration space into a +series of increasingly specific unlabeled categories at which point an external +expert may optionally provide a symbol association. We extend prior work by +using a robot that can observe the visual world, introducing a higher +dimensional sensory space, and using a more generalizable method of category +building. Our experiments show that the robot learns categories based on +actions and what it visually observes, and that those categories can be +symbolically grounded. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Self-supervised 6-DoF Robot Grasping by Demonstration via Augmented + Reality Teleoperation System + + +
+ Most existing 6-DoF robot grasping solutions depend on strong supervision on +grasp pose to ensure satisfactory performance, which could be laborious and +impractical when the robot works in some restricted area. To this end, we +propose a self-supervised 6-DoF grasp pose detection framework via an Augmented +Reality (AR) teleoperation system that can efficiently learn human +demonstrations and provide 6-DoF grasp poses without grasp pose annotations. +Specifically, the system collects the human demonstration from the AR +environment and contrastively learns the grasping strategy from the +demonstration. In real-world experiments, the proposed system achieves +satisfactory grasping performance and learns to grasp unknown objects within +three demonstrations. + +
+
+
+
+
+ + ☆ Distributionally Robust Policy and Lyapunov-Certificate Learning + + +
+ This article presents novel methods for synthesizing distributionally robust +stabilizing neural controllers and certificates for control systems under model +uncertainty. A key challenge in designing controllers with stability guarantees +for uncertain systems is the accurate determination of and adaptation to shifts +in model parametric uncertainty during online deployment. We tackle this with a +novel distributionally robust formulation of the Lyapunov derivative chance +constraint ensuring a monotonic decrease of the Lyapunov certificate. To avoid +the computational complexity involved in dealing with the space of probability +measures, we identify a sufficient condition in the form of deterministic +convex constraints that ensures the Lyapunov derivative constraint is +satisfied. We integrate this condition into a loss function for training a +neural network-based controller and show that, for the resulting closed-loop +system, the global asymptotic stability of its equilibrium can be certified +with high confidence, even with Out-of-Distribution (OoD) model uncertainties. +To demonstrate the efficacy and efficiency of the proposed methodology, we +compare it with an uncertainty-agnostic baseline approach and several +reinforcement learning approaches in two control problems in simulation. + +
+
+ comment: Submitted to IEEE Open Journal of Control Systems +
+
+
+
+
+ + ♻ ☆ Implicit Neural Representations for Breathing-compensated Volume + Reconstruction in Robotic Ultrasound + + +
+ Ultrasound (US) imaging is widely used in diagnosing and staging abdominal +diseases due to its lack of non-ionizing radiation and prevalent availability. +However, significant inter-operator variability and inconsistent image +acquisition hinder the widespread adoption of extensive screening programs. +Robotic ultrasound systems have emerged as a promising solution, offering +standardized acquisition protocols and the possibility of automated +acquisition. Additionally, these systems enable access to 3D data via robotic +tracking, enhancing volumetric reconstruction for improved ultrasound +interpretation and precise disease diagnosis. However, the interpretability of +3D US reconstruction of abdominal images can be affected by the patient's +breathing motion. This study introduces a method to compensate for breathing +motion in 3D US compounding by leveraging implicit neural representations. Our +approach employs a robotic ultrasound system for automated screenings. To +demonstrate the method's effectiveness, we evaluate our proposed method for the +diagnosis and monitoring of abdominal aorta aneurysms as a representative use +case. Our experiments demonstrate that our proposed pipeline facilitates robust +automated robotic acquisition, mitigating artifacts from breathing motion, and +yields smoother 3D reconstructions for enhanced screening and medical +diagnosis. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ An evaluation of CFEAR Radar Odometry + + +
+ This article describes the method CFEAR Radar odometry, submitted to a +competition at the Radar in Robotics workshop, ICRA 20241. CFEAR is an +efficient and accurate method for spinning 2D radar odometry that generalizes +well across environments. This article presents an overview of the odometry +pipeline with new experiments on the public Boreas dataset. We show that a +real-time capable configuration of CFEAR - with its original parameter set - +yields surprisingly low drift in the Boreas dataset. Additionally, we discuss +an improved implementation and solving strategy that enables the most accurate +configuration to run in real-time with improved robustness, reaching as low as +0.61% translation drift at a frame rate of 68 Hz. A recent release of the +source code is available to the community +https://github.com/dan11003/CFEAR_Radarodometry_code_public, and we publish the +evaluation from this article on https://github.com/dan11003/cfear_2024_workshop + +
+
+ comment: Updated with results from the test set in Boreas +
+
+
+
+
+ + ♻ ☆ ReCoRe: Regularized Contrastive Representation Learning of World Model CVPR 2024 + + +
+ While recent model-free Reinforcement Learning (RL) methods have demonstrated +human-level effectiveness in gaming environments, their success in everyday +tasks like visual navigation has been limited, particularly under significant +appearance variations. This limitation arises from (i) poor sample efficiency +and (ii) over-fitting to training scenarios. To address these challenges, we +present a world model that learns invariant features using (i) contrastive +unsupervised learning and (ii) an intervention-invariant regularizer. Learning +an explicit representation of the world dynamics i.e. a world model, improves +sample efficiency while contrastive learning implicitly enforces learning of +invariant features, which improves generalization. However, the na\"ive +integration of contrastive loss to world models is not good enough, as +world-model-based RL methods independently optimize representation learning and +agent policy. To overcome this issue, we propose an intervention-invariant +regularizer in the form of an auxiliary task such as depth prediction, image +denoising, image segmentation, etc., that explicitly enforces invariance to +style interventions. Our method outperforms current state-of-the-art +model-based and model-free RL methods and significantly improves on +out-of-distribution point navigation tasks evaluated on the iGibson benchmark. +With only visual observations, we further demonstrate that our approach +outperforms recent language-guided foundation models for point navigation, +which is essential for deployment on robots with limited computation +capabilities. Finally, we demonstrate that our proposed model excels at the +sim-to-real transfer of its perception module on the Gibson benchmark. + +
+
+ comment: Accepted at CVPR 2024. arXiv admin note: text overlap with + arXiv:2209.14932 +
+
+
+
+
+ + ♻ ☆ eWand: A calibration framework for wide baseline frame-based and + event-based camera systems ICRA 2024 + + +
+ Accurate calibration is crucial for using multiple cameras to triangulate the +position of objects precisely. However, it is also a time-consuming process +that needs to be repeated for every displacement of the cameras. The standard +approach is to use a printed pattern with known geometry to estimate the +intrinsic and extrinsic parameters of the cameras. The same idea can be applied +to event-based cameras, though it requires extra work. By using frame +reconstruction from events, a printed pattern can be detected. A blinking +pattern can also be displayed on a screen. Then, the pattern can be directly +detected from the events. Such calibration methods can provide accurate +intrinsic calibration for both frame- and event-based cameras. However, using +2D patterns has several limitations for multi-camera extrinsic calibration, +with cameras possessing highly different points of view and a wide baseline. +The 2D pattern can only be detected from one direction and needs to be of +significant size to compensate for its distance to the camera. This makes the +extrinsic calibration time-consuming and cumbersome. To overcome these +limitations, we propose eWand, a new method that uses blinking LEDs inside +opaque spheres instead of a printed or displayed pattern. Our method provides a +faster, easier-to-use extrinsic calibration approach that maintains high +accuracy for both event- and frame-based cameras. + +
+
+ comment: Accepted for 2024 IEEE International Conference on Robotics and + Automation (ICRA 2024). Project web page: + https://cogsys-tuebingen.github.io/ewand/ +
+
+
+
+
+ + ♻ ☆ Task-conditioned adaptation of visual features in multi-task policy + learning + + +
+ Successfully addressing a wide variety of tasks is a core ability of +autonomous agents, requiring flexibly adapting the underlying decision-making +strategies and, as we argue in this work, also adapting the perception modules. +An analogical argument would be the human visual system, which uses top-down +signals to focus attention determined by the current task. Similarly, we adapt +pre-trained large vision models conditioned on specific downstream tasks in the +context of multi-task policy learning. We introduce task-conditioned adapters +that do not require finetuning any pre-trained weights, combined with a single +policy trained with behavior cloning and capable of addressing multiple tasks. +We condition the visual adapters on task embeddings, which can be selected at +inference if the task is known, or alternatively inferred from a set of example +demonstrations. To this end, we propose a new optimization-based estimator. We +evaluate the method on a wide variety of tasks from the CortexBench benchmark +and show that, compared to existing work, it can be addressed with a single +policy. In particular, we demonstrate that adapting visual features is a key +design choice and that the method generalizes to unseen tasks given a few +demonstrations. + +
+
+
+
+
+ + ♻ ☆ Automatic Derivation of an Optimal Task Frame for Learning and + Controlling Contact-Rich Tasks + + +
+ This study investigates learning from demonstration (LfD) for contact-rich +tasks. The procedure for choosing a task frame to express the learned signals +for the motion and interaction wrench is often omitted or relies on expert insight. +This article presents a procedure to derive the optimal task frame from motion +and wrench data recorded during the demonstration. The procedure is based on +two principles that are hypothesized to underpin the control configuration +targeted by an expert, and assumes task frame origins and orientations that are +fixed to either the world or the robot tool. It is rooted in screw theory, is +entirely probabilistic and does not involve any hyperparameters. The procedure +was validated by demonstrating several tasks, including surface following and +manipulation of articulated objects, showing good agreement between the +obtained and the assumed expert task frames. To validate the performance of the +learned tasks by a UR10e robot, a constraint-based controller was designed +based on the derived task frames and the learned data expressed therein. These +experiments showed the effectiveness and versatility of the proposed approach. +The task frame derivation approach fills a gap in the state of the art of LfD, +bringing LfD for contact-rich tasks closer to practical application. + +
+
+
+
+
+ + ♻ ☆ Beyond Inverted Pendulums: Task-optimal Simple Models of Legged + Locomotion + + +
+ Reduced-order models (ROM) are popular in online motion planning due to their +simplicity. A good ROM for control captures critical task-relevant aspects of +the full dynamics while remaining low dimensional. However, planning within the +reduced-order space unavoidably constrains the full model, and hence we +sacrifice the full potential of the robot. In the community of legged +locomotion, this has led to a search for better model extensions, but many of +these extensions require human intuition, and there has been no +principled way of evaluating model performance and discovering new models. +In this work, we propose a model optimization algorithm that automatically +synthesizes reduced-order models, optimal with respect to a user-specified +distribution of tasks and corresponding cost functions. To demonstrate our +work, we optimized models for the bipedal robot Cassie. We show in simulation +that the optimal ROM reduces the cost of Cassie's joint torques by up to 23% +and increases its walking speed by up to 54%. We also show hardware results in which +the real robot walks on flat ground with 10% lower torque cost. All videos and +code can be found at https://sites.google.com/view/ymchen/research/optimal-rom. + +
+
+
+
+
+ + ♻ ☆ Logical Specifications-guided Dynamic Task Sampling for Reinforcement + Learning Agents + + +
+ Reinforcement Learning (RL) has made significant strides in enabling +artificial agents to learn diverse behaviors. However, learning an effective +policy often requires a large number of environment interactions. To mitigate +sample complexity issues, recent approaches have used high-level task +specifications, such as Linear Temporal Logic (LTL$_f$) formulas or Reward +Machines (RM), to guide the learning progress of the agent. In this work, we +propose a novel approach, called Logical Specifications-guided Dynamic Task +Sampling (LSTS), that learns a set of RL policies to guide an agent from an +initial state to a goal state based on a high-level task specification, while +minimizing the number of environmental interactions. Unlike previous work, LSTS +does not assume information about the environment dynamics or the Reward +Machine, and dynamically samples promising tasks that lead to successful goal +policies. We evaluate LSTS on a gridworld and show that it achieves improved +time-to-threshold performance on complex sequential decision-making problems +compared to state-of-the-art RM and Automaton-guided RL baselines, such as +Q-Learning for Reward Machines and Compositional RL from logical Specifications +(DIRL). Moreover, we demonstrate that our method outperforms RM and +Automaton-guided RL baselines in terms of sample-efficiency, both in a +partially observable robotic task and in a continuous control robotic +manipulation task. + +
+
+
+
+
+ + ♻ ☆ SayNav: Grounding Large Language Models for Dynamic Planning to + Navigation in New Environments + + +
+ Semantic reasoning and dynamic planning capabilities are crucial for an +autonomous agent to perform complex navigation tasks in unknown environments. +It requires a large amount of common-sense knowledge, which humans possess, to +succeed in these tasks. We present SayNav, a new approach that leverages human +knowledge from Large Language Models (LLMs) for efficient generalization to +complex navigation tasks in unknown large-scale environments. SayNav uses a +novel grounding mechanism that incrementally builds a 3D scene graph of the +explored environment as inputs to LLMs, for generating feasible and +contextually appropriate high-level plans for navigation. The LLM-generated +plan is then executed by a pre-trained low-level planner that treats each +planned step as a short-distance point-goal navigation sub-task. SayNav +dynamically generates step-by-step instructions during navigation and +continuously refines future steps based on newly perceived information. We +evaluate SayNav on the multi-object navigation (MultiON) task, which requires the +agent to utilize a massive amount of human knowledge to efficiently search for +multiple different objects in an unknown environment. We also introduce a +benchmark dataset for the MultiON task employing the ProcTHOR framework, which provides +large photo-realistic indoor environments with a variety of objects. SayNav +achieves state-of-the-art results and even outperforms an oracle-based baseline +with strong ground-truth assumptions by more than 8% in terms of success rate, +highlighting its ability to generate dynamic plans for successfully locating +objects in large-scale new environments. The code, benchmark dataset and +demonstration videos are accessible at +https://www.sri.com/ics/computer-vision/saynav. + +
+
+
+
+
+ + ♻ ☆ Simulation-based reinforcement learning for real-world autonomous + driving + + +
+ We use reinforcement learning in simulation to obtain a driving system +controlling a full-size real-world vehicle. The driving policy takes RGB images +from a single camera and their semantic segmentation as input. We use mostly +synthetic data, with labelled real-world data appearing only in the training of +the segmentation network. + Using reinforcement learning in simulation and synthetic data is motivated by +lowering costs and engineering effort. + In real-world experiments we confirm that we achieved successful sim-to-real +policy transfer. Based on the extensive evaluation, we analyze how design +decisions about perception, control, and training impact the real-world +performance. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 149 + +
+
+
+ + ☆ Visual Autoregressive Modeling: Scalable Image Generation via Next-Scale + Prediction + + +
+ We present Visual AutoRegressive modeling (VAR), a new generation paradigm +that redefines autoregressive learning on images as coarse-to-fine +"next-scale prediction" or "next-resolution prediction", diverging from the +standard raster-scan "next-token prediction". This simple, intuitive +methodology allows autoregressive (AR) transformers to learn visual +distributions fast and generalize well: VAR, for the first time, makes AR +models surpass diffusion transformers in image generation. On the ImageNet 256x256 +benchmark, VAR significantly improves the AR baseline, improving the Frechet inception +distance (FID) from 18.65 to 1.80, inception score (IS) from 80.4 to 356.4, +with around 20x faster inference speed. It is also empirically verified that +VAR outperforms the Diffusion Transformer (DiT) in multiple dimensions +including image quality, inference speed, data efficiency, and scalability. +Scaling up VAR models exhibits clear power-law scaling laws similar to those +observed in LLMs, with linear correlation coefficients near -0.998 as solid +evidence. VAR further showcases zero-shot generalization ability in downstream +tasks including image in-painting, out-painting, and editing. These results +suggest VAR has initially emulated the two important properties of LLMs: +Scaling Laws and zero-shot task generalization. We have released all models and +codes to promote the exploration of AR/VAR models for visual generation and +unified learning. + +
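+ The coarse-to-fine "next-scale prediction" loop described above can be sketched at a high level; `predict_scale` and `decode` are hypothetical method names standing in for a VAR-style transformer interface, not the released API.
+ def generate_var(model, scales=(1, 2, 4, 8, 16), cond=None):
+     """Autoregress over resolutions rather than raster positions (illustrative sketch)."""
+     token_maps = []
+     for s in scales:
+         # Condition on all coarser token maps produced so far and predict
+         # every token of the next, finer s x s map in one step.
+         token_maps.append(model.predict_scale(prefix=token_maps, size=s, cond=cond))
+     return model.decode(token_maps)   # decode the multi-scale token pyramid to an image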
+
+
+
+
+ + ☆ ALOHa: A New Measure for Hallucination in Captioning Models + + +
+ Despite recent advances in multimodal pre-training for visual description, +state-of-the-art models still produce captions containing errors, such as +hallucinating objects not present in a scene. The existing prominent metric for +object hallucination, CHAIR, is limited to a fixed set of MS COCO objects and +synonyms. In this work, we propose a modernized open-vocabulary metric, ALOHa, +which leverages large language models (LLMs) to measure object hallucinations. +Specifically, we use an LLM to extract groundable objects from a candidate +caption, measure their semantic similarity to reference objects from captions +and object detections, and use Hungarian matching to produce a final +hallucination score. We show that ALOHa correctly identifies 13.6% more +hallucinated objects than CHAIR on HAT, a new gold-standard subset of MS COCO +Captions annotated for hallucinations, and 30.8% more on nocaps, where objects +extend beyond MS COCO categories. Our code is available at +https://davidmchan.github.io/aloha/. + +
+
+ comment: To appear at NAACL 2024 +
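+ The Hungarian-matching step that turns object similarities into a hallucination score can be sketched with scipy; the scoring convention and the handling of unmatched objects here are assumptions for illustration, not the metric's official definition.
+ import numpy as np
+ from scipy.optimize import linear_sum_assignment
+
+ def hallucination_score(sim_matrix):
+     """sim_matrix[i, j]: semantic similarity in [0, 1] between candidate object i (from the
+     caption) and reference object j (from references/detections). Lower score = more hallucination."""
+     if sim_matrix.size == 0:
+         return 0.0
+     rows, cols = linear_sum_assignment(-sim_matrix)   # maximize total matched similarity
+     matched = sim_matrix[rows, cols]
+     # Candidate objects left unmatched (more candidates than references) contribute 0.
+     return float(matched.sum() / sim_matrix.shape[0])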
+
+
+
+
+ + ☆ LidarDM: Generative LiDAR Simulation in a Generated World + + +
+ We present LidarDM, a novel LiDAR generative model capable of producing +realistic, layout-aware, physically plausible, and temporally coherent LiDAR +videos. LidarDM stands out with two unprecedented capabilities in LiDAR +generative modeling: (i) LiDAR generation guided by driving scenarios, offering +significant potential for autonomous driving simulations, and (ii) 4D LiDAR +point cloud generation, enabling the creation of realistic and temporally +coherent sequences. At the heart of our model is a novel integrated 4D world +generation framework. Specifically, we employ latent diffusion models to +generate the 3D scene, combine it with dynamic actors to form the underlying 4D +world, and subsequently produce realistic sensory observations within this +virtual environment. Our experiments indicate that our approach outperforms +competing algorithms in realism, temporal coherency, and layout consistency. We +additionally show that LidarDM can be used as a generative world model +simulator for training and testing perception models. + +
+
+
+
+
+ + ☆ DeiT-LT Distillation Strikes Back for Vision Transformer Training on + Long-Tailed Datasets CVPR 2024 + + +
+ Vision Transformer (ViT) has emerged as a prominent architecture for various +computer vision tasks. In ViT, we divide the input image into patch tokens and +process them through a stack of self-attention blocks. However, unlike +Convolutional Neural Networks (CNNs), ViT's simple architecture has no +informative inductive bias (e.g., locality, etc.). Due to this, ViT requires a +large amount of data for pre-training. Various data efficient approaches (DeiT) +have been proposed to train ViT on balanced datasets effectively. However, +limited literature discusses the use of ViT for datasets with long-tailed +imbalances. In this work, we introduce DeiT-LT to tackle the problem of +training ViTs from scratch on long-tailed datasets. In DeiT-LT, we introduce an +efficient and effective way of distillation from CNN via a distillation DIST +token by using out-of-distribution images and re-weighting the distillation +loss to enhance focus on tail classes. This leads to the learning of local +CNN-like features in early ViT blocks, improving generalization for tail +classes. Further, to mitigate overfitting, we propose distilling from a flat +CNN teacher, which leads to learning low-rank generalizable features for DIST +tokens across all ViT blocks. With the proposed DeiT-LT scheme, the +distillation DIST token becomes an expert on the tail classes, and the +classifier CLS token becomes an expert on the head classes. The experts help to +effectively learn features corresponding to both the majority and minority +classes using a distinct set of tokens within the same ViT architecture. We +show the effectiveness of DeiT-LT for training ViT from scratch on datasets +ranging from small-scale CIFAR-10 LT to large-scale iNaturalist-2018. + +
+
+ comment: CVPR 2024. Project Page: https://rangwani-harsh.github.io/DeiT-LT +
+
+
+
+
+ + ☆ MatAtlas: Text-driven Consistent Geometry Texturing and Material + Assignment + + +
+ We present MatAtlas, a method for consistent text-guided 3D model texturing. Following recent progress, we leverage a large-scale text-to-image generation model (e.g., Stable Diffusion) as a prior to texture a 3D model. We carefully design an RGB texturing pipeline that leverages a grid-pattern diffusion, driven by depth and edges. By proposing a multi-step texture refinement process, we significantly improve the quality and 3D consistency of the texturing output. To further address the problem of baked-in lighting, we move beyond RGB colors and pursue assigning parametric materials to the assets. Given the high-quality initial RGB texture, we propose a novel material retrieval method that capitalizes on large language models (LLMs), enabling editability and relightability. We evaluate our method on a wide variety of geometries and show that our method significantly outperforms prior art. We also analyze the role of each component through a detailed ablation study. + +
+
+
+
+
+ + ☆ Deep Image Composition Meets Image Forgery + + +
+ Image forgery is a topic that has been studied for many years. Before the breakthrough of deep learning, forged images were detected using handcrafted features that did not require training. These traditional methods failed to perform satisfactorily even on datasets of far lower quality than real-life image manipulations. Advances in deep learning have impacted image forgery detection as much as they have impacted other areas of computer vision and have improved the state of the art. Deep learning models require large amounts of labeled data for training. In the case of image forgery, labeled data at the pixel level is a very important factor for the models to learn. None of the existing datasets have sufficient size, realism and pixel-level labeling at the same time. This is due to the high cost of producing and labeling quality images. It can take hours for an image editing expert to manipulate just one image. To bridge this gap, we automate data generation using image composition techniques that are closely related to image forgery. Unlike other automated data generation frameworks, we use state-of-the-art image composition deep learning models to generate spliced images close to the quality of real-life manipulations. Finally, we test the generated dataset on the SOTA image manipulation detection model and show that its prediction performance is lower compared to existing datasets, i.e., we produce realistic images that are more difficult to detect. The dataset will be available at https://github.com/99eren99/DIS25k . + +
+
+
+
+
+ + ☆ Steganographic Passport: An Owner and User Verifiable Credential for + Deep Model IP Protection Without Retraining + + +
+ Ensuring the legal usage of deep models is crucial to promoting trustable, +accountable, and responsible artificial intelligence innovation. Current +passport-based methods that obfuscate model functionality for license-to-use +and ownership verifications suffer from capacity and quality constraints, as +they require retraining the owner model for new users. They are also vulnerable +to advanced Expanded Residual Block ambiguity attacks. We propose +Steganographic Passport, which uses an invertible steganographic network to +decouple license-to-use from ownership verification by hiding the user's +identity images into the owner-side passport and recovering them from their +respective user-side passports. An irreversible and collision-resistant hash +function is used to avoid exposing the owner-side passport from the derived +user-side passports and increase the uniqueness of the model signature. To +safeguard both the passport and model's weights against advanced ambiguity +attacks, an activation-level obfuscation is proposed for the verification +branch of the owner's model. By jointly training the verification and +deployment branches, their weights become tightly coupled. The proposed method +supports agile licensing of deep models by providing a strong ownership proof +and license accountability without requiring a separate model retraining for +the admission of every new user. Experiment results show that our +Steganographic Passport outperforms other passport-based deep model protection +methods in robustness against various known attacks. + +
+
+
+
+
+ + ☆ PoCo: Point Context Cluster for RGBD Indoor Place Recognition + + +
+ We present a novel end-to-end algorithm (PoCo) for the indoor RGB-D place +recognition task, aimed at identifying the most likely match for a given query +frame within a reference database. The task presents inherent challenges +attributed to the constrained field of view and limited range of perception +sensors. We propose a new network architecture, which generalizes the recent +Context of Clusters (CoCs) to extract global descriptors directly from the +noisy point clouds through end-to-end learning. Moreover, we develop the +architecture by integrating both color and geometric modalities into the point +features to enhance the global descriptor representation. We conducted +evaluations on public datasets ScanNet-PR and ARKit with 807 and 5047 +scenarios, respectively. PoCo achieves SOTA performance: on ScanNet-PR, we +achieve R@1 of 64.63%, a 5.7% improvement from the best-published result CGis +(61.12%); on Arkit, we achieve R@1 of 45.12%, a 13.3% improvement from the +best-published result CGis (39.82%). In addition, PoCo shows higher efficiency +than CGis in inference time (1.75X-faster), and we demonstrate the +effectiveness of PoCo in recognizing places within a real-world laboratory +environment. + +
+
+
+
+
+ + ☆ On the Scalability of Diffusion-based Text-to-Image Generation CVPR2024 + + +
+ Scaling up model and data size has been quite successful for the evolution of LLMs. However, the scaling law for diffusion-based text-to-image (T2I) models is not fully explored. It is also unclear how to efficiently scale the model for better performance at reduced cost. The different training settings and expensive training costs make a fair model comparison extremely difficult. In this work, we empirically study the scaling properties of diffusion-based T2I models by performing extensive and rigorous ablations on scaling both denoising backbones and the training set, including training scaled UNet and Transformer variants ranging from 0.4B to 4B parameters on datasets of up to 600M images. For model scaling, we find that the location and amount of cross-attention distinguishes the performance of existing UNet designs, and that increasing the number of transformer blocks is more parameter-efficient for improving text-image alignment than increasing channel numbers. We then identify an efficient UNet variant, which is 45% smaller and 28% faster than SDXL's UNet. On the data scaling side, we show that the quality and diversity of the training set matter more than simply dataset size. Increasing caption density and diversity improves text-image alignment performance and the learning efficiency. Finally, we provide scaling functions to predict the text-image alignment performance as functions of model size, compute and dataset size. + +
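Scaling functions of the kind mentioned at the end of this abstract are commonly fit as power laws on log-log axes; the snippet below shows such a fit on made-up (compute, alignment) pairs. Both the numbers and the functional form score = a * C^b are illustrative assumptions, not values or equations reported by the paper.

```python
"""Illustrative power-law fit for a scaling function (hypothetical data)."""
import numpy as np

compute = np.array([1e19, 3e19, 1e20, 3e20, 1e21])    # hypothetical training FLOPs
alignment = np.array([0.21, 0.24, 0.28, 0.31, 0.35])  # hypothetical alignment scores

# Fit log(score) = log(a) + b * log(C) by least squares.
b, log_a = np.polyfit(np.log(compute), np.log(alignment), 1)
a = np.exp(log_a)
print(f"alignment ~ {a:.3g} * C^{b:.3f}")
print("extrapolated at 1e22 FLOPs:", a * (1e22 ** b))
```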
+
+ comment: CVPR2024 +
+
+
+
+
+ + ☆ FlightScope: A Deep Comprehensive Assessment of Aircraft Detection + Algorithms in Satellite Imagery + + +
+ Object detection in remotely sensed satellite pictures is fundamental in many fields such as biophysical and environmental monitoring. While deep learning algorithms are constantly evolving, they have been mostly implemented and tested on popular ground-based photos. This paper critically evaluates and compares a suite of advanced object detection algorithms customized for the task of identifying aircraft within satellite imagery. Using the large HRPlanesV2 dataset, together with rigorous validation on the GDIT dataset, this research encompasses an array of methodologies including YOLO versions 5 and 8, Faster RCNN, CenterNet, RetinaNet, RTMDet, and DETR, all trained from scratch. This exhaustive training and validation study reveals YOLOv5 as the preeminent model for the specific case of identifying airplanes from remote sensing data, showcasing high precision and adaptability across diverse imaging conditions. This research highlights the nuanced performance landscapes of these algorithms, with YOLOv5 emerging as a robust solution for aerial object detection, underlining its importance through superior mean average precision, recall, and Intersection over Union scores. The findings described here underscore the fundamental role of algorithm selection aligned with the specific demands of satellite imagery analysis and extend a comprehensive framework to evaluate model efficacy. The benchmark toolkit and codes, available via https://github.com/toelt-llc/FlightScope_Bench, aim to further exploration and innovation in the realm of remote sensing object detection, paving the way for improved analytical methodologies in satellite imagery applications. + +
+
+ comment: 15 figures, 4 tables, comprehensive survey, comparative study +
+
+
+
+
+ + ☆ Cross-Modal Conditioned Reconstruction for Language-guided Medical Image + Segmentation + + +
+ Recent developments underscore the potential of textual information in +enhancing learning models for a deeper understanding of medical visual +semantics. However, language-guided medical image segmentation still faces a +challenging issue. Previous works employ implicit and ambiguous architectures +to embed textual information. This leads to segmentation results that are +inconsistent with the semantics represented by the language, sometimes even +diverging significantly. To this end, we propose a novel cross-modal +conditioned Reconstruction for Language-guided Medical Image Segmentation +(RecLMIS) to explicitly capture cross-modal interactions, which assumes that +well-aligned medical visual features and medical notes can effectively +reconstruct each other. We introduce conditioned interaction to adaptively +predict patches and words of interest. Subsequently, they are utilized as +conditioning factors for mutual reconstruction to align with regions described +in the medical notes. Extensive experiments demonstrate the superiority of our +RecLMIS, surpassing LViT by 3.74% mIoU on the publicly available MosMedData+ +dataset and achieving an average increase of 1.89% mIoU for cross-domain tests +on our QATA-CoV19 dataset. Simultaneously, we achieve a relative reduction of +20.2% in parameter count and a 55.5% decrease in computational load. The code +will be available at https://github.com/ShashankHuang/RecLMIS. + +
+
+
+
+
+ + ☆ Enhancing Interpretability of Vertebrae Fracture Grading using + Human-interpretable Prototypes + + +
+ Vertebral fracture grading classifies the severity of vertebral fractures, +which is a challenging task in medical imaging and has recently attracted Deep +Learning (DL) models. Only a few works attempted to make such models +human-interpretable despite the need for transparency and trustworthiness in +critical use cases like DL-assisted medical diagnosis. Moreover, such models +either rely on post-hoc methods or additional annotations. In this work, we +propose a novel interpretable-by-design method, ProtoVerse, to find relevant +sub-parts of vertebral fractures (prototypes) that reliably explain the model's +decision in a human-understandable way. Specifically, we introduce a novel +diversity-promoting loss to mitigate prototype repetitions in small datasets +with intricate semantics. We have experimented with the VerSe'19 dataset and +outperformed the existing prototype-based method. Further, our model provides +superior interpretability against the post-hoc method. Importantly, expert +radiologists validated the visual interpretability of our results, showing +clinical applicability. + +
+
+
+
+
+ + ☆ GPU-Accelerated RSF Level Set Evolution for Large-Scale Microvascular + Segmentation + + +
+ Microvascular networks are challenging to model because these structures are +currently near the diffraction limit for most advanced three-dimensional +imaging modalities, including confocal and light sheet microscopy. This makes +semantic segmentation difficult, because individual components of these +networks fluctuate within the confines of individual pixels. Level set methods +are ideally suited to solve this problem by providing surface and topological +constraints on the resulting model, however these active contour techniques are +extremely time intensive and impractical for terabyte-scale images. We propose +a reformulation and implementation of the region-scalable fitting (RSF) level +set model that makes it amenable to three-dimensional evaluation using both +single-instruction multiple data (SIMD) and single-program multiple-data (SPMD) +parallel processing. This enables evaluation of the level set equation on +independent regions of the data set using graphics processing units (GPUs), +making large-scale segmentation of high-resolution networks practical and +inexpensive. + We tested this 3D parallel RSF approach on multiple data sets acquired using +state-of-the-art imaging techniques to acquire microvascular data, including +micro-CT, light sheet fluorescence microscopy (LSFM) and milling microscopy. To +assess the performance and accuracy of the RSF model, we conducted a +Monte-Carlo-based validation technique to compare results to other segmentation +methods. We also provide a rigorous profiling to show the gains in processing +speed leveraging parallel hardware. This study showcases the practical +application of the RSF model, emphasizing its utility in the challenging domain +of segmenting large-scale high-topology network structures with a particular +focus on building microvascular models. + +
+
+
+
+
+ + ☆ MULAN: A Multi Layer Annotated Dataset for Controllable Text-to-Image + Generation CVPR 2024 + + +
+ Text-to-image generation has achieved astonishing results, yet precise spatial controllability and prompt fidelity remain highly challenging. This limitation is typically addressed through cumbersome prompt engineering, scene layout conditioning, or image editing techniques which often require hand-drawn masks. Nonetheless, pre-existing works struggle to take advantage of the natural instance-level compositionality of scenes due to the typically flat nature of rasterized RGB output images. Towards addressing this challenge, we introduce MuLAn: a novel dataset comprising over 44K MUlti-Layer ANnotations of RGB images as multilayer, instance-wise RGBA decompositions, and over 100K instance images. To build MuLAn, we developed a training-free pipeline which decomposes a monocular RGB image into a stack of RGBA layers comprising background and isolated instances. We achieve this through the use of pretrained general-purpose models, and by developing three modules: image decomposition for instance discovery and extraction, instance completion to reconstruct occluded areas, and image re-assembly. We use our pipeline to create MuLAn-COCO and MuLAn-LAION datasets, which contain a variety of image decompositions in terms of style, composition and complexity. With MuLAn, we provide the first photorealistic resource providing instance decomposition and occlusion information for high quality images, opening up new avenues for text-to-image generative AI research. With this, we aim to encourage the development of novel generation and editing technology, in particular layer-wise solutions. MuLAn data resources are available at https://MuLAn-dataset.github.io/. + +
+
+ comment: CVPR 2024 - Project page: https://MuLAn-dataset.github.io/ +
+
+
+
+
+ + ☆ GenN2N: Generative NeRF2NeRF Translation CVPR 2024 + + +
+ We present GenN2N, a unified NeRF-to-NeRF translation framework for various +NeRF translation tasks such as text-driven NeRF editing, colorization, +super-resolution, inpainting, etc. Unlike previous methods designed for +individual translation tasks with task-specific schemes, GenN2N achieves all +these NeRF editing tasks by employing a plug-and-play image-to-image translator +to perform editing in the 2D domain and lifting 2D edits into the 3D NeRF +space. Since the 3D consistency of 2D edits may not be assured, we propose to +model the distribution of the underlying 3D edits through a generative model +that can cover all possible edited NeRFs. To model the distribution of 3D +edited NeRFs from 2D edited images, we carefully design a VAE-GAN that encodes +images while decoding NeRFs. The latent space is trained to align with a +Gaussian distribution and the NeRFs are supervised through an adversarial loss +on its renderings. To ensure the latent code does not depend on 2D viewpoints +but truly reflects the 3D edits, we also regularize the latent code through a +contrastive learning scheme. Extensive experiments on various editing tasks +show GenN2N, as a universal framework, performs as well or better than +task-specific specialists while possessing flexible generative power. More +results on our project page: https://xiangyueliu.github.io/GenN2N/ + +
+
+ comment: Accepted to CVPR 2024. Project page: + https://xiangyueliu.github.io/GenN2N/ +
+
+
+
+
+ + ☆ Domain Generalization through Meta-Learning: A Survey + + +
+ Deep neural networks (DNNs) have revolutionized artificial intelligence but +often lack performance when faced with out-of-distribution (OOD) data, a common +scenario due to the inevitable domain shifts in real-world applications. This +limitation stems from the common assumption that training and testing data +share the same distribution-an assumption frequently violated in practice. +Despite their effectiveness with large amounts of data and computational power, +DNNs struggle with distributional shifts and limited labeled data, leading to +overfitting and poor generalization across various tasks and domains. +Meta-learning presents a promising approach by employing algorithms that +acquire transferable knowledge across various tasks for fast adaptation, +eliminating the need to learn each task from scratch. This survey paper delves +into the realm of meta-learning with a focus on its contribution to domain +generalization. We first clarify the concept of meta-learning for domain +generalization and introduce a novel taxonomy based on the feature extraction +strategy and the classifier learning methodology, offering a granular view of +methodologies. Through an exhaustive review of existing methods and underlying +theories, we map out the fundamentals of the field. Our survey provides +practical insights and an informed discussion on promising research directions, +paving the way for future innovation in meta-learning for domain +generalization. + +
+
+
+
+
+ + ☆ Unsupervised Occupancy Learning from Sparse Point Cloud CVPR 2024 + + +
+ Implicit Neural Representations have gained prominence as a powerful +framework for capturing complex data modalities, encompassing a wide range from +3D shapes to images and audio. Within the realm of 3D shape representation, +Neural Signed Distance Functions (SDF) have demonstrated remarkable potential +in faithfully encoding intricate shape geometry. However, learning SDFs from 3D +point clouds in the absence of ground truth supervision remains a very +challenging task. In this paper, we propose a method to infer occupancy fields +instead of SDFs as they are easier to learn from sparse inputs. We leverage a +margin-based uncertainty measure to differentially sample from the decision +boundary of the occupancy function and supervise the sampled boundary points +using the input point cloud. We further stabilize the optimization process at +the early stages of the training by biasing the occupancy function towards +minimal entropy fields while maximizing its entropy at the input point cloud. +Through extensive experiments and evaluations, we illustrate the efficacy of +our proposed method, highlighting its capacity to improve implicit shape +inference with respect to baselines and the state-of-the-art using synthetic +and real data. + +
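A toy rendering of the entropy-shaping idea in this abstract: bias the occupancy field toward low entropy on points sampled away from the data, while keeping high entropy (an uncertain decision boundary) at the input point cloud. The margin-based boundary sampling is omitted, and the network and sampling below are placeholder assumptions rather than the authors' method.

```python
"""Toy entropy regularizers for an occupancy network (illustrative only)."""
import torch
import torch.nn as nn


def binary_entropy(p: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    p = p.clamp(eps, 1 - eps)
    return -(p * p.log() + (1 - p) * (1 - p).log())


def entropy_regularizers(occ_net, input_pts: torch.Tensor, n_uniform: int = 2048):
    """Low entropy away from the data (crisp 0/1 field), high entropy at the input points."""
    uniform_pts = torch.rand(n_uniform, 3, device=input_pts.device) * 2 - 1  # box [-1, 1]^3
    p_uniform = torch.sigmoid(occ_net(uniform_pts))
    p_surface = torch.sigmoid(occ_net(input_pts))
    loss_min_entropy = binary_entropy(p_uniform).mean()   # push the field toward 0/1 globally
    loss_max_entropy = -binary_entropy(p_surface).mean()  # keep the boundary on the point cloud
    return loss_min_entropy, loss_max_entropy


if __name__ == "__main__":
    occ_net = nn.Sequential(nn.Linear(3, 64), nn.ReLU(), nn.Linear(64, 1))
    pts = torch.rand(512, 3) * 2 - 1
    l_min, l_max = entropy_regularizers(occ_net, pts)
    print(float(l_min), float(l_max))
```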
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ DIBS: Enhancing Dense Video Captioning with Unlabeled Videos via Pseudo + Boundary Enrichment and Online Refinement CVPR 2024 + + +
+ We present Dive Into the BoundarieS (DIBS), a novel pretraining framework for +dense video captioning (DVC), that elaborates on improving the quality of the +generated event captions and their associated pseudo event boundaries from +unlabeled videos. By leveraging the capabilities of diverse large language +models (LLMs), we generate rich DVC-oriented caption candidates and optimize +the corresponding pseudo boundaries under several meticulously designed +objectives, considering diversity, event-centricity, temporal ordering, and +coherence. Moreover, we further introduce a novel online boundary refinement +strategy that iteratively improves the quality of pseudo boundaries during +training. Comprehensive experiments have been conducted to examine the +effectiveness of the proposed technique components. By leveraging a substantial +amount of unlabeled video data, such as HowTo100M, we achieve a remarkable +advancement on standard DVC datasets like YouCook2 and ActivityNet. We +outperform the previous state-of-the-art Vid2Seq across a majority of metrics, +achieving this with just 0.4% of the unlabeled video data used for pre-training +by Vid2Seq. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Cross-Attention Makes Inference Cumbersome in Text-to-Image Diffusion + Models + + +
+ This study explores the role of cross-attention during inference in text-conditional diffusion models. We find that cross-attention outputs converge to a fixed point after a few inference steps. Accordingly, the time point of convergence naturally divides the entire inference process into two stages: an initial semantics-planning stage, during which the model relies on cross-attention to plan text-oriented visual semantics, and a subsequent fidelity-improving stage, during which the model tries to generate images from the previously planned semantics. Surprisingly, ignoring text conditions in the fidelity-improving stage not only reduces computational complexity, but also maintains model performance. This yields a simple and training-free method called TGATE for efficient generation, which caches the cross-attention output once it converges and keeps it fixed during the remaining inference steps. Our empirical study on the MS-COCO validation set confirms its effectiveness. The source code of TGATE is available at https://github.com/HaozheLiu-ST/T-GATE. + +
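A minimal sketch of the caching behaviour this abstract describes: run cross-attention normally during the semantics-planning steps, then freeze and reuse its output for the remaining steps. The attention module, the gate step, and the wrapper interface below are illustrative assumptions; the released TGATE code hooks into the actual diffusion pipeline rather than a toy module like this.

```python
"""Toy 'cache after convergence' wrapper around a stand-in cross-attention block."""
import torch
import torch.nn as nn


class TextCrossAttention(nn.Module):
    """Stand-in cross-attention block (image tokens attend to text tokens)."""

    def __init__(self, dim: int = 64, heads: int = 4):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)

    def forward(self, x, text_ctx):
        out, _ = self.attn(x, text_ctx, text_ctx, need_weights=False)
        return out


class GatedCrossAttention(nn.Module):
    """After `gate_step`, reuse the cached output (fidelity-improving stage)."""

    def __init__(self, attn: nn.Module, gate_step: int):
        super().__init__()
        self.attn = attn
        self.gate_step = gate_step
        self.cache = None

    def forward(self, x, text_ctx, step: int):
        if step < self.gate_step:
            out = self.attn(x, text_ctx)
            if step == self.gate_step - 1:
                self.cache = out.detach()  # assume convergence: freeze the output here
            return out
        return self.cache  # text conditioning is skipped from now on


if __name__ == "__main__":
    block = GatedCrossAttention(TextCrossAttention(), gate_step=10)
    x, text_ctx = torch.randn(1, 16, 64), torch.randn(1, 8, 64)
    for step in range(25):
        y = block(x, text_ctx, step)
    print(tuple(y.shape))
```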
+
+
+
+
+ + ☆ LiDAR4D: Dynamic Neural Fields for Novel Space-time View LiDAR Synthesis CVPR 2024 + + +
+ Although neural radiance fields (NeRFs) have achieved triumphs in image novel +view synthesis (NVS), LiDAR NVS remains largely unexplored. Previous LiDAR NVS +methods employ a simple shift from image NVS methods while ignoring the dynamic +nature and the large-scale reconstruction problem of LiDAR point clouds. In +light of this, we propose LiDAR4D, a differentiable LiDAR-only framework for +novel space-time LiDAR view synthesis. In consideration of the sparsity and +large-scale characteristics, we design a 4D hybrid representation combined with +multi-planar and grid features to achieve effective reconstruction in a +coarse-to-fine manner. Furthermore, we introduce geometric constraints derived +from point clouds to improve temporal consistency. For the realistic synthesis +of LiDAR point clouds, we incorporate the global optimization of ray-drop +probability to preserve cross-region patterns. Extensive experiments on +KITTI-360 and NuScenes datasets demonstrate the superiority of our method in +accomplishing geometry-aware and time-consistent dynamic reconstruction. Codes +are available at https://github.com/ispc-lab/LiDAR4D. + +
+
+ comment: Accepted by CVPR 2024. Project Page: + https://dyfcalid.github.io/LiDAR4D +
+
+
+
+
+ + ☆ Adaptive Affinity-Based Generalization For MRI Imaging Segmentation + Across Resource-Limited Settings + + +
+ The joint utilization of diverse data sources for medical imaging +segmentation has emerged as a crucial area of research, aiming to address +challenges such as data heterogeneity, domain shift, and data quality +discrepancies. Integrating information from multiple data domains has shown +promise in improving model generalizability and adaptability. However, this +approach often demands substantial computational resources, hindering its +practicality. In response, knowledge distillation (KD) has garnered attention +as a solution. KD involves training light-weight models to emulate the behavior +of more resource-intensive models, thereby mitigating the computational burden +while maintaining performance. This paper addresses the pressing need to +develop a lightweight and generalizable model for medical imaging segmentation +that can effectively handle data integration challenges. Our proposed approach +introduces a novel relation-based knowledge framework by seamlessly combining +adaptive affinity-based and kernel-based distillation through a gram matrix +that can capture the style representation across features. This methodology +empowers the student model to accurately replicate the feature representations +of the teacher model, facilitating robust performance even in the face of +domain shift and data heterogeneity. To validate our innovative approach, we +conducted experiments on publicly available multi-source prostate MRI data. The +results demonstrate a significant enhancement in segmentation performance using +lightweight networks. Notably, our method achieves this improvement while +reducing both inference time and storage usage, rendering it a practical and +efficient solution for real-time medical imaging segmentation. + +
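The gram-matrix idea in this abstract can be illustrated with a simple style/affinity distillation loss between teacher and student feature maps, as sketched below. This assumes the two feature maps have already been projected to the same shape; the paper's full adaptive affinity- and kernel-based mixture is richer than this plain MSE on Gram matrices.

```python
"""Illustrative Gram-matrix distillation loss (not the authors' full framework)."""
import torch
import torch.nn.functional as F


def gram_matrix(feat: torch.Tensor) -> torch.Tensor:
    """feat: (B, C, H, W) -> normalized channel-affinity ('style') matrix (B, C, C)."""
    b, c, h, w = feat.shape
    f = feat.view(b, c, h * w)
    return f @ f.transpose(1, 2) / (c * h * w)


def gram_distillation_loss(student_feat: torch.Tensor, teacher_feat: torch.Tensor) -> torch.Tensor:
    # Assumes student/teacher features were projected to matching channel counts.
    return F.mse_loss(gram_matrix(student_feat), gram_matrix(teacher_feat))


if __name__ == "__main__":
    s = torch.randn(2, 32, 24, 24, requires_grad=True)
    t = torch.randn(2, 32, 24, 24)
    loss = gram_distillation_loss(s, t)
    loss.backward()
    print(float(loss))
```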
+
+
+
+
+ + ☆ InstantStyle: Free Lunch towards Style-Preserving in Text-to-Image + Generation + + +
+ Tuning-free diffusion-based models have demonstrated significant potential in the realm of image personalization and customization. However, despite this notable progress, current models continue to grapple with several complex challenges in producing style-consistent image generation. Firstly, the concept of style is inherently underdetermined, encompassing a multitude of elements such as color, material, atmosphere, design, and structure, among others. Secondly, inversion-based methods are prone to style degradation, often resulting in the loss of fine-grained details. Lastly, adapter-based approaches frequently require meticulous weight tuning for each reference image to achieve a balance between style intensity and text controllability. In this paper, we commence by examining several compelling yet frequently overlooked observations. We then proceed to introduce InstantStyle, a framework designed to address these issues through the implementation of two key strategies: 1) A straightforward mechanism that decouples style and content from reference images within the feature space, predicated on the assumption that features within the same space can be either added to or subtracted from one another. 2) The injection of reference image features exclusively into style-specific blocks, thereby preventing style leaks and eschewing the need for cumbersome weight tuning, which often characterizes more parameter-heavy designs. Our work demonstrates superior visual stylization outcomes, striking an optimal balance between the intensity of style and the controllability of textual elements. Our codes will be available at https://github.com/InstantStyle/InstantStyle. + +
+
+ comment: Technical Report +
+
+
+
+
+ + ☆ Event Camera Demosaicing via Swin Transformer and Pixel-focus Loss CVPR 2024 + + +
+ Recent research has highlighted improvements in high-quality imaging guided by event cameras, with most of these efforts concentrating on the RGB domain. However, these advancements frequently neglect the unique challenges introduced by the inherent flaws in the sensor design of event cameras in the RAW domain. Specifically, this sensor design results in the partial loss of pixel values, posing new challenges for RAW domain processes like demosaicing. The challenge intensifies as most research in the RAW domain is based on the premise that each pixel contains a value, making the straightforward adaptation of these methods to event camera demosaicing problematic. To this end, we present a Swin-Transformer-based backbone and a pixel-focus loss function for demosaicing with missing pixel values in RAW domain processing. Our core motivation is to refine a general and widely applicable foundational model from the RGB domain for RAW domain processing, thereby broadening the model's applicability within the entire imaging process. Our method harnesses multi-scale processing and space-to-depth techniques to ensure efficiency and reduce computing complexity. We also propose the pixel-focus loss function for network fine-tuning to improve network convergence, based on our discovery of a long-tailed distribution in the training loss. Our method has undergone validation on the MIPI Demosaic Challenge dataset, with subsequent analytical experimentation confirming its efficacy. All code and trained models are released here: https://github.com/yunfanLu/ev-demosaic + +
+
+ comment: Accepted for the CVPR 2024 Workshop on Mobile Intelligent Photography + & Imaging +
+
+
+
+
+ + ☆ Harnessing the Power of Large Vision Language Models for Synthetic Image + Detection + + +
+ In recent years, the emergence of models capable of generating images from +text has attracted considerable interest, offering the possibility of creating +realistic images from text descriptions. Yet these advances have also raised +concerns about the potential misuse of these images, including the creation of +misleading content such as fake news and propaganda. This study investigates +the effectiveness of using advanced vision-language models (VLMs) for synthetic +image identification. Specifically, the focus is on tuning state-of-the-art +image captioning models for synthetic image detection. By harnessing the robust +understanding capabilities of large VLMs, the aim is to distinguish authentic +images from synthetic images produced by diffusion-based models. This study +contributes to the advancement of synthetic image detection by exploiting the +capabilities of visual language models such as BLIP-2 and ViTGPT2. By tailoring +image captioning models, we address the challenges associated with the +potential misuse of synthetic images in real-world applications. Results +described in this paper highlight the promising role of VLMs in the field of +synthetic image detection, outperforming conventional image-based detection +techniques. Code and models can be found at +https://github.com/Mamadou-Keita/VLM-DETECT. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2404.01959 +
+
+
+
+
+ + ☆ Model-agnostic Origin Attribution of Generated Images with Few-shot + Examples + + +
+ Recent progress in visual generative models enables the generation of +high-quality images. To prevent the misuse of generated images, it is important +to identify the origin model that generates them. In this work, we study the +origin attribution of generated images in a practical setting where only a few +images generated by a source model are available and the source model cannot be +accessed. The goal is to check if a given image is generated by the source +model. We first formulate this problem as a few-shot one-class classification +task. To solve the task, we propose OCC-CLIP, a CLIP-based framework for +few-shot one-class classification, enabling the identification of an image's +source model, even among multiple candidates. Extensive experiments +corresponding to various generative models verify the effectiveness of our +OCC-CLIP framework. Furthermore, an experiment based on the recently released +DALL-E 3 API verifies the real-world applicability of our solution. + +
+
+
+
+
+ + ☆ Design2Cloth: 3D Cloth Generation from 2D Masks CVPR 2024 + + +
+ In recent years, there has been a significant shift in the field of digital +avatar research, towards modeling, animating and reconstructing clothed human +representations, as a key step towards creating realistic avatars. However, +current 3D cloth generation methods are garment specific or trained completely +on synthetic data, hence lacking fine details and realism. In this work, we +make a step towards automatic realistic garment design and propose +Design2Cloth, a high fidelity 3D generative model trained on a real world +dataset from more than 2000 subject scans. To provide vital contribution to the +fashion industry, we developed a user-friendly adversarial model capable of +generating diverse and detailed clothes simply by drawing a 2D cloth mask. +Under a series of both qualitative and quantitative experiments, we showcase +that Design2Cloth outperforms current state-of-the-art cloth generative models +by a large margin. In addition to the generative properties of our network, we +showcase that the proposed method can be used to achieve high quality +reconstructions from single in-the-wild images and 3D scans. Dataset, code and +pre-trained model will become publicly available. + +
+
+ comment: Accepted to CVPR 2024, Project page: + https://jiali-zheng.github.io/Design2Cloth/ +
+
+
+
+
+ + ☆ Independently Keypoint Learning for Small Object Semantic Correspondence + + +
+ Semantic correspondence remains a challenging task for establishing correspondences between a pair of images with the same category or similar scenes due to large intra-class appearance variations. In this paper, we introduce a novel problem called 'Small Object Semantic Correspondence (SOSC).' This problem is challenging due to the close proximity of keypoints associated with small objects, which results in the fusion of their respective features, making the corresponding keypoints difficult to identify and recognize. To address this challenge, we propose the Keypoint Bounding box-centered Cropping (KBC) method, which aims to increase the spatial separation between keypoints of small objects, thereby facilitating independent learning of these keypoints. The KBC method is seamlessly integrated into our proposed inference pipeline and can be easily incorporated into other methodologies, resulting in significant performance enhancements. Additionally, we introduce a novel framework, named KBCNet, which serves as our baseline model. KBCNet comprises a Cross-Scale Feature Alignment (CSFA) module and an efficient 4D convolutional decoder. The CSFA module is designed to align multi-scale features, enriching keypoint representations by integrating fine-grained features and deep semantic features. Meanwhile, the 4D convolutional decoder, based on efficient 4D convolution, ensures efficiency and rapid convergence. To empirically validate the effectiveness of our proposed methodology, extensive experiments are conducted on three widely used benchmarks: PF-PASCAL, PF-WILLOW, and SPair-71k. Our KBC method demonstrates a substantial performance improvement of 7.5\% on the SPair-71K dataset, providing compelling evidence of its efficacy. + +
+
+
+
+
+ + ☆ RS-Mamba for Large Remote Sensing Image Dense Prediction + + +
+ The spatial resolution of remote sensing images is becoming increasingly +higher, posing challenges in handling large very-high-resolution (VHR) remote +sensing images for dense prediction tasks. Models based on convolutional neural +networks are limited in their ability to model global features of remote +sensing images due to local convolution operations. Transformer based models, +despite their global modeling capabilities, face computational challenges with +large VHR images due to their quadratic complexity. The common practice of +cropping large images into smaller patches leads to a significant loss of +contextual information. To address these issues, we propose the Remote Sensing +Mamba (RSM) for dense prediction tasks in VHR remote sensing. RSM is designed +to model global features of remote sensing images with linear complexity, +enabling it to process large VHR images effectively. It employs an +omnidirectional selective scan module to globally model the images in multiple +directions, capturing large spatial features from various directions. +Experiments on semantic segmentation and change detection tasks across various +objects demonstrate the effectiveness of RSM. With simple model architecture +and training approach, RSM achieves state-of-the-art performance on the dense +prediction tasks of VHR remote sensing. The code for this work will be +available at https://github.com/walking-shadow/Official_Remote_Sensing_Mamba. + +
+
+ comment: 13 pages,6 figures +
+
+
+
+
+ + ☆ A Satellite Band Selection Framework for Amazon Forest Deforestation + Detection Task + + +
+ The conservation of tropical forests is a topic of significant social and ecological relevance due to their crucial role in the global ecosystem. Unfortunately, deforestation and degradation impact millions of hectares annually, necessitating government or private initiatives for effective forest monitoring. This study introduces a novel framework that employs the Univariate Marginal Distribution Algorithm (UMDA) to select spectral bands from Landsat-8 satellite imagery, optimizing the representation of deforested areas. This selection guides a semantic segmentation architecture, DeepLabv3+, enhancing its performance. Experimental results revealed several band compositions that achieved superior balanced accuracy compared to commonly adopted combinations for deforestation detection, utilizing segment classification via a Support Vector Machine (SVM). Moreover, the optimal band compositions identified by the UMDA-based approach improved the performance of the DeepLabv3+ architecture, surpassing the state-of-the-art approaches compared in this study. The observation that a few selected bands outperform the full set contradicts the data-driven paradigm prevalent in the deep learning field. Therefore, this suggests an exception to the conventional wisdom that 'more is always better'. + +
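For readers unfamiliar with UMDA, the toy loop below shows the algorithm's core: sample binary band-selection masks from per-band marginal probabilities, keep the elite individuals, and re-estimate the marginals. The fitness function here is a placeholder assumption; the paper scores candidate band compositions with SVM-based balanced accuracy on deforestation data.

```python
"""Toy UMDA loop for spectral band selection (placeholder fitness)."""
import numpy as np

rng = np.random.default_rng(0)
N_BANDS, POP, ELITE, GENERATIONS = 11, 40, 10, 30  # Landsat-8 has 11 bands


def fitness(mask: np.ndarray) -> float:
    """Placeholder objective: arbitrarily reward bands 4-6 and penalize large subsets."""
    if mask.sum() == 0:
        return -1.0
    return float(mask[[4, 5, 6]].sum()) - 0.1 * float(mask.sum())


p = np.full(N_BANDS, 0.5)                      # marginal probability of selecting each band
for _ in range(GENERATIONS):
    pop = (rng.random((POP, N_BANDS)) < p).astype(int)
    scores = np.array([fitness(ind) for ind in pop])
    elite = pop[np.argsort(scores)[-ELITE:]]   # keep the best individuals
    p = 0.9 * elite.mean(axis=0) + 0.1 * 0.5   # re-estimate marginals, with smoothing

print("selected bands:", np.flatnonzero(p > 0.5))
```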
+
+ comment: 9 pages, 4 figures, paper accepted for presentation at GECCO 2024 +
+
+
+
+
+ + ☆ Non-negative Subspace Feature Representation for Few-shot Learning in + Medical Imaging + + +
+ Unlike typical visual scene recognition domains, in which massive datasets +are accessible to deep neural networks, medical image interpretations are often +obstructed by the paucity of data. In this paper, we investigate the +effectiveness of data-based few-shot learning in medical imaging by exploring +different data attribute representations in a low-dimensional space. We +introduce different types of non-negative matrix factorization (NMF) in +few-shot learning, addressing the data scarcity issue in medical image +classification. Extensive empirical studies are conducted in terms of +validating the effectiveness of NMF, especially its supervised variants (e.g., +discriminative NMF, and supervised and constrained NMF with sparseness), and +the comparison with principal component analysis (PCA), i.e., the collaborative +representation-based dimensionality reduction technique derived from +eigenvectors. With 14 different datasets covering 11 distinct illness +categories, thorough experimental results and comparison with related +techniques demonstrate that NMF is a competitive alternative to PCA for +few-shot learning in medical imaging, and the supervised NMF algorithms are +more discriminative in the subspace with greater effectiveness. Furthermore, we +show that the part-based representation of NMF, especially its supervised +variants, is dramatically impactful in detecting lesion areas in medical +imaging with limited samples. + +
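A small sketch of the kind of pipeline this abstract compares: project few-shot support and query features into a non-negative subspace with NMF, then classify in that subspace. Random non-negative vectors stand in for medical image features, and plain unsupervised sklearn NMF stands in for the supervised and discriminative variants studied in the paper.

```python
"""Few-shot classification in an NMF subspace (illustrative stand-in data)."""
import numpy as np
from sklearn.decomposition import NMF
from sklearn.neighbors import NearestCentroid

rng = np.random.default_rng(0)
X_support = np.abs(rng.normal(size=(20, 256)))   # 2 classes x 10 shots, non-negative features
y_support = np.repeat([0, 1], 10)
X_query = np.abs(rng.normal(size=(8, 256)))

# Learn a low-dimensional, parts-based (non-negative) subspace from the support set.
nmf = NMF(n_components=15, init="nndsvda", max_iter=500, random_state=0)
H_support = nmf.fit_transform(X_support)
H_query = nmf.transform(X_query)

# Classify queries by nearest class centroid in the subspace.
clf = NearestCentroid().fit(H_support, y_support)
print("query predictions:", clf.predict(H_query))
```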
+
+
+
+
+ + ☆ SG-BEV: Satellite-Guided BEV Fusion for Cross-View Semantic Segmentation CVPR 2024 + + +
+ This paper aims at achieving fine-grained building attribute segmentation in +a cross-view scenario, i.e., using satellite and street-view image pairs. The +main challenge lies in overcoming the significant perspective differences +between street views and satellite views. In this work, we introduce SG-BEV, a +novel approach for satellite-guided BEV fusion for cross-view semantic +segmentation. To overcome the limitations of existing cross-view projection +methods in capturing the complete building facade features, we innovatively +incorporate Bird's Eye View (BEV) method to establish a spatially explicit +mapping of street-view features. Moreover, we fully leverage the advantages of +multiple perspectives by introducing a novel satellite-guided reprojection +module, optimizing the uneven feature distribution issues associated with +traditional BEV methods. Our method demonstrates significant improvements on +four cross-view datasets collected from multiple cities, including New York, +San Francisco, and Boston. On average across these datasets, our method +achieves an increase in mIOU by 10.13% and 5.21% compared with the +state-of-the-art satellite-based and cross-view methods. The code and datasets +of this work will be released at https://github.com/yejy53/SG-BEV. + +
+
+ comment: accepted by CVPR 2024 +
+
+
+
+
+ + ☆ 3DStyleGLIP: Part-Tailored Text-Guided 3D Neural Stylization + + +
+ 3D stylization, which entails the application of specific styles to +three-dimensional objects, holds significant commercial potential as it enables +the creation of diverse 3D objects with distinct moods and styles, tailored to +specific demands of different scenes. With recent advancements in text-driven +methods and artificial intelligence, the stylization process is increasingly +intuitive and automated, thereby diminishing the reliance on manual labor and +expertise. However, existing methods have predominantly focused on holistic +stylization, thereby leaving the application of styles to individual components +of a 3D object unexplored. In response, we introduce 3DStyleGLIP, a novel +framework specifically designed for text-driven, part-tailored 3D stylization. +Given a 3D mesh and a text prompt, 3DStyleGLIP leverages the vision-language +embedding space of the Grounded Language-Image Pre-training (GLIP) model to +localize the individual parts of the 3D mesh and modify their colors and local +geometries to align them with the desired styles specified in the text prompt. +3DStyleGLIP is effectively trained for 3D stylization tasks through a +part-level style loss working in GLIP's embedding space, supplemented by two +complementary learning techniques. Extensive experimental validation confirms +that our method achieves significant part-wise stylization capabilities, +demonstrating promising potential in advancing the field of 3D stylization. + +
+
+
+
+
+ + ☆ Multi-Scale Spatial-Temporal Self-Attention Graph Convolutional Networks + for Skeleton-based Action Recognition + + +
+ Skeleton-based gesture recognition methods have achieved high success using Graph Convolutional Networks (GCN). In addition, context-dependent adaptive topology as neighborhood vertex information and an attention mechanism allow a model to better represent actions. In this paper, we propose a self-attention GCN hybrid model, Multi-Scale Spatial-Temporal self-attention (MSST)-GCN, to effectively improve modeling ability and achieve state-of-the-art results on several datasets. We utilize a spatial self-attention module with adaptive topology to understand intra-frame interactions within a frame among different body parts, and a temporal self-attention module to examine correlations between frames of a node. These two are followed by a multi-scale convolution network with dilations, which captures not only the long-range temporal dependencies of joints but also the long-range spatial dependencies (i.e., long-distance dependencies) of node temporal behaviors. They are combined into high-level spatial-temporal representations and output the predicted action with a softmax classifier. + +
+
+ comment: 9 pages, 3 figures +
+
+
+
+
+ + ☆ Diffexplainer: Towards Cross-modal Global Explanations with Diffusion + Models + + +
+ We present DiffExplainer, a novel framework that, leveraging language-vision +models, enables multimodal global explainability. DiffExplainer employs +diffusion models conditioned on optimized text prompts, synthesizing images +that maximize class outputs and hidden features of a classifier, thus providing +a visual tool for explaining decisions. Moreover, the analysis of generated +visual descriptions allows for automatic identification of biases and spurious +features, as opposed to traditional methods that often rely on manual +intervention. The cross-modal transferability of language-vision models also +enables the possibility to describe decisions in a more human-interpretable +way, i.e., through text. We conduct comprehensive experiments, which include an +extensive user study, demonstrating the effectiveness of DiffExplainer on 1) +the generation of high-quality images explaining model decisions, surpassing +existing activation maximization methods, and 2) the automated identification +of biases and spurious features. + +
+
+
+
+
+ + ☆ Neural Radiance Fields with Torch Units + + +
+ Neural Radiance Fields (NeRF) give rise to learning-based 3D reconstruction methods widely used in industrial applications. Although prevalent methods achieve considerable improvements in small-scale scenes, accomplishing reconstruction in complex and large-scale scenes is still challenging. First, the background in complex scenes shows a large variance among different views. Second, the current inference pattern, i.e., a pixel only relies on an individual camera ray, fails to capture contextual information. To solve these problems, we propose to enlarge the ray perception field and build up the sample point interactions. In this paper, we design a novel inference pattern that encourages a single camera ray to possess more contextual information, and models the relationship among sample points on each camera ray. To hold contextual information, a camera ray in our proposed method can render a patch of pixels simultaneously. Moreover, we replace the MLP in neural radiance field models with distance-aware convolutions to enhance the feature propagation among sample points from the same camera ray. In summary, like a torchlight, a ray in our proposed method renders a patch of the image. Thus, we call the proposed method Torch-NeRF. Extensive experiments on KITTI-360 and LLFF show that Torch-NeRF exhibits excellent performance. + +
+
+
+
+
+ + ☆ Vestibular schwannoma growth prediction from longitudinal MRI by time + conditioned neural fields + + +
+ Vestibular schwannomas (VS) are benign tumors that are generally managed by active surveillance with MRI examination. To further assist clinical decision-making and avoid overtreatment, an accurate prediction of tumor growth based on longitudinal imaging is highly desirable. In this paper, we introduce DeepGrowth, a deep learning method that incorporates neural fields and recurrent neural networks for prospective tumor growth prediction. In the proposed method, each tumor is represented as a signed distance function (SDF) conditioned on a low-dimensional latent code. Unlike previous studies that perform tumor shape prediction directly in the image space, we predict the latent codes instead and then reconstruct future shapes from them. To deal with irregular time intervals, we introduce a time-conditioned recurrent module based on a ConvLSTM and a novel temporal encoding strategy, which enables the proposed model to output varying tumor shapes over time. The experiments on an in-house longitudinal VS dataset showed that the proposed model significantly improved the performance ($\ge 1.6\%$ Dice score and $\ge 0.20$ mm 95\% Hausdorff distance), in particular for the top 20\% of tumors that grow or shrink the most ($\ge 4.6\%$ Dice score and $\ge 0.73$ mm 95\% Hausdorff distance). Our code is available at https://github.com/cyjdswx/DeepGrowth + +
+
+
+
+
+ + ☆ Unsegment Anything by Simulating Deformation CVPR 2024 + + +
+ Foundation segmentation models, while powerful, pose a significant risk: they +enable users to effortlessly extract any objects from any digital content with +a single click, potentially leading to copyright infringement or malicious +misuse. To mitigate this risk, we introduce a new task "Anything Unsegmentable" +to grant any image "the right to be unsegmented". The ambitious pursuit of the +task is to achieve highly transferable adversarial attacks against all +prompt-based segmentation models, regardless of model parameterizations and +prompts. We highlight the non-transferable and heterogeneous nature of +prompt-specific adversarial noises. Our approach focuses on disrupting image +encoder features to achieve prompt-agnostic attacks. Intriguingly, targeted +feature attacks exhibit better transferability compared to untargeted ones, +suggesting the optimal update direction aligns with the image manifold. Based +on the observations, we design a novel attack named Unsegment Anything by +Simulating Deformation (UAD). Our attack optimizes a differentiable deformation +function to create a target deformed image, which alters structural information +while preserving achievable feature distance by adversarial example. Extensive +experiments verify the effectiveness of our approach, compromising a variety of +promptable segmentation models with different architectures and prompt +interfaces. We release the code at +https://github.com/jiahaolu97/anything-unsegmentable. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Active learning for efficient annotation in precision agriculture: a + use-case on crop-weed semantic segmentation + + +
+ Optimizing deep learning models requires large amounts of annotated images, a process that is both time-intensive and costly, especially for semantic segmentation models in which every pixel must be annotated. A potential strategy to mitigate annotation effort is active learning. Active learning facilitates the identification and selection of the most informative images from a large unlabelled pool. The underlying premise is that these selected images can improve the model's performance faster than random selection, thereby reducing annotation effort. While active learning has demonstrated promising results on benchmark datasets like Cityscapes, its performance in the agricultural domain remains largely unexplored. This study addresses this research gap by conducting a comparative study of three active learning-based acquisition functions: Bayesian Active Learning by Disagreement (BALD), stochastic-based BALD (PowerBALD), and Random. The acquisition functions were tested on two agricultural datasets: Sugarbeet and Corn-Weed, both containing three semantic classes: background, crop and weed. Our results indicated that active learning, especially PowerBALD, yields higher performance than random sampling on both datasets. However, due to the relatively large standard deviations, the differences observed were minimal; this was partly caused by high image redundancy and imbalanced classes. Specifically, more than 89\% of the pixels belonged to the background class on both datasets. The absence of significant results on both datasets indicates that further research is required before applying active learning to agricultural datasets, especially if they contain a high class imbalance and redundant images. Recommendations and insights are provided in this paper to potentially resolve such issues. + +
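For reference, BALD scores candidates by the mutual information between predictions and model parameters, typically estimated from Monte-Carlo dropout passes; a minimal sketch is below. The data are random placeholders, and the comment on PowerBALD reflects the usual Gumbel-perturbed ranking rather than the exact configuration used in this study.

```python
"""BALD acquisition from Monte-Carlo dropout samples (placeholder data)."""
import numpy as np


def bald(probs: np.ndarray, eps: float = 1e-12) -> np.ndarray:
    """probs: (T, N, C) class probabilities from T stochastic passes -> (N,) mutual information."""
    mean_p = probs.mean(axis=0)                                        # (N, C)
    predictive_entropy = -(mean_p * np.log(mean_p + eps)).sum(-1)      # H[E_theta p]
    expected_entropy = -(probs * np.log(probs + eps)).sum(-1).mean(0)  # E_theta H[p]
    return predictive_entropy - expected_entropy


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    logits = rng.normal(size=(8, 5, 3))  # 8 dropout passes, 5 candidate images, 3 classes
    probs = np.exp(logits) / np.exp(logits).sum(-1, keepdims=True)
    scores = bald(probs)
    # PowerBALD-style stochastic acquisition would rank log-scores plus Gumbel noise instead.
    print("acquire (highest BALD first):", np.argsort(-scores))
```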
+
+
+
+
+ + ☆ Knowledge Distillation with Multi-granularity Mixture of Priors for + Image Super-Resolution + + +
+ Knowledge distillation (KD) is a promising yet challenging model compression +technique that transfers rich learning representations from a well-performing +but cumbersome teacher model to a compact student model. Previous methods for +image super-resolution (SR) mostly compare the feature maps directly or after +standardizing the dimensions with basic algebraic operations (e.g. average, +dot-product). However, the intrinsic semantic differences among feature maps +are overlooked, which are caused by the disparate expressive capacity between +the networks. This work presents MiPKD, a multi-granularity mixture of prior KD +framework, to facilitate efficient SR model through the feature mixture in a +unified latent space and stochastic network block mixture. Extensive +experiments demonstrate the effectiveness of the proposed MiPKD method. + +
+
+
+
+
+ + ☆ Representation Alignment Contrastive Regularization for Multi-Object + Tracking + + +
+ Achieving high performance in multi-object tracking algorithms relies heavily on modeling spatio-temporal relationships during the data association stage. Mainstream approaches encompass rule-based and deep learning-based methods for spatio-temporal relationship modeling. While the former relies on physical motion laws, offering wider applicability but yielding suboptimal results for complex object movements, the latter, though achieving high performance, lacks interpretability and involves complex module designs. This work aims to simplify deep learning-based spatio-temporal relationship models and introduce interpretability into features for data association. Specifically, a lightweight single-layer transformer encoder is utilized to model spatio-temporal relationships. To make features more interpretable, two contrastive regularization losses based on representation alignment are proposed, derived from spatio-temporal consistency rules. By applying weighted summation to affinity matrices, the aligned features can seamlessly integrate into the data association stage of the original tracking workflow. Experimental results show that our model enhances the performance of the majority of existing tracking networks without excessive complexity, with minimal increase in training overhead and nearly negligible computational and storage costs. + +
+
+
+
+
+ + ☆ Regional biases in image geolocation estimation: a case study with the + SenseCity Africa dataset + + +
+ Advances in Artificial Intelligence are challenged by the biases rooted in +the datasets used to train the models. In image geolocation estimation, models +are mostly trained using data from specific geographic regions, notably the +Western world, and as a result, they may struggle to comprehend the +complexities of underrepresented regions. To assess this issue, we apply a +state-of-the-art image geolocation estimation model (ISNs) to a crowd-sourced +dataset of geolocated images from the African continent (SCA100), and then +explore the regional and socioeconomic biases underlying the model's +predictions. Our findings show that the ISNs model tends to over-predict image +locations in high-income countries of the Western world, which is consistent +with the geographic distribution of its training data, i.e., the IM2GPS3k +dataset. Accordingly, when compared to the IM2GPS3k benchmark, the accuracy of +the ISNs model notably decreases at all scales. Additionally, we cluster images +of the SCA100 dataset based on how accurately they are predicted by the ISNs +model and show the model's difficulties in correctly predicting the locations +of images in low income regions, especially in Sub-Saharan Africa. Therefore, +our results suggest that using IM2GPS3k as a training set and benchmark for +image geolocation estimation and other computer vision models overlooks its +potential application in the African context. + +
+
+ comment: 13 pages, 5 figures +
+
+
+
+
+ + ☆ Semi-Supervised Unconstrained Head Pose Estimation in the Wild + + +
+ Existing head pose estimation datasets are either composed of numerous samples obtained by non-realistic synthesis or lab collection, or of limited images requiring labor-intensive annotation. This compromises deep supervised learning based solutions due to their reliance on abundant labeled data. To alleviate this, we propose the first semi-supervised unconstrained head pose estimation (SemiUHPE) method, which can leverage a large amount of unlabeled wild head images. Specifically, we follow the recent semi-supervised rotation regression, and focus on the diverse and complex head pose domain. Firstly, we claim that aspect-ratio invariant cropping of heads is superior to the previous landmark-based affine alignment, which does not fit unlabeled natural heads or practical applications where landmarks are often unavailable. Then, instead of using an empirically fixed threshold to filter out pseudo labels, we propose dynamic entropy-based filtering, which updates thresholds to adaptively remove unlabeled outliers. Moreover, we revisit the design of weak-strong augmentations, and further exploit its superiority by devising two novel head-oriented strong augmentations named pose-irrelevant cut-occlusion and pose-altering rotation consistency. Extensive experiments show that SemiUHPE surpasses SOTA methods with remarkable improvements on public benchmarks under both front-range and full-range settings. Our code is released at \url{https://github.com/hnuzhy/SemiUHPE}. + +
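A hedged sketch of what dynamic entropy-based filtering can look like in practice: maintain a running entropy threshold from recent batches and keep only low-entropy (confident) pseudo labels. The quantile/EMA update rule below is an assumption made for illustration, not the schedule used in SemiUHPE.

```python
"""Illustrative dynamic entropy-based pseudo-label filter (assumed update rule)."""
import numpy as np


class DynamicEntropyFilter:
    def __init__(self, keep_ratio: float = 0.7, momentum: float = 0.9):
        self.keep_ratio = keep_ratio
        self.momentum = momentum
        self.threshold = None

    def update_and_mask(self, entropies: np.ndarray) -> np.ndarray:
        """Update the running threshold, then keep low-entropy (confident) samples."""
        batch_thr = np.quantile(entropies, self.keep_ratio)
        if self.threshold is None:
            self.threshold = batch_thr
        else:
            self.threshold = self.momentum * self.threshold + (1 - self.momentum) * batch_thr
        return entropies <= self.threshold


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    filt = DynamicEntropyFilter()
    for step in range(3):
        mask = filt.update_and_mask(rng.random(16))  # per-sample predictive entropies
        print(step, "kept", int(mask.sum()), "of 16")
```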
+
+ comment: 14 pages. Semi-Supervised Unconstrained Head Pose Estimation +
+
+
+
+
+ + ☆ Severity Controlled Text-to-Image Generative Model Bias Manipulation + + +
+ Text-to-image (T2I) generative models are gaining wide popularity, especially +in public domains. However, their intrinsic bias and potential malicious +manipulations remain under-explored. Charting the susceptibility of T2I models +to such manipulation, we first expose the new possibility of a dynamic and +computationally efficient exploitation of model bias by targeting the embedded +language models. By leveraging mathematical foundations of vector algebra, our +technique enables a scalable and convenient control over the severity of output +manipulation through model bias. As a by-product, this control also allows a +form of precise prompt engineering to generate images which are generally +implausible with regular text prompts. We also demonstrate a constructive +application of our manipulation for balancing the frequency of generated +classes - as in model debiasing. Our technique does not require training and is +also framed as a backdoor attack with severity control using semantically-null +text triggers in the prompts. With extensive analysis, we present interesting +qualitative and quantitative results to expose potential manipulation +possibilities for T2I models. + Key-words: Text-to-Image Models, Generative Models, Backdoor Attacks, Prompt +Engineering, Bias + +
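The abstract describes steering the embedded language model with simple vector algebra to control the severity of bias in the output. A minimal sketch of that idea is shown below, assuming a generic text encoder; the encoder, the concept prompts, and the scaling rule are all placeholders, not the paper's actual procedure.

```python
import numpy as np

def bias_direction(encode, neutral_prompt, biased_prompt):
    """Estimate a concept direction in embedding space as a normalized difference vector."""
    d = encode(biased_prompt) - encode(neutral_prompt)
    return d / (np.linalg.norm(d) + 1e-12)

def manipulate(embedding, direction, severity):
    """Shift a prompt embedding along the concept direction.

    `severity` controls how strongly generations are pushed toward (positive)
    or away from (negative) the concept.
    """
    return embedding + severity * np.linalg.norm(embedding) * direction

# Toy stand-in for a text encoder (a real T2I pipeline would use its own).
def toy_encode(text):
    rng = np.random.default_rng(abs(hash(text)) % (2**32))
    return rng.normal(size=64)

direction = bias_direction(toy_encode, "a photo of a doctor", "a photo of a male doctor")
prompt_emb = toy_encode("a photo of a doctor")
steered = manipulate(prompt_emb, direction, severity=0.5)
print(steered.shape)  # (64,)
```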
+
+ comment: This research was supported by National Intelligence and Security + Discovery Research Grants (project# NS220100007), funded by the Department of + Defence Australia +
+
+
+
+
+ + ☆ Weakly-Supervised 3D Scene Graph Generation via Visual-Linguistic + Assisted Pseudo-labeling + + +
+ Learning to build 3D scene graphs is essential for real-world perception in a +structured and rich fashion. However, previous 3D scene graph generation +methods utilize a fully supervised learning manner and require a large amount +of entity-level annotation data of objects and relations, which is extremely +resource-consuming and tedious to obtain. To tackle this problem, we propose +3D-VLAP, a weakly-supervised 3D scene graph generation method via +Visual-Linguistic Assisted Pseudo-labeling. Specifically, our 3D-VLAP exploits +the superior ability of current large-scale visual-linguistic models to align +the semantics between texts and 2D images, as well as the naturally existing +correspondences between 2D images and 3D point clouds, and thus implicitly +constructs correspondences between texts and 3D point clouds. First, we +establish the positional correspondence from 3D point clouds to 2D images via +camera intrinsic and extrinsic parameters, thereby achieving alignment of 3D +point clouds and 2D images. Subsequently, a large-scale cross-modal +visual-linguistic model is employed to indirectly align 3D instances with the +textual category labels of objects by matching 2D images with object category +labels. The pseudo labels for objects and relations are then produced for +3D-VLAP model training by calculating the similarity between visual embeddings +and textual category embeddings of objects and relations encoded by the +visual-linguistic model, respectively. Ultimately, we design an edge +self-attention based graph neural network to generate scene graphs of 3D point +cloud scenes. Extensive experiments demonstrate that our 3D-VLAP achieves +comparable results with current advanced fully supervised methods, meanwhile +significantly alleviating the pressure of data annotation. + +
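The positional correspondence from 3D point clouds to 2D images via camera intrinsics and extrinsics, used above to align the two modalities, boils down to a standard pinhole projection. A minimal sketch follows; matrix names and conventions are generic and may differ from the paper's.

```python
import numpy as np

def project_points(points_world, K, R, t):
    """Project Nx3 world points into pixel coordinates.

    K: 3x3 intrinsic matrix; R (3x3), t (3,): world-to-camera extrinsics.
    Returns Nx2 pixel coordinates and a mask of points in front of the camera.
    """
    pts_cam = points_world @ R.T + t        # world -> camera frame
    in_front = pts_cam[:, 2] > 1e-6
    pts_img = pts_cam @ K.T                 # camera -> homogeneous image coordinates
    uv = pts_img[:, :2] / pts_img[:, 2:3]   # perspective divide
    return uv, in_front

# Toy example: identity extrinsics and simple intrinsics.
K = np.array([[500.0, 0.0, 320.0],
              [0.0, 500.0, 240.0],
              [0.0, 0.0, 1.0]])
points = np.array([[0.0, 0.0, 2.0], [0.5, -0.2, 4.0]])
uv, valid = project_points(points, K, np.eye(3), np.zeros(3))
print(uv[valid])  # pixel locations of the visible points
```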
+
+ comment: 11 pages, 9 figures +
+
+
+
+
+ + ☆ Text-driven Affordance Learning from Egocentric Vision + + +
+ Visual affordance learning is a key component for robots to understand how to interact with objects. Conventional approaches in this field rely on pre-defined objects and actions, falling short of capturing diverse interactions in real-world scenarios. The key idea of our approach is to employ textual instructions, targeting various affordances for a wide range of objects; this covers both hand-object and tool-object interactions. We introduce text-driven affordance learning, which aims to learn contact points and manipulation trajectories from an egocentric view following textual instructions. In our task, contact points are represented as heatmaps, and the manipulation trajectory as sequences of coordinates that incorporate both linear and rotational movements for various manipulations. However, manual annotation of these diverse interactions is costly when gathering data for this task. To this end, we propose a pseudo dataset creation pipeline and build a large pseudo-training dataset, TextAFF80K, consisting of over 80K instances of contact points, trajectories, images, and text tuples. We extend existing referring expression comprehension models for our task, and experimental results show that our approach robustly handles multiple affordances, serving as a new standard for affordance learning in real-world scenarios.
+
+
+
+
+ + ☆ CPAISD: Core-penumbra acute ischemic stroke dataset + + +
+ We introduce the CPAISD: Core-Penumbra Acute Ischemic Stroke Dataset, aimed +at enhancing the early detection and segmentation of ischemic stroke using +Non-Contrast Computed Tomography (NCCT) scans. Addressing the challenges in +diagnosing acute ischemic stroke during its early stages due to often +non-revealing native CT findings, the dataset provides a collection of +segmented NCCT images. These include annotations of ischemic core and penumbra +regions, critical for developing machine learning models for rapid stroke +identification and assessment. By offering a carefully collected and annotated +dataset, we aim to facilitate the development of advanced diagnostic tools, +contributing to improved patient care and outcomes in stroke management. Our +dataset's uniqueness lies in its focus on the acute phase of ischemic stroke, +with non-informative native CT scans, and includes a baseline model to +demonstrate the dataset's application, encouraging further research and +innovation in the field of medical imaging and stroke diagnosis. + +
+
+
+
+
+ + ☆ HENet: Hybrid Encoding for End-to-end Multi-task 3D Perception from + Multi-view Cameras + + +
+ Three-dimensional perception from multi-view cameras is a crucial component +in autonomous driving systems, which involves multiple tasks like 3D object +detection and bird's-eye-view (BEV) semantic segmentation. To improve +perception precision, large image encoders, high-resolution images, and +long-term temporal inputs have been adopted in recent 3D perception models, +bringing remarkable performance gains. However, these techniques are often +incompatible in training and inference scenarios due to computational resource +constraints. Besides, modern autonomous driving systems prefer to adopt an +end-to-end framework for multi-task 3D perception, which can simplify the +overall system architecture and reduce the implementation complexity. However, +conflict between tasks often arises when optimizing multiple tasks jointly +within an end-to-end 3D perception model. To alleviate these issues, we present +an end-to-end framework named HENet for multi-task 3D perception in this paper. +Specifically, we propose a hybrid image encoding network, using a large image +encoder for short-term frames and a small image encoder for long-term temporal +frames. Then, we introduce a temporal feature integration module based on the +attention mechanism to fuse the features of different frames extracted by the +two aforementioned hybrid image encoders. Finally, according to the +characteristics of each perception task, we utilize BEV features of different +grid sizes, independent BEV encoders, and task decoders for different tasks. +Experimental results show that HENet achieves state-of-the-art end-to-end +multi-task 3D perception results on the nuScenes benchmark, including 3D object +detection and BEV semantic segmentation. The source code and models will be +released at https://github.com/VDIGPKU/HENet. + +
+
+
+
+
+ + ☆ Freditor: High-Fidelity and Transferable NeRF Editing by Frequency + Decomposition + + +
+ This paper enables high-fidelity, transferable NeRF editing by frequency decomposition. Recent NeRF editing pipelines lift 2D stylization results to 3D scenes but suffer from blurry results and fail to capture detailed structures, owing to the inconsistency between 2D edits. Our critical insight is that the low-frequency components of images are more multiview-consistent after editing than their high-frequency parts. Moreover, the appearance style is mainly exhibited in the low-frequency components, while the content details reside especially in the high-frequency parts. This motivates us to perform editing on the low-frequency components, which results in high-fidelity edited scenes. In addition, the editing is performed in the low-frequency feature space, enabling stable intensity control and novel scene transfer. Comprehensive experiments conducted on photorealistic datasets demonstrate the superior performance of high-fidelity and transferable NeRF editing. The project page is at https://aigc3d.github.io/freditor.
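The low/high-frequency split underlying this idea can be sketched with a simple Gaussian low-pass filter: edit only the low-frequency component, then add the untouched high-frequency detail back. This is only an image-space illustration of the principle under assumed filter settings; the paper itself operates in a low-frequency feature space of the NeRF pipeline.

```python
import numpy as np
from scipy.ndimage import gaussian_filter

def decompose(image, sigma=5.0):
    """Split an HxWxC float image into low- and high-frequency parts."""
    low = gaussian_filter(image, sigma=(sigma, sigma, 0))
    high = image - low
    return low, high

def edit_low_frequency(image, stylize, sigma=5.0, intensity=1.0):
    """Apply a stylization function only to the low-frequency component."""
    low, high = decompose(image, sigma)
    edited_low = (1 - intensity) * low + intensity * stylize(low)
    return np.clip(edited_low + high, 0.0, 1.0)

# Toy "style": warm the color balance of the low-frequency band.
def warm(img):
    out = img.copy()
    out[..., 0] = np.clip(out[..., 0] * 1.2, 0, 1)  # boost the red channel
    return out

image = np.random.default_rng(0).random((64, 64, 3))
print(edit_low_frequency(image, warm).shape)  # (64, 64, 3)
```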
+
+
+
+
+ + ☆ VIAssist: Adapting Multi-modal Large Language Models for Users with + Visual Impairments + + +
+ Individuals with visual impairments, encompassing both partial and total difficulties in visual perception, are referred to as visually impaired (VI) people. An estimated 2.2 billion individuals worldwide are affected by visual impairments. Recent advancements in multi-modal large language models (MLLMs) have showcased extraordinary capabilities across various domains, and it is desirable to help VI individuals with MLLMs' great capabilities of visual understanding and reasoning. However, it is challenging for VI people to use MLLMs due to the difficulty of capturing images suitable for fulfilling their daily requests; for example, the target object may be only partially captured in the image, or missing from it entirely. This paper explores how to leverage MLLMs to answer visual questions for VI individuals with VIAssist, which can identify undesired images and provide detailed actions. Finally, VIAssist can provide reliable answers to users' queries based on the images. Our results show that VIAssist provides +0.21 and +0.31 higher BERTScore and ROUGE scores than the baseline, respectively.
+
+ comment: Accepted to IEEE International Workshop on Foundation Models for + Cyber-Physical Systems & Internet of Things (FMSys 2024) +
+
+
+
+
+ + ☆ A Unified Membership Inference Method for Visual Self-supervised Encoder + via Part-aware Capability + + +
+ Self-supervised learning shows promise in harnessing extensive unlabeled data, but it also confronts significant privacy concerns, especially in vision. In this paper, we aim to perform membership inference on visual self-supervised models in a more realistic setting: the self-supervised training method and details are unknown to the adversary, who usually faces a black-box system in practice. In this setting, considering that a self-supervised model could be trained with completely different self-supervised paradigms, e.g., masked image modeling and contrastive learning, and with complex training details, we propose a unified membership inference method called PartCrop. It is motivated by the part-aware capability shared among models and the stronger part response on training data. Specifically, PartCrop crops parts of objects in an image and queries their responses against the image in representation space. We conduct extensive attacks on self-supervised models with different training protocols and structures using three widely used image datasets. The results verify the effectiveness and generalization of PartCrop. Moreover, to defend against PartCrop, we evaluate two common approaches, i.e., early stopping and differential privacy, and propose a tailored method called shrinking crop scale range. The defense experiments indicate that all of them are effective. Our code is available at https://github.com/JiePKU/PartCrop
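A rough sketch of the part-cropping idea: crop candidate part regions from an image, embed both the crops and the full image with the (black-box) encoder under attack, and use the strength of the part responses as the membership signal. The encoder interface, the crop strategy, and the scoring rule below are placeholder assumptions, not the paper's exact attack.

```python
import numpy as np

def random_part_crops(image, num_crops=8, crop_frac=0.3, rng=None):
    """Sample small square crops that stand in for object parts."""
    rng = rng or np.random.default_rng(0)
    h, w, _ = image.shape
    ch, cw = int(h * crop_frac), int(w * crop_frac)
    crops = []
    for _ in range(num_crops):
        y = rng.integers(0, h - ch + 1)
        x = rng.integers(0, w - cw + 1)
        crops.append(image[y:y + ch, x:x + cw])
    return crops

def membership_score(encode, image):
    """Higher score ~ stronger part responses ~ more likely a training member."""
    full = encode(image)
    part_embs = np.stack([encode(c) for c in random_part_crops(image)])
    sims = part_embs @ full / (
        np.linalg.norm(part_embs, axis=1) * np.linalg.norm(full) + 1e-12)
    return float(sims.mean())

# Toy encoder: global average pooling tiled to a fixed embedding size.
def toy_encode(img):
    pooled = img.mean(axis=(0, 1))   # (3,)
    return np.tile(pooled, 16)       # (48,) pretend embedding

image = np.random.default_rng(1).random((96, 96, 3))
print(membership_score(toy_encode, image))
```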
+
+ comment: Membership Inference, Self-supervised learning +
+
+
+
+
+ + ☆ TSNet:A Two-stage Network for Image Dehazing with Multi-scale Fusion and + Adaptive Learning + + +
+ Image dehazing has been a popular topic of research for a long time. Previous deep learning-based image dehazing methods have failed to achieve satisfactory dehazing effects on both synthetic and real-world datasets, exhibiting poor generalization. Moreover, single-stage networks often produce output images with many regions containing artifacts and color distortion. To address these issues, this paper proposes a two-stage image dehazing network called TSNet, mainly consisting of a multi-scale fusion module (MSFM) and an adaptive learning module (ALM). Specifically, MSFM and ALM enhance the generalization of TSNet. The MSFM can obtain large receptive fields at multiple scales and integrate features at different frequencies to reduce the differences between inputs and learning objectives. The ALM can actively learn regions of interest in images and restore texture details more effectively. Additionally, TSNet is designed as a two-stage network, where the first-stage network performs image dehazing, and the second-stage network is employed to reduce the artifacts and color distortion present in the results of the first stage. We also change the learning objective from ground truth images to opposite fog maps, which improves the learning efficiency of TSNet. Extensive experiments demonstrate that TSNet exhibits superior dehazing performance on both synthetic and real-world datasets compared to previous state-of-the-art methods.
+
+ comment: 12 pages, 10 figures, 7 tables +
+
+
+
+
+ + ☆ RS3Mamba: Visual State Space Model for Remote Sensing Images Semantic + Segmentation + + +
+ Semantic segmentation of remote sensing images is a fundamental task in geoscience research. However, the widely used convolutional neural networks (CNNs) and Transformers have significant shortcomings: the former is limited by its insufficient long-range modeling capability, while the latter is hampered by its computational complexity. Recently, a novel visual state space (VSS) model represented by Mamba has emerged, capable of modeling long-range relationships with linear computational complexity. In this work, we propose a novel dual-branch network named remote sensing images semantic segmentation Mamba (RS3Mamba) to incorporate this innovative technology into remote sensing tasks. Specifically, RS3Mamba utilizes VSS blocks to construct an auxiliary branch, providing additional global information to the convolution-based main branch. Moreover, considering the distinct characteristics of the two branches, we introduce a collaborative completion module (CCM) to enhance and fuse features from the dual encoder. Experimental results on two widely used datasets, ISPRS Vaihingen and LoveDA Urban, demonstrate the effectiveness and potential of the proposed RS3Mamba. To the best of our knowledge, this is the first vision Mamba specifically designed for remote sensing images semantic segmentation. The source code will be made available at https://github.com/sstary/SSRS.
+
+ comment: 5 pages, 4 figures +
+
+
+
+
+ + ☆ A Novel Approach to Breast Cancer Histopathological Image Classification + Using Cross-Colour Space Feature Fusion and Quantum-Classical Stack Ensemble + Method + + +
+ Breast cancer classification stands as a pivotal pillar in ensuring timely diagnosis and effective treatment. This study on histopathological images underscores the significance of harnessing the synergistic capabilities of colour space ensembling and quantum-classical stacking to elevate the precision of breast cancer classification. Delving into the distinct colour spaces of RGB, HSV and CIE L*u*v, the authors initiated a comprehensive investigation guided by advanced methodologies. Employing the DenseNet121 architecture for feature extraction, the authors capitalized on the robustness of Random Forest, SVM, QSVC, and VQC classifiers. This research encompasses a unique feature fusion technique within the colour space ensemble. The approach not only deepens our comprehension of breast cancer classification but also marks a milestone in personalized medical assessment. The amalgamation of quantum and classical classifiers through stacking emerges as a potent catalyst, effectively mitigating the inherent constraints of individual classifiers and paving a robust path towards more dependable and refined breast cancer identification. Through rigorous experimentation and meticulous analysis, the fusion of colour spaces such as RGB with HSV and RGB with CIE L*u*v yields a classification accuracy nearing unity. This underscores the transformative potential of our approach, where the fusion of diverse colour spaces and the synergy of quantum and classical realms converge to establish a new horizon in medical diagnostics. The implications of this research thus extend across medical disciplines, offering promising avenues for advancing diagnostic accuracy and treatment efficacy.
+
+
+
+
+ + ☆ RESSA: Repair Sparse Vision-Language Models via Sparse Cross-Modality + Adaptation + + +
+ Vision-Language Models (VLMs), integrating diverse information from multiple modalities, have shown remarkable success across various tasks. However, deploying VLMs, which comprise large-scale vision and language models, poses challenges in resource-constrained scenarios. While pruning followed by finetuning offers a potential solution to maintain performance with smaller model sizes, its application to VLMs remains relatively unexplored, presenting two main questions: how to distribute sparsity across the different modality-specific models, and how to repair the performance of pruned sparse VLMs. To answer the first question, we conducted preliminary studies on VLM pruning and found that pruning vision models and language models with the same sparsity ratios contributes to nearly optimal performance. For the second question, unlike finetuning unimodal sparse models, sparse VLMs involve cross-modality interactions, requiring specialized techniques for post-pruning performance repair. Moreover, while parameter-efficient LoRA finetuning has been proposed to repair the performance of sparse models, a significant challenge arises when merging weights, because dense LoRA modules are incompatible with sparse models and destroy the sparsity of the pruned models. To tackle these challenges, we propose to Repair Sparse Vision-Language Models via Sparse Cross-modality Adaptation (RESSA). RESSA utilizes cross-modality finetuning to enhance task-specific performance and facilitate knowledge distillation from the original dense models. Additionally, we introduce SparseLoRA, which applies sparsity directly to LoRA weights, enabling seamless integration with sparse models. Our experimental results validate the effectiveness of RESSA, showcasing significant enhancements, such as an 11.3% improvement under 2:4 sparsity and a remarkable 47.6% enhancement under unstructured 70% sparsity.
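The idea of keeping a LoRA update compatible with a pruned model can be sketched by masking the low-rank update with the sparsity pattern of the pruned weight before merging, so pruned positions stay zero. The shapes and scaling below are illustrative assumptions, not the paper's exact SparseLoRA formulation.

```python
import torch

def sparse_lora_merge(w_pruned, lora_a, lora_b, scale=1.0):
    """Merge a LoRA update into a pruned weight without destroying its sparsity.

    w_pruned: (out, in) weight with zeros at pruned positions.
    lora_a:   (r, in) down-projection; lora_b: (out, r) up-projection.
    The dense update B @ A is masked so pruned positions remain exactly zero.
    """
    mask = (w_pruned != 0).to(w_pruned.dtype)  # sparsity pattern of the pruned weight
    update = scale * (lora_b @ lora_a)         # dense low-rank update
    return w_pruned + mask * update            # masked merge preserves sparsity

# Toy example with ~70% unstructured sparsity and a rank-4 LoRA adapter.
torch.manual_seed(0)
w = torch.randn(64, 128)
w = w * (torch.rand_like(w) > 0.7)             # prune ~70% of weights to zero
a, b = torch.randn(4, 128) * 0.01, torch.randn(64, 4) * 0.01
merged = sparse_lora_merge(w, a, b)
print(torch.equal(merged == 0, w == 0))        # sparsity pattern preserved -> True
```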
+
+
+
+
+ + ☆ What Are We Measuring When We Evaluate Large Vision-Language Models? An + Analysis of Latent Factors and Biases + + +
+ Vision-language (VL) models, pretrained on colossal image-text datasets, have +attained broad VL competence that is difficult to evaluate. A common belief is +that a small number of VL skills underlie the variety of VL tests. In this +paper, we perform a large-scale transfer learning experiment aimed at +discovering latent VL skills from data. We reveal interesting characteristics +that have important implications for test suite design. First, generation tasks +suffer from a length bias, suggesting benchmarks should balance tasks with +varying output lengths. Second, we demonstrate that factor analysis +successfully identifies reasonable yet surprising VL skill factors, suggesting +benchmarks could leverage similar analyses for task selection. Finally, we +present a new dataset, OLIVE (https://github.com/jq-zh/olive-dataset), which +simulates user instructions in the wild and presents challenges dissimilar to +all datasets we tested. Our findings contribute to the design of balanced and +broad-coverage vision-language evaluation methods. + +
+
+
+
+
+ + ☆ TCLC-GS: Tightly Coupled LiDAR-Camera Gaussian Splatting for Surrounding + Autonomous Driving Scenes + + +
+ Most 3D Gaussian Splatting (3D-GS) based methods for urban scenes initialize +3D Gaussians directly with 3D LiDAR points, which not only underutilizes LiDAR +data capabilities but also overlooks the potential advantages of fusing LiDAR +with camera data. In this paper, we design a novel tightly coupled LiDAR-Camera +Gaussian Splatting (TCLC-GS) to fully leverage the combined strengths of both +LiDAR and camera sensors, enabling rapid, high-quality 3D reconstruction and +novel view RGB/depth synthesis. TCLC-GS designs a hybrid explicit (colorized 3D +mesh) and implicit (hierarchical octree feature) 3D representation derived from +LiDAR-camera data, to enrich the properties of 3D Gaussians for splatting. 3D +Gaussian's properties are not only initialized in alignment with the 3D mesh +which provides more completed 3D shape and color information, but are also +endowed with broader contextual information through retrieved octree implicit +features. During the Gaussian Splatting optimization process, the 3D mesh +offers dense depth information as supervision, which enhances the training +process by learning of a robust geometry. Comprehensive evaluations conducted +on the Waymo Open Dataset and nuScenes Dataset validate our method's +state-of-the-art (SOTA) performance. Utilizing a single NVIDIA RTX 3090 Ti, our +method demonstrates fast training and achieves real-time RGB and depth +rendering at 90 FPS in resolution of 1920x1280 (Waymo), and 120 FPS in +resolution of 1600x900 (nuScenes) in urban scenarios. + +
+
+
+
+
+ + ☆ TE-TAD: Towards Full End-to-End Temporal Action Detection via + Time-Aligned Coordinate Expression + + +
+ In this paper, we identify the normalized coordinate expression as a key factor behind the reliance on hand-crafted components in query-based detectors for temporal action detection (TAD). Despite significant advancements towards an end-to-end framework in object detection, query-based detectors have been limited in achieving full end-to-end modeling in TAD. To address this issue, we propose TE-TAD, a full end-to-end temporal action detection transformer that integrates time-aligned coordinate expression. We reformulate the coordinate expression using actual timeline values, ensuring length-invariant representations across extremely diverse video durations. Furthermore, our proposed adaptive query selection dynamically adjusts the number of queries based on video length, providing a suitable solution for varying video durations compared to a fixed query set. Our approach not only simplifies the TAD process by eliminating the need for hand-crafted components but also significantly improves the performance of query-based detectors. Our TE-TAD outperforms previous query-based detectors and achieves competitive performance compared to state-of-the-art methods on popular benchmark datasets. Code is available at: https://github.com/Dotori-HJ/TE-TAD
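The time-aligned coordinate expression amounts to representing action boundaries in absolute seconds on the video timeline rather than as duration-normalized fractions. The small sketch below contrasts the two conventions; the field names and conversion helpers are illustrative, not the paper's code.

```python
from dataclasses import dataclass

@dataclass
class ActionSegment:
    start_s: float  # absolute start time in seconds
    end_s: float    # absolute end time in seconds
    label: str

def normalized_to_time_aligned(center, width, duration_s, label):
    """Convert a duration-normalized (center, width) pair to absolute seconds."""
    start = (center - width / 2.0) * duration_s
    end = (center + width / 2.0) * duration_s
    return ActionSegment(max(0.0, start), min(duration_s, end), label)

def time_aligned_to_normalized(seg, duration_s):
    """Inverse conversion, as used by conventional normalized detectors."""
    center = (seg.start_s + seg.end_s) / (2.0 * duration_s)
    width = (seg.end_s - seg.start_s) / duration_s
    return center, width

# The same action in a 30 s clip and a 300 s clip has identical absolute extent
# but very different normalized widths, which is the issue described above.
seg = ActionSegment(12.0, 18.0, "jump")
print(time_aligned_to_normalized(seg, 30.0))   # (0.5, 0.2)
print(time_aligned_to_normalized(seg, 300.0))  # (0.05, 0.02)
```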
+
+
+
+
+ + ☆ Enhancing Diffusion-based Point Cloud Generation with Smoothness + Constraint + + +
+ Diffusion models have been popular for point cloud generation tasks. Existing works utilize the forward diffusion process to convert the original point distribution into a noise distribution and then learn the reverse diffusion process to recover the point distribution from the noise distribution. However, the reverse diffusion process can produce samples with non-smooth points on the surface because it ignores the geometric properties of the point cloud. We propose to alleviate this problem by incorporating a local smoothness constraint into the diffusion framework for point cloud generation. Experiments demonstrate that the proposed model can generate realistic shapes and smoother point clouds, outperforming multiple state-of-the-art methods.
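One common way to express such a local smoothness constraint is a k-nearest-neighbor Laplacian penalty that pulls each generated point toward the centroid of its neighbors. The formulation below is a generic sketch under that assumption, not necessarily the exact regularizer used in the paper.

```python
import numpy as np
from scipy.spatial import cKDTree

def local_smoothness_loss(points, k=8):
    """Mean squared distance between each point and the centroid of its k neighbors.

    points: (N, 3) array of generated point coordinates.
    A lower value indicates a locally smoother surface.
    """
    tree = cKDTree(points)
    # Query k+1 neighbors because the nearest neighbor is the point itself.
    _, idx = tree.query(points, k=k + 1)
    neighbor_centroids = points[idx[:, 1:]].mean(axis=1)
    return float(np.mean(np.sum((points - neighbor_centroids) ** 2, axis=1)))

# Toy check: a noisy sphere is less smooth than a clean one.
rng = np.random.default_rng(0)
sphere = rng.normal(size=(2048, 3))
sphere /= np.linalg.norm(sphere, axis=1, keepdims=True)
noisy = sphere + 0.05 * rng.normal(size=sphere.shape)
print(local_smoothness_loss(sphere) < local_smoothness_loss(noisy))  # True
```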
+
+
+
+
+ + ☆ Cohort-Individual Cooperative Learning for Multimodal Cancer Survival + Analysis + + +
+ Recently, we have witnessed impressive achievements in cancer survival analysis by integrating multimodal data, e.g., pathology images and genomic profiles. However, the heterogeneity and high dimensionality of these modalities pose significant challenges for extracting discriminative representations while maintaining good generalization. In this paper, we propose a Cohort-individual Cooperative Learning (CCL) framework to advance cancer survival analysis by combining knowledge decomposition and cohort guidance. Specifically, first, we propose a Multimodal Knowledge Decomposition (MKD) module to explicitly decompose multimodal knowledge into four distinct components: the redundancy, the synergy, and the uniqueness of each of the two modalities. Such a comprehensive decomposition can enlighten the models to perceive easily overlooked yet important information, facilitating effective multimodal fusion. Second, we propose a Cohort Guidance Modeling (CGM) module to mitigate the risk of overfitting task-irrelevant information. It promotes a more comprehensive and robust understanding of the underlying multimodal data, while avoiding the pitfalls of overfitting and enhancing the generalization ability of the model. By combining the knowledge decomposition and cohort guidance methods, we develop a robust multimodal survival analysis model with enhanced discrimination and generalization abilities. Extensive experimental results on five cancer datasets demonstrate the effectiveness of our model in integrating multimodal data for survival analysis.
+
+ comment: 10 pages, 9 figures +
+
+
+
+
+ + ☆ APC2Mesh: Bridging the gap from occluded building façades to full 3D + models + + +
+ The benefits of having digital twins of urban buildings are numerous. +However, a major difficulty encountered in their creation from airborne LiDAR +point clouds is the effective means of accurately reconstructing significant +occlusions amidst point density variations and noise. To bridge the +noise/sparsity/occlusion gap and generate high fidelity 3D building models, we +propose APC2Mesh which integrates point completion into a 3D reconstruction +pipeline, enabling the learning of dense geometrically accurate representation +of buildings. Specifically, we leveraged complete points generated from +occluded ones as input to a linearized skip attention-based deformation network +for 3D mesh reconstruction. In our experiments, conducted on 3 different +scenes, we demonstrate that: (1) APC2Mesh delivers comparatively superior +results, indicating its efficacy in handling the challenges of occluded +airborne building points of diverse styles and complexities. (2) The +combination of point completion with typical deep learning-based 3D point cloud +reconstruction methods offers a direct and effective solution for +reconstructing significantly occluded airborne building points. As such, this +neural integration holds promise for advancing the creation of digital twins +for urban buildings with greater accuracy and fidelity. + +
+
+ comment: 19 pages +
+
+
+
+
+ + ☆ CAPE: CAM as a Probabilistic Ensemble for Enhanced DNN Interpretation + + +
+ Deep Neural Networks (DNNs) are widely used for visual classification tasks, but their complex computation process and black-box nature hinder decision transparency and interpretability. Class activation maps (CAMs) and recent variants provide ways to visually explain the DNN decision-making process by displaying 'attention' heatmaps of the DNNs. Nevertheless, the CAM explanation only offers relative attention information; that is, on an attention heatmap, we can interpret which image region is more or less important than the others. However, these regions cannot be meaningfully compared across classes, and the contribution of each region to the model's class prediction is not revealed. To address these challenges and ultimately enable better DNN interpretation, in this paper we propose CAPE, a novel reformulation of CAM that provides a unified and probabilistically meaningful assessment of the contributions of image regions. We quantitatively and qualitatively compare CAPE with state-of-the-art CAM methods on the CUB and ImageNet benchmark datasets to demonstrate enhanced interpretability. We also test on a cytology imaging dataset depicting a challenging Chronic Myelomonocytic Leukemia (CMML) diagnosis problem. Code is available at: https://github.com/AIML-MED/CAPE.
+
+
+
+
+ + ☆ Enhancing Human-Computer Interaction in Chest X-ray Analysis using + Vision and Language Model with Eye Gaze Patterns + + +
+ Recent advancements in Computer Assisted Diagnosis have shown promising +performance in medical imaging tasks, particularly in chest X-ray analysis. +However, the interaction between these models and radiologists has been +primarily limited to input images. This work proposes a novel approach to +enhance human-computer interaction in chest X-ray analysis using +Vision-Language Models (VLMs) enhanced with radiologists' attention by +incorporating eye gaze data alongside textual prompts. Our approach leverages +heatmaps generated from eye gaze data, overlaying them onto medical images to +highlight areas of intense radiologist's focus during chest X-ray evaluation. +We evaluate this methodology in tasks such as visual question answering, chest +X-ray report automation, error detection, and differential diagnosis. Our +results demonstrate the inclusion of eye gaze information significantly +enhances the accuracy of chest X-ray analysis. Also, the impact of eye gaze on +fine-tuning was confirmed as it outperformed other medical VLMs in all tasks +except visual question answering. This work marks the potential of leveraging +both the VLM's capabilities and the radiologist's domain knowledge to improve +the capabilities of AI models in medical imaging, paving a novel way for +Computer Assisted Diagnosis with a human-centred AI. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ Utilizing Computer Vision for Continuous Monitoring of Vaccine Side + Effects in Experimental Mice + + +
+ The demand for improved efficiency and accuracy in vaccine safety assessments is increasing. Here, we explore the application of computer vision technologies to automate the monitoring of experimental mice for potential side effects after vaccine administration. Traditional observation methods are labor-intensive and lack the capability for continuous monitoring. By deploying a computer vision system, our research aims to improve the efficiency and accuracy of vaccine safety assessments. The methodology involves training machine learning models on annotated video data of mouse behavior before and after vaccination. Preliminary results indicate that computer vision effectively identifies subtle changes that signal possible side effects. Therefore, our approach has the potential to significantly enhance the monitoring process in animal vaccine trials, providing a practical solution to the limitations of human observation.
+
+ comment: 1 figure +
+
+
+
+
+ + ☆ LVLM-Intrepret: An Interpretability Tool for Large Vision-Language + Models + + +
+ In the rapidly evolving landscape of artificial intelligence, multi-modal +large language models are emerging as a significant area of interest. These +models, which combine various forms of data input, are becoming increasingly +popular. However, understanding their internal mechanisms remains a complex +task. Numerous advancements have been made in the field of explainability tools +and mechanisms, yet there is still much to explore. In this work, we present a +novel interactive application aimed towards understanding the internal +mechanisms of large vision-language models. Our interface is designed to +enhance the interpretability of the image patches, which are instrumental in +generating an answer, and assess the efficacy of the language model in +grounding its output in the image. With our application, a user can +systematically investigate the model and uncover system limitations, paving the +way for enhancements in system capabilities. Finally, we present a case study +of how our application can aid in understanding failure mechanisms in a popular +large multi-modal model: LLaVA. + +
+
+
+
+
+ + ☆ Ego-Motion Aware Target Prediction Module for Robust Multi-Object + Tracking IROS2024 + + +
+ Multi-object tracking (MOT) is a prominent task in computer vision with +application in autonomous driving, responsible for the simultaneous tracking of +multiple object trajectories. Detection-based multi-object tracking (DBT) +algorithms detect objects using an independent object detector and predict the +imminent location of each target. Conventional prediction methods in DBT +utilize Kalman Filter(KF) to extrapolate the target location in the upcoming +frames by supposing a constant velocity motion model. These methods are +especially hindered in autonomous driving applications due to dramatic camera +motion or unavailable detections. Such limitations lead to tracking failures +manifested by numerous identity switches and disrupted trajectories. In this +paper, we introduce a novel KF-based prediction module called the Ego-motion +Aware Target Prediction (EMAP) module by focusing on the integration of camera +motion and depth information with object motion models. Our proposed method +decouples the impact of camera rotational and translational velocity from the +object trajectories by reformulating the Kalman Filter. This reformulation +enables us to reject the disturbances caused by camera motion and maximizes the +reliability of the object motion model. We integrate our module with four +state-of-the-art base MOT algorithms, namely OC-SORT, Deep OC-SORT, ByteTrack, +and BoT-SORT. In particular, our evaluation on the KITTI MOT dataset +demonstrates that EMAP remarkably drops the number of identity switches (IDSW) +of OC-SORT and Deep OC-SORT by 73% and 21%, respectively. At the same time, it +elevates other performance metrics such as HOTA by more than 5%. Our source +code is available at https://github.com/noyzzz/EMAP. + +
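A rough sketch of the ego-motion-aware prediction step described above: transform the previous object estimate from the previous camera frame into the current camera frame using the known ego-motion, then apply the usual constant-velocity prediction. This simplified 3D-position version only illustrates the decoupling idea; the paper's actual Kalman Filter reformulation over tracking states is more involved.

```python
import numpy as np

def ego_compensate(position, velocity, R_cur_prev, t_cur_prev):
    """Re-express an object's position/velocity in the current camera frame.

    R_cur_prev, t_cur_prev: rotation and translation taking points from the
    previous camera frame to the current one (from odometry / ego-motion).
    """
    pos_cur = R_cur_prev @ position + t_cur_prev
    vel_cur = R_cur_prev @ velocity   # directions rotate but are not translated
    return pos_cur, vel_cur

def predict(position, velocity, R_cur_prev, t_cur_prev, dt=0.1):
    """Ego-motion-aware constant-velocity prediction of the object position."""
    pos_cur, vel_cur = ego_compensate(position, velocity, R_cur_prev, t_cur_prev)
    return pos_cur + dt * vel_cur, vel_cur

# Toy example: the ego vehicle moved 1 m forward (+z) between frames,
# so static scene points appear 1 m closer along z in the current frame.
R = np.eye(3)
t = np.array([0.0, 0.0, -1.0])
pos, vel = np.array([2.0, 0.0, 10.0]), np.array([0.0, 0.0, -5.0])
print(predict(pos, vel, R, t))
```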
+
+ comment: 7 pages, 4 figures, submitted to IROS2024 +
+
+
+
+
+ + ☆ Many-to-many Image Generation with Auto-regressive Diffusion Models + + +
+ Recent advancements in image generation have made significant progress, yet +existing models present limitations in perceiving and generating an arbitrary +number of interrelated images within a broad context. This limitation becomes +increasingly critical as the demand for multi-image scenarios, such as +multi-view images and visual narratives, grows with the expansion of multimedia +platforms. This paper introduces a domain-general framework for many-to-many +image generation, capable of producing interrelated image series from a given +set of images, offering a scalable solution that obviates the need for +task-specific solutions across different multi-image scenarios. To facilitate +this, we present MIS, a novel large-scale multi-image dataset, containing 12M +synthetic multi-image samples, each with 25 interconnected images. Utilizing +Stable Diffusion with varied latent noises, our method produces a set of +interconnected images from a single caption. Leveraging MIS, we learn M2M, an +autoregressive model for many-to-many generation, where each image is modeled +within a diffusion framework. Throughout training on the synthetic MIS, the +model excels in capturing style and content from preceding images - synthetic +or real - and generates novel images following the captured patterns. +Furthermore, through task-specific fine-tuning, our model demonstrates its +adaptability to various multi-image generation tasks, including Novel View +Synthesis and Visual Procedure Generation. + +
+
+
+
+
+ + ☆ SalFoM: Dynamic Saliency Prediction with Video Foundation Models + + +
+ Recent advancements in video saliency prediction (VSP) have shown promising performance compared to the human visual system, whose emulation is the primary goal of VSP. However, current state-of-the-art models employ spatio-temporal transformers trained on limited amounts of data, hindering their generalizability and adaptation to downstream tasks. Vision foundation models present a potential solution for improving the VSP process, but adapting image foundation models to the video domain presents significant challenges in modeling scene dynamics and capturing temporal information. To address these challenges, and as the first initiative to design a VSP model based on video foundation models, we introduce SalFoM, a novel encoder-decoder video transformer architecture. Our model employs UnMasked Teacher (UMT) as the feature extractor and presents a heterogeneous decoder which features a locality-aware spatio-temporal transformer and integrates local and global spatio-temporal information from various perspectives to produce the final saliency map. Our qualitative and quantitative experiments on the challenging VSP benchmark datasets DHF1K, Hollywood-2 and UCF-Sports demonstrate the superiority of our proposed model in comparison with the state-of-the-art methods.
+
+ comment: 15 pages, 4 figures +
+
+
+
+
+ + ☆ Behind the Veil: Enhanced Indoor 3D Scene Reconstruction with Occluded + Surfaces Completion + + +
+ In this paper, we present a novel indoor 3D reconstruction method with +occluded surface completion, given a sequence of depth readings. Prior +state-of-the-art (SOTA) methods only focus on the reconstruction of the visible +areas in a scene, neglecting the invisible areas due to the occlusions, e.g., +the contact surface between furniture, occluded wall and floor. Our method +tackles the task of completing the occluded scene surfaces, resulting in a +complete 3D scene mesh. The core idea of our method is learning 3D geometry +prior from various complete scenes to infer the occluded geometry of an unseen +scene from solely depth measurements. We design a coarse-fine hierarchical +octree representation coupled with a dual-decoder architecture, i.e., +Geo-decoder and 3D Inpainter, which jointly reconstructs the complete 3D scene +geometry. The Geo-decoder with detailed representation at fine levels is +optimized online for each scene to reconstruct visible surfaces. The 3D +Inpainter with abstract representation at coarse levels is trained offline +using various scenes to complete occluded surfaces. As a result, while the +Geo-decoder is specialized for an individual scene, the 3D Inpainter can be +generally applied across different scenes. We evaluate the proposed method on +the 3D Completed Room Scene (3D-CRS) and iTHOR datasets, significantly +outperforming the SOTA methods by a gain of 16.8% and 24.2% in terms of the +completeness of 3D reconstruction. 3D-CRS dataset including a complete 3D mesh +of each scene is provided at project webpage. + +
+
+
+
+
+ + ☆ Self-supervised 6-DoF Robot Grasping by Demonstration via Augmented + Reality Teleoperation System + + +
+ Most existing 6-DoF robot grasping solutions depend on strong supervision of grasp poses to ensure satisfactory performance, which can be laborious and impractical when the robot works in a restricted area. To this end, we propose a self-supervised 6-DoF grasp pose detection framework via an Augmented Reality (AR) teleoperation system that can efficiently learn from human demonstrations and provide 6-DoF grasp poses without grasp pose annotations. Specifically, the system collects human demonstrations in the AR environment and contrastively learns the grasping strategy from these demonstrations. In real-world experiments, the proposed system achieves satisfactory grasping performance and learns to grasp unknown objects within three demonstrations.
+
+
+
+
+ + ☆ Linear Anchored Gaussian Mixture Model for Location and Width + Computation of Objects in Thick Line Shape + + +
+ Accurate detection of the centerlines of linear objects is a challenging topic in many sensitive real-world applications such as X-ray imaging, remote sensing and lane marking detection in road traffic. Model-based approaches using Hough and Radon transforms are often used but are not recommended for thick line detection, whereas approaches based on image derivatives need further step-by-step processing, making their efficiency dependent on the outcome of each step. In this paper, we aim to detect linear structures in images by considering the 3D representation of the image gray levels as a finite mixture model of statistical distributions. The latter, which we name the linear anchored Gaussian distribution, is parametrized by a scale value σ describing the linear structure thickness and by a line equation, parametrized in turn by a radius ρ and an orientation angle θ, describing the location of the linear structure centerline. The Expectation-Maximization (EM) algorithm is used for the mixture model parameter estimation, where a new paradigm, using background subtraction for the likelihood function computation, is proposed. For the EM algorithm, two θ parameter initialization schemes are used: the first is based on a random choice of the first component of the θ vector, whereas the second is based on the image Hessian with a simultaneous computation of the number of mixture model components. Experiments on real-world images and synthetic images corrupted by blur and additive noise show the good performance of the proposed methods, where the algorithm using background subtraction and Hessian-based θ initialization provides outstanding accuracy of linear structure detection despite irregular image backgrounds and the presence of blur and noise.
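Interpreting the description above, a "linear anchored Gaussian" component weights each pixel by a Gaussian of its signed distance to the line x·cosθ + y·sinθ = ρ, with σ controlling the line thickness. The sketch below evaluates such a component and a simple E-step-style responsibility; the exact parametrization and likelihood in the paper may differ.

```python
import numpy as np

def linear_anchored_gaussian(shape, rho, theta, sigma):
    """Unnormalized density of a line-shaped Gaussian over an image grid.

    The line is x*cos(theta) + y*sin(theta) = rho (Hesse normal form);
    sigma controls the apparent thickness of the structure.
    """
    h, w = shape
    ys, xs = np.mgrid[0:h, 0:w]
    signed_dist = xs * np.cos(theta) + ys * np.sin(theta) - rho
    return np.exp(-0.5 * (signed_dist / sigma) ** 2)

def responsibility_weights(image, components):
    """E-step-style weights: how strongly each pixel's gray level belongs to each line."""
    densities = np.stack([linear_anchored_gaussian(image.shape, *c) for c in components])
    densities *= image[None]  # weight by gray level, mirroring the mixture-of-gray-levels view
    return densities / (densities.sum(axis=0, keepdims=True) + 1e-12)

# Toy example: two candidate lines on a 64x64 image.
img = np.random.default_rng(0).random((64, 64))
comps = [(20.0, 0.3, 2.0), (40.0, 1.2, 3.0)]     # (rho, theta, sigma)
print(responsibility_weights(img, comps).shape)  # (2, 64, 64)
```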
+
+ comment: 13 pages, 13 figures +
+
+
+
+
+ + ☆ AWOL: Analysis WithOut synthesis using Language + + +
+ Many classical parametric 3D shape models exist, but creating novel shapes +with such models requires expert knowledge of their parameters. For example, +imagine creating a specific type of tree using procedural graphics or a new +kind of animal from a statistical shape model. Our key idea is to leverage +language to control such existing models to produce novel shapes. This involves +learning a mapping between the latent space of a vision-language model and the +parameter space of the 3D model, which we do using a small set of shape and +text pairs. Our hypothesis is that mapping from language to parameters allows +us to generate parameters for objects that were never seen during training. If +the mapping between language and parameters is sufficiently smooth, then +interpolation or generalization in language should translate appropriately into +novel 3D shapes. We test our approach with two very different types of +parametric shape models (quadrupeds and arboreal trees). We use a learned +statistical shape model of quadrupeds and show that we can use text to generate +new animals not present during training. In particular, we demonstrate +state-of-the-art shape estimation of 3D dogs. This work also constitutes the +first language-driven method for generating 3D trees. Finally, embedding images +in the CLIP latent space enables us to generate animals and trees directly from +images. + +
+
+
+
+
+ + ☆ BCAmirs at SemEval-2024 Task 4: Beyond Words: A Multimodal and + Multilingual Exploration of Persuasion in Memes + + +
+ Memes, combining text and images, frequently use metaphors to convey +persuasive messages, shaping public opinion. Motivated by this, our team +engaged in SemEval-2024 Task 4, a hierarchical multi-label classification task +designed to identify rhetorical and psychological persuasion techniques +embedded within memes. To tackle this problem, we introduced a caption +generation step to assess the modality gap and the impact of additional +semantic information from images, which improved our result. Our best model +utilizes GPT-4 generated captions alongside meme text to fine-tune RoBERTa as +the text encoder and CLIP as the image encoder. It outperforms the baseline by +a large margin in all 12 subtasks. In particular, it ranked in top-3 across all +languages in Subtask 2a, and top-4 in Subtask 2b, demonstrating quantitatively +strong performance. The improvement achieved by the introduced intermediate +step is likely attributable to the metaphorical essence of images that +challenges visual encoders. This highlights the potential for improving +abstract visual semantics encoding. + +
+
+ comment: 11 pages, 5 tables, 2 figures, Proceedings of the 18th International + Workshop on Semantic Evaluation (SemEval-2024) @ NAACL 2024 +
+
+
+
+
+ + ☆ DPFT: Dual Perspective Fusion Transformer for Camera-Radar-based Object + Detection + + +
+ The perception of autonomous vehicles has to be efficient, robust, and +cost-effective. However, cameras are not robust against severe weather +conditions, lidar sensors are expensive, and the performance of radar-based +perception is still inferior to the others. Camera-radar fusion methods have +been proposed to address this issue, but these are constrained by the typical +sparsity of radar point clouds and often designed for radars without elevation +information. We propose a novel camera-radar fusion approach called Dual +Perspective Fusion Transformer (DPFT), designed to overcome these limitations. +Our method leverages lower-level radar data (the radar cube) instead of the +processed point clouds to preserve as much information as possible and employs +projections in both the camera and ground planes to effectively use radars with +elevation information and simplify the fusion with camera data. As a result, +DPFT has demonstrated state-of-the-art performance on the K-Radar dataset while +showing remarkable robustness against adverse weather conditions and +maintaining a low inference time. The code is made available as open-source +software under https://github.com/TUMFTM/DPFT. + +
+
+
+
+
+ + ☆ Skeleton Recall Loss for Connectivity Conserving and Resource Efficient + Segmentation of Thin Tubular Structures + + +
+ Accurately segmenting thin tubular structures, such as vessels, nerves, roads +or concrete cracks, is a crucial task in computer vision. Standard deep +learning-based segmentation loss functions, such as Dice or Cross-Entropy, +focus on volumetric overlap, often at the expense of preserving structural +connectivity or topology. This can lead to segmentation errors that adversely +affect downstream tasks, including flow calculation, navigation, and structural +inspection. Although current topology-focused losses mark an improvement, they +introduce significant computational and memory overheads. This is particularly +relevant for 3D data, rendering these losses infeasible for larger volumes as +well as increasingly important multi-class segmentation problems. To mitigate +this, we propose a novel Skeleton Recall Loss, which effectively addresses +these challenges by circumventing intensive GPU-based calculations with +inexpensive CPU operations. It demonstrates overall superior performance to +current state-of-the-art approaches on five public datasets for +topology-preserving segmentation, while substantially reducing computational +overheads by more than 90%. In doing so, we introduce the first multi-class +capable loss function for thin structure segmentation, excelling in both +efficiency and efficacy for topology-preservation. + +
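Based on the description above, the loss can be sketched as a soft recall of the predicted probabilities on a skeleton precomputed from the ground-truth mask on the CPU. The tubed-skeleton details and multi-class handling of the paper are omitted; this is a minimal single-class illustration.

```python
import numpy as np
from skimage.morphology import skeletonize

def skeleton_recall_loss(pred_probs, gt_mask, eps=1e-6):
    """1 - soft recall of the prediction on the skeleton of the ground truth.

    pred_probs: (H, W) predicted foreground probabilities in [0, 1].
    gt_mask:    (H, W) binary ground-truth mask.
    The skeleton is computed once on CPU, which is the inexpensive step that
    replaces costly topology-aware GPU computations.
    """
    skeleton = skeletonize(gt_mask.astype(bool)).astype(np.float32)
    if skeleton.sum() == 0:  # no thin structure in this sample
        return 0.0
    recall = (pred_probs * skeleton).sum() / (skeleton.sum() + eps)
    return float(1.0 - recall)

# Toy example: a thin diagonal "vessel" and an imperfect prediction.
gt = np.zeros((64, 64), dtype=np.uint8)
for i in range(10, 54):
    gt[i, i - 2:i + 2] = 1  # roughly 4-pixel-wide tubular structure
pred = np.clip(gt.astype(np.float32)
               + 0.1 * np.random.default_rng(0).normal(size=gt.shape), 0, 1)
print(skeleton_recall_loss(pred, gt))  # small value: skeleton is mostly recalled
```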
+
+
+
+
+ + ☆ MeshBrush: Painting the Anatomical Mesh with Neural Stylization for + Endoscopy + + +
+ Style transfer is a promising approach to close the sim-to-real gap in +medical endoscopy. Rendering realistic endoscopic videos by traversing +pre-operative scans (such as MRI or CT) can generate realistic simulations as +well as ground truth camera poses and depth maps. Although image-to-image (I2I) +translation models such as CycleGAN perform well, they are unsuitable for +video-to-video synthesis due to the lack of temporal consistency, resulting in +artifacts between frames. We propose MeshBrush, a neural mesh stylization +method to synthesize temporally consistent videos with differentiable +rendering. MeshBrush uses the underlying geometry of patient imaging data while +leveraging existing I2I methods. With learned per-vertex textures, the stylized +mesh guarantees consistency while producing high-fidelity outputs. We +demonstrate that mesh stylization is a promising approach for creating +realistic simulations for downstream tasks such as training and preoperative +planning. Although our method is tested and designed for ureteroscopy, its +components are transferable to general endoscopic and laparoscopic procedures. + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ ASAP: Interpretable Analysis and Summarization of AI-generated Image + Patterns at Scale + + +
+ Generative image models have emerged as a promising technology to produce realistic images. Despite potential benefits, concerns grow about their misuse, particularly in generating deceptive images that could raise significant ethical, legal, and societal issues. Consequently, there is a growing demand to empower users to effectively discern and comprehend patterns of AI-generated images. To this end, we developed ASAP, an interactive visualization system that automatically extracts distinct patterns of AI-generated images and allows users to interactively explore them via various views. To uncover fake patterns, ASAP introduces a novel image encoder, adapted from CLIP, which transforms images into compact "distilled" representations enriched with information for differentiating authentic and fake images. These representations generate gradients that propagate back to the attention maps of CLIP's transformer block. This process quantifies the relative importance of each pixel to image authenticity or fakeness, exposing key deceptive patterns. ASAP enables interactive analysis of these patterns at scale through multiple coordinated visualizations. This includes a representation overview with innovative cell glyphs to aid in the exploration and qualitative evaluation of fake patterns across a vast array of images, as well as a pattern view that displays authenticity-indicating patterns in images and quantifies their impact. ASAP supports the analysis of cutting-edge generative models with the latest architectures, including GAN-based models like proGAN and diffusion models like the latent diffusion model. We demonstrate ASAP's usefulness through two usage scenarios using multiple fake image detection benchmark datasets, revealing its ability to identify and understand hidden patterns in AI-generated images, especially in detecting fake human faces produced by diffusion-based techniques.
+
+ comment: 9 pages, 6 figures +
+
+
+
+
+ + ☆ Scaling Laws for Galaxy Images + + +
+ We present the first systematic investigation of supervised scaling laws +outside of an ImageNet-like context - on images of galaxies. We use 840k galaxy +images and over 100M annotations by Galaxy Zoo volunteers, comparable in scale +to Imagenet-1K. We find that adding annotated galaxy images provides a power +law improvement in performance across all architectures and all tasks, while +adding trainable parameters is effective only for some (typically more +subjectively challenging) tasks. We then compare the downstream performance of +finetuned models pretrained on either ImageNet-12k alone vs. additionally +pretrained on our galaxy images. We achieve an average relative error rate +reduction of 31% across 5 downstream tasks of scientific interest. Our +finetuned models are more label-efficient and, unlike their +ImageNet-12k-pretrained equivalents, often achieve linear transfer performance +equal to that of end-to-end finetuning. We find relatively modest additional +downstream benefits from scaling model size, implying that scaling alone is not +sufficient to address our domain gap, and suggest that practitioners with +qualitatively different images might benefit more from in-domain adaption +followed by targeted downstream labelling. + +
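A power-law improvement of the kind reported above is typically checked by fitting error = a·N^(-b) to (dataset size, error) pairs via a linear fit in log-log space. The sketch below shows that standard fit; the numbers are synthetic placeholders, not results from the paper.

```python
import numpy as np

def fit_power_law(n_samples, errors):
    """Fit error ~ a * n^(-b) by linear regression in log-log space."""
    slope, intercept = np.polyfit(np.log(n_samples), np.log(errors), deg=1)
    return np.exp(intercept), -slope  # (a, b)

def predict_error(a, b, n):
    """Extrapolate the fitted power law to a new dataset size."""
    return a * np.asarray(n, dtype=float) ** (-b)

# Synthetic example: error shrinking with the number of annotated images.
n = np.array([1e4, 3e4, 1e5, 3e5, 8.4e5])
err = 0.9 * n ** -0.15 * np.exp(0.02 * np.random.default_rng(0).normal(size=n.size))
a, b = fit_power_law(n, err)
print(round(b, 3))               # close to the true exponent 0.15
print(predict_error(a, b, 2e6))  # extrapolated error at 2M images
```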
+
+ comment: 10+6 pages, 12 figures. Appendix C2 based on arxiv:2206.11927. Code, + demos, documentation at https://github.com/mwalmsley/zoobot +
+
+
+
+
+ + ♻ ☆ FreeZe: Training-free zero-shot 6D pose estimation with geometric and + vision foundation models + + +
+ Estimating the 6D pose of objects unseen during training is highly desirable +yet challenging. Zero-shot object 6D pose estimation methods address this +challenge by leveraging additional task-specific supervision provided by +large-scale, photo-realistic synthetic datasets. However, their performance +heavily depends on the quality and diversity of rendered data and they require +extensive training. In this work, we show how to tackle the same task but +without training on specific data. We propose FreeZe, a novel solution that +harnesses the capabilities of pre-trained geometric and vision foundation +models. FreeZe leverages 3D geometric descriptors learned from unrelated 3D +point clouds and 2D visual features learned from web-scale 2D images to +generate discriminative 3D point-level descriptors. We then estimate the 6D +pose of unseen objects by 3D registration based on RANSAC. We also introduce a +novel algorithm to solve ambiguous cases due to geometrically symmetric objects +that is based on visual features. We comprehensively evaluate FreeZe across the +seven core datasets of the BOP Benchmark, which include over a hundred 3D +objects and 20,000 images captured in various scenarios. FreeZe consistently +outperforms all state-of-the-art approaches, including competitors extensively +trained on synthetic 6D pose estimation data. Code will be publicly available +at https://andreacaraffa.github.io/freeze. + +
+
+
+
+
+ + ♻ ☆ Total Selfie: Generating Full-Body Selfies + + +
+ We present a method to generate full-body selfies from photographs originally taken at arm's length. Because self-captured photos are typically taken close up, they have a limited field of view and an exaggerated perspective that distorts facial shapes. We instead seek to generate the photo someone else would take of you from a few feet away. Our approach takes as input four selfies of your face and body along with a background image, and generates a full-body selfie in a desired target pose. We introduce a novel diffusion-based approach to combine all of this information into high-quality, well-composed photos of you with the desired pose and background.
+
+ comment: Project page: + https://homes.cs.washington.edu/~boweiche/project_page/totalselfie/ +
+
+
+
+
+ + ♻ ☆ G3DR: Generative 3D Reconstruction in ImageNet CVPR 2024 + + +
+ We introduce a novel 3D generative method, Generative 3D Reconstruction (G3DR) in ImageNet, capable of generating diverse and high-quality 3D objects from single images, addressing the limitations of existing methods. At the heart of our framework is a novel depth regularization technique that enables the generation of scenes with high geometric fidelity. G3DR also leverages a pretrained language-vision model, such as CLIP, to enable reconstruction in novel views and improve the visual realism of generations. Additionally, G3DR designs a simple but effective sampling procedure to further improve the quality of generations. G3DR offers diverse and efficient 3D asset generation based on class or text conditioning. Despite its simplicity, G3DR is able to beat state-of-the-art methods, improving over them by up to 22% in perceptual metrics and 90% in geometry scores, while needing only half of the training time. Code is available at https://github.com/preddy5/G3DR
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Quantifying and Mitigating Unimodal Biases in Multimodal Large Language + Models: A Causal Perspective + + +
+ Recent advancements in Large Language Models (LLMs) have facilitated the +development of Multimodal LLMs (MLLMs). Despite their impressive capabilities, +MLLMs often suffer from an over-reliance on unimodal biases (e.g., language +bias and vision bias), leading to incorrect answers in complex multimodal +tasks. To investigate this issue, we propose a causal framework to interpret +the biases in Visual Question Answering (VQA) problems. Within our framework, +we devise a causal graph to elucidate the predictions of MLLMs on VQA problems, +and assess the causal effect of biases through an in-depth causal analysis. +Motivated by the causal graph, we introduce a novel MORE dataset, consisting of +12,000 VQA instances. This dataset is designed to challenge MLLMs' abilities, +necessitating multi-hop reasoning and the surmounting of unimodal biases. +Furthermore, we propose two strategies to mitigate unimodal biases and enhance +MLLMs' reasoning capabilities, including a Decompose-Verify-Answer (DeVA) +framework for limited-access MLLMs and the refinement of open-source MLLMs +through fine-tuning. Extensive quantitative and qualitative experiments offer +valuable insights for future research. Our project page is at +https://opencausalab.github.io/MORE. + +
+
+
+
+
+ + ♻ ☆ Learning Object State Changes in Videos: An Open-World Perspective CVPR 2024 + + +
+ Object State Changes (OSCs) are pivotal for video understanding. While humans +can effortlessly generalize OSC understanding from familiar to unknown objects, +current approaches are confined to a closed vocabulary. Addressing this gap, we +introduce a novel open-world formulation for the video OSC problem. The goal is +to temporally localize the three stages of an OSC -- the object's initial +state, its transitioning state, and its end state -- whether or not the object +has been observed during training. Towards this end, we develop VidOSC, a +holistic learning approach that: (1) leverages text and vision-language models +for supervisory signals to obviate manually labeling OSC training data, and (2) +abstracts fine-grained shared state representations from objects to enhance +generalization. Furthermore, we present HowToChange, the first open-world +benchmark for video OSC localization, which offers an order of magnitude +increase in the label space and annotation volume compared to the best existing +benchmark. Experimental results demonstrate the efficacy of our approach, in +both traditional closed-world and open-world scenarios. + +
+
+ comment: Accepted by CVPR 2024, Project website: + https://vision.cs.utexas.edu/projects/VidOSC/ +
+
+
+
+
+ + ♻ ☆ AddSR: Accelerating Diffusion-based Blind Super-Resolution with + Adversarial Diffusion Distillation + + +
+ Blind super-resolution methods based on stable diffusion showcase formidable generative capabilities in reconstructing clear high-resolution images with intricate details from low-resolution inputs. However, their practical applicability is often hampered by poor efficiency, stemming from the requirement of hundreds or thousands of sampling steps. Inspired by the efficient text-to-image approach adversarial diffusion distillation (ADD), we design AddSR to address this issue by incorporating the ideas of both distillation and ControlNet. Specifically, we first propose a prediction-based self-refinement strategy to provide high-frequency information in the student model output at marginal additional time cost, and refine the training process by employing HR images, rather than LR images, to regulate the teacher model, providing a more robust constraint for distillation. Second, we introduce a timestep-adapting loss to address the perception-distortion imbalance problem introduced by ADD. Extensive experiments demonstrate that our AddSR generates better restoration results while achieving faster speed than previous SD-based state-of-the-art models (e.g., 7x faster than SeeSR).
+
+
+
+
+
+ + ♻ ☆ Your Student is Better Than Expected: Adaptive Teacher-Student + Collaboration for Text-Conditional Diffusion Models CVPR2024 + + +
+ Knowledge distillation methods have recently been shown to be a promising direction for speeding up the synthesis of large-scale diffusion models by requiring only a few inference steps. While several powerful distillation methods were recently proposed, the overall quality of student samples is typically lower compared to the teacher ones, which hinders their practical usage. In this work, we investigate the relative quality of samples produced by the teacher text-to-image diffusion model and its distilled student version. As our main empirical finding, we discover that a noticeable portion of student samples exhibit superior fidelity compared to the teacher ones, despite the "approximate" nature of the student. Based on this finding, we propose an adaptive collaboration between student and teacher diffusion models for effective text-to-image synthesis. Specifically, the distilled model produces the initial sample, and then an oracle decides whether it needs further improvement by the slow teacher model. Extensive experiments demonstrate that the designed pipeline surpasses state-of-the-art text-to-image alternatives for various inference budgets in terms of human preference. Furthermore, the proposed approach can be naturally used in popular applications such as text-guided image editing and controllable generation.
+
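A hedged sketch of the adaptive collaboration just described: the distilled student drafts a sample and an oracle decides whether the slow teacher should refine it. The objects student, teacher, and oracle_prefers_student, and the refinement call, are placeholders rather than the paper's API.

def generate(prompt, student, teacher, oracle_prefers_student, refine_strength=0.6):
    draft = student.sample(prompt, steps=4)        # cheap few-step distilled model
    if oracle_prefers_student(prompt, draft):      # e.g. an image-quality / preference scorer
        return draft                               # keep the student sample, skip the teacher
    # expensive path: the teacher re-noises and re-denoises the draft (SDEdit-style refinement)
    return teacher.refine(prompt, init_image=draft, strength=refine_strength, steps=50)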
+
+ comment: CVPR2024 camera ready +
+
+
+
+
+ + ♻ ☆ ElasticLaneNet: An Efficient Geometry-Flexible Approach for Lane + Detection + + +
+ The task of lane detection involves identifying the boundaries of driving areas in real time. Recognizing lanes with variable and complex geometric structures remains a challenge. In this paper, we explore a novel and flexible implicit lane representation named Elastic Lane map (ELM), and introduce an efficient physics-informed end-to-end lane detection framework, namely ElasticLaneNet (Elastic interaction energy-informed Lane detection Network). The approach considers predicted lanes as moving zero-contours on the flexibly shaped ELM that are attracted to the ground truth, guided by an elastic interaction energy loss function (EIE loss). Our framework integrates global information and low-level features well. The method performs well in complex lane scenarios, including those with large curvature, weak geometry features at intersections, complicated cross lanes, Y-shaped lanes, dense lanes, etc. We apply our approach to three datasets: SDLane, CULane, and TuSimple. The results demonstrate the exceptional performance of our method, with state-of-the-art results on the structurally diverse SDLane, achieving an F1-score of 89.51, a recall of 87.50, and a precision of 91.61 with fast inference speed.
+
+
+
+
+
+ + ♻ Dynamic LiDAR Re-simulation using Compositional Neural Fields + + +
+ We introduce DyNFL, a novel neural field-based approach for high-fidelity +re-simulation of LiDAR scans in dynamic driving scenes. DyNFL processes LiDAR +measurements from dynamic environments, accompanied by bounding boxes of moving +objects, to construct an editable neural field. This field, comprising +separately reconstructed static background and dynamic objects, allows users to +modify viewpoints, adjust object positions, and seamlessly add or remove +objects in the re-simulated scene. A key innovation of our method is the neural +field composition technique, which effectively integrates reconstructed neural +assets from various scenes through a ray drop test, accounting for occlusions +and transparent surfaces. Our evaluation with both synthetic and real-world +environments demonstrates that DyNFL substantially improves dynamic scene LiDAR +simulation, offering a combination of physical fidelity and flexible editing +capabilities. + +
+
+ comment: Project page: https://shengyuh.github.io/dynfl +
+
+
+
+
+ + ♻ ☆ Three Heads Are Better Than One: Complementary Experts for Long-Tailed + Semi-supervised Learning + + +
+ We address the challenging problem of Long-Tailed Semi-Supervised Learning (LTSSL), where labeled data exhibit an imbalanced class distribution and unlabeled data follow an unknown distribution. Unlike in balanced SSL, the generated pseudo-labels are skewed towards head classes, intensifying the training bias. This phenomenon is further amplified, as more unlabeled data will be mislabeled as head classes when the class distributions of the labeled and unlabeled datasets are mismatched. To solve this problem, we propose a novel method named ComPlementary Experts (CPE). Specifically, we train multiple experts to model various class distributions, each of them yielding high-quality pseudo-labels within one form of class distribution. Besides, we introduce Classwise Batch Normalization for CPE to avoid performance degradation caused by feature distribution mismatch between head and non-head classes. CPE achieves state-of-the-art performance on the CIFAR-10-LT, CIFAR-100-LT, and STL-10-LT benchmarks. For instance, on CIFAR-10-LT, CPE improves test accuracy by over 2.22% compared to baselines. Code is available at https://github.com/machengcheng2016/CPE-LTSSL.
+
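One plausible reading of the Classwise Batch Normalization idea, sketched in PyTorch: keep separate BN statistics for head-class and non-head-class batches. The two-group split and the module interface are assumptions for illustration only.

import torch
import torch.nn as nn

class ClasswiseBatchNorm2d(nn.Module):
    def __init__(self, num_features, num_groups=2):
        super().__init__()
        # one independent BatchNorm branch per class group (e.g. head vs. non-head)
        self.bns = nn.ModuleList(nn.BatchNorm2d(num_features) for _ in range(num_groups))

    def forward(self, x, group_id):
        # route the batch through the BN branch of its (pseudo-)label group
        return self.bns[group_id](x)

# usage: features from head-class batches use branch 0, tail-class batches branch 1
bn = ClasswiseBatchNorm2d(64)
head_feats = torch.randn(8, 64, 16, 16)
out = bn(head_feats, group_id=0)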
+
+ comment: Accepted by AAAI2024 +
+
+
+
+
+ + ♻ ☆ Elastic Interaction Energy-Informed Real-Time Traffic Scene Perception + + +
+ Urban segmentation and lane detection are two important tasks for traffic scene perception. Accuracy and fast inference speed of visual perception are crucial for autonomous driving safety. Fine and complex geometric objects are the most challenging but important recognition targets in traffic scenes, such as pedestrians, traffic signs, and lanes. In this paper, a simple and efficient topology-aware energy loss function-based network training strategy named EIEGSeg is proposed. EIEGSeg is designed for multi-class segmentation in real-time traffic scene perception. To be specific, the convolutional neural network (CNN) extracts image features and produces multiple outputs, and the elastic interaction energy loss function (EIEL) drives the predictions toward the ground truth until they completely overlap. Our strategy performs especially well on fine-scale structures: small or irregularly shaped objects can be identified more accurately, and discontinuity issues on slender objects are reduced. We quantitatively and qualitatively analyze our method on three traffic datasets: the urban scene segmentation dataset Cityscapes and the lane detection datasets TuSimple and CULane. Our results demonstrate that EIEGSeg consistently improves performance, especially on real-time, lightweight networks that are better suited for autonomous driving.
+
+
+
+
+
+ + ♻ ☆ Strengthening Multimodal Large Language Model with Bootstrapped + Preference Optimization + + +
+ Multimodal Large Language Models (MLLMs) excel in generating responses based on visual inputs. However, they often suffer from a bias towards generating responses similar to their pretraining corpus, overshadowing the importance of visual information. We treat this bias as a "preference" for pretraining statistics, which hinders the model's grounding in visual input. To mitigate this issue, we propose Bootstrapped Preference Optimization (BPO), which conducts preference learning with datasets containing negative responses bootstrapped from the model itself. Specifically, we propose the following two strategies: 1) feeding distorted image inputs to the MLLM to elicit responses that expose its pretraining bias; 2) leveraging a text-based LLM to explicitly inject erroneous but common elements into the original response. Those undesirable responses are paired with the original annotated responses from the datasets to construct the preference dataset, which is subsequently used for preference learning. Our approach effectively suppresses pretrained LLM bias, enabling enhanced grounding in visual inputs. Extensive experimentation demonstrates significant performance improvements across multiple benchmarks, advancing the state-of-the-art in multimodal conversational systems.
+
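A rough sketch of how the two bootstrapping strategies above could produce preference pairs for DPO-style training; mllm, llm_corrupt, and distort are hypothetical callables, not the paper's interfaces.

def build_preference_pairs(dataset, mllm, llm_corrupt, distort):
    pairs = []
    for image, question, gold_answer in dataset:
        # strategy 1: an answer from a distorted image exposes the pretraining (language) bias
        neg_visual = mllm.answer(distort(image), question)
        # strategy 2: an LLM injects plausible-but-wrong elements into the gold answer
        neg_textual = llm_corrupt(gold_answer)
        pairs.append({"prompt": (image, question), "chosen": gold_answer, "rejected": neg_visual})
        pairs.append({"prompt": (image, question), "chosen": gold_answer, "rejected": neg_textual})
    return pairs  # fed to a DPO-style preference-learning objective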
+
+
+
+
+ + ♻ ☆ Isometric Multi-Shape Matching + + +
+ Finding correspondences between shapes is a fundamental problem in computer +vision and graphics, which is relevant for many applications, including 3D +reconstruction, object tracking, and style transfer. The vast majority of +correspondence methods aim to find a solution between pairs of shapes, even if +multiple instances of the same class are available. While isometries are often +studied in shape correspondence problems, they have not been considered +explicitly in the multi-matching setting. This paper closes this gap by +proposing a novel optimisation formulation for isometric multi-shape matching. +We present a suitable optimisation algorithm for solving our formulation and +provide a convergence and complexity analysis. Our algorithm obtains +multi-matchings that are by construction provably cycle-consistent. We +demonstrate the superior performance of our method on various datasets and set +the new state-of-the-art in isometric multi-shape matching. + +
+
+
+
+
+ + ♻ ☆ Semi-supervised Active Learning for Video Action Detection + + +
+ In this work, we focus on label-efficient learning for video action detection. We develop a novel semi-supervised active learning approach which utilizes both labeled and unlabeled data along with informative sample selection for action detection. Video action detection requires spatio-temporal localization along with classification, which poses several challenges for both active learning informative sample selection and semi-supervised learning pseudo-label generation. First, we propose NoiseAug, a simple augmentation strategy which effectively selects informative samples for video action detection. Next, we propose fft-attention, a novel technique based on high-pass filtering which enables effective utilization of pseudo labels for SSL in video action detection by emphasizing the relevant activity region within a video. We evaluate the proposed approach on three different benchmark datasets, UCF101-24, JHMDB-21, and Youtube-VOS. First, we demonstrate its effectiveness on video action detection, where the proposed approach outperforms prior works in semi-supervised and weakly-supervised learning, along with several baseline approaches, on both UCF101-24 and JHMDB-21. Next, we also show its effectiveness on Youtube-VOS for video object segmentation, demonstrating its generalization capability to other dense prediction tasks in videos. The code and models are publicly available at https://github.com/AKASH2907/semi-sup-active-learning.
+
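A minimal sketch of frequency-domain high-pass filtering, the operation underlying the fft-attention idea mentioned above: low frequencies are masked out so the weighting emphasizes fast-changing (activity) regions. The cutoff and the sigmoid weighting are assumptions, not the paper's exact design.

import torch

def high_pass(feat, cutoff=0.1):
    """feat: (B, C, H, W) feature map; returns its high-frequency component."""
    B, C, H, W = feat.shape
    spec = torch.fft.fftshift(torch.fft.fft2(feat), dim=(-2, -1))
    yy, xx = torch.meshgrid(torch.linspace(-0.5, 0.5, H, device=feat.device),
                            torch.linspace(-0.5, 0.5, W, device=feat.device), indexing="ij")
    mask = ((yy ** 2 + xx ** 2).sqrt() > cutoff).to(feat.dtype)   # keep frequencies above the cutoff
    spec = spec * mask
    return torch.fft.ifft2(torch.fft.ifftshift(spec, dim=(-2, -1))).real

attn = torch.sigmoid(high_pass(torch.randn(2, 16, 32, 32)))       # attention-like weighting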
+
+ comment: AAAI Conference on Artificial Intelligence, Main Technical Track + (AAAI), 2024, Code: https://github.com/AKASH2907/semi-sup-active-learning +
+
+
+
+
+ + ♻ ☆ Text-Driven Image Editing via Learnable Regions CVPR 2024 + + +
+ Language has emerged as a natural interface for image editing. In this paper, +we introduce a method for region-based image editing driven by textual prompts, +without the need for user-provided masks or sketches. Specifically, our +approach leverages an existing pre-trained text-to-image model and introduces a +bounding box generator to identify the editing regions that are aligned with +the textual prompts. We show that this simple approach enables flexible editing +that is compatible with current image generation models, and is able to handle +complex prompts featuring multiple objects, complex sentences, or lengthy +paragraphs. We conduct an extensive user study to compare our method against +state-of-the-art methods. The experiments demonstrate the competitive +performance of our method in manipulating images with high fidelity and realism +that correspond to the provided language descriptions. Our project webpage can +be found at: https://yuanze-lin.me/LearnableRegions_page. + +
+
+ comment: Accepted to CVPR 2024 Project webpage: + https://yuanze-lin.me/LearnableRegions_page +
+
+
+
+
+ + ♻ ☆ SIGMA: Scale-Invariant Global Sparse Shape Matching + + +
+ We propose a novel mixed-integer programming (MIP) formulation for generating +precise sparse correspondences for highly non-rigid shapes. To this end, we +introduce a projected Laplace-Beltrami operator (PLBO) which combines intrinsic +and extrinsic geometric information to measure the deformation quality induced +by predicted correspondences. We integrate the PLBO, together with an +orientation-aware regulariser, into a novel MIP formulation that can be solved +to global optimality for many practical problems. In contrast to previous +methods, our approach is provably invariant to rigid transformations and global +scaling, initialisation-free, has optimality guarantees, and scales to high +resolution meshes with (empirically observed) linear time. We show +state-of-the-art results for sparse non-rigid matching on several challenging +3D datasets, including data with inconsistent meshing, as well as applications +in mesh-to-point-cloud matching. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ♻ ☆ Conquering the Communication Constraints to Enable Large Pre-Trained + Models in Federated Learning + + +
+ Federated learning (FL) has emerged as a promising paradigm for enabling the collaborative training of models without centralized access to the raw data on local devices. In the typical FL paradigm (e.g., FedAvg), model weights are sent to and from the server each round to participating clients. Recently, the use of small pre-trained models has been shown to be effective in federated learning optimization and in improving convergence. However, recent state-of-the-art pre-trained models are getting more capable but also have more parameters. In conventional FL, sharing the enormous model weights can quickly put a massive communication burden on the system, especially if more capable models are employed. Can we find a solution to enable those strong and readily-available pre-trained models in FL to achieve excellent performance while simultaneously reducing the communication burden? To this end, we investigate the use of parameter-efficient fine-tuning in federated learning and thus introduce a new framework: FedPEFT. Specifically, we systematically evaluate the performance of FedPEFT across a variety of client stability, data distribution, and differential privacy settings. By only locally tuning and globally sharing a small portion of the model weights, significant reductions in the total communication overhead can be achieved while maintaining competitive or even better performance in a wide range of federated learning scenarios, providing insight into a new paradigm for practical and effective federated systems.
+
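A sketch of the communication-saving recipe described above, under the assumption that clients freeze the pre-trained backbone and only train and transmit a small adapter head; the function names are illustrative, not the FedPEFT code.

import copy
import torch
import torch.nn as nn

def client_update(global_adapter_state, backbone, adapter, data_loader, lr=1e-3, epochs=1):
    adapter.load_state_dict(global_adapter_state)          # receive only the small module
    opt = torch.optim.SGD(adapter.parameters(), lr=lr)      # the backbone stays frozen
    for _ in range(epochs):
        for x, y in data_loader:
            with torch.no_grad():
                feats = backbone(x)                          # frozen feature extraction
            loss = nn.functional.cross_entropy(adapter(feats), y)
            opt.zero_grad(); loss.backward(); opt.step()
    return copy.deepcopy(adapter.state_dict())               # send back only adapter weights

def server_aggregate(adapter_states):
    avg = copy.deepcopy(adapter_states[0])
    for k in avg:
        avg[k] = torch.stack([s[k].float() for s in adapter_states]).mean(0)
    return avg                                               # FedAvg over the tiny adapter only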
+
+
+
+
+ + ♻ ☆ Towards Seamless Adaptation of Pre-trained Models for Visual Place + Recognition ICLR2024 + + +
+ Recent studies show that vision models pre-trained on generic visual learning tasks with large-scale data can provide useful feature representations for a wide range of visual perception problems. However, few attempts have been made to exploit pre-trained foundation models in visual place recognition (VPR). Due to the inherent difference in training objectives and data between the tasks of model pre-training and VPR, how to bridge the gap and fully unleash the capability of pre-trained models for VPR is still a key issue to address. To this end, we propose a novel method to realize seamless adaptation of pre-trained models for VPR. Specifically, to obtain both global and local features that focus on salient landmarks for discriminating places, we design a hybrid adaptation method to achieve both global and local adaptation efficiently, in which only lightweight adapters are tuned without adjusting the pre-trained model. Besides, to guide effective adaptation, we propose a mutual nearest neighbor local feature loss, which ensures proper dense local features are produced for local matching and avoids time-consuming spatial verification in re-ranking. Experimental results show that our method outperforms the state-of-the-art methods with less training data and training time, and uses only about 3% of the retrieval runtime of two-stage VPR methods with RANSAC-based spatial verification. It ranks 1st on the MSLS challenge leaderboard (at the time of submission). The code is released at https://github.com/Lu-Feng/SelaVPR.
+
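A small sketch of mutual nearest-neighbour matching between two sets of dense local descriptors, the pairing rule behind the loss mentioned above; the loss itself and re-ranking details are omitted, and the tensor shapes are assumed.

import torch
import torch.nn.functional as F

def mutual_nn_matches(desc_a, desc_b):
    """desc_a: (N, D), desc_b: (M, D) L2-normalized local descriptors; returns (K, 2) index pairs."""
    sim = desc_a @ desc_b.t()                 # cosine similarity matrix
    nn_ab = sim.argmax(dim=1)                 # best b for each a
    nn_ba = sim.argmax(dim=0)                 # best a for each b
    idx_a = torch.arange(desc_a.size(0))
    keep = nn_ba[nn_ab] == idx_a              # keep pairs that agree in both directions
    return torch.stack([idx_a[keep], nn_ab[keep]], dim=1)

matches = mutual_nn_matches(F.normalize(torch.randn(500, 128), dim=1),
                            F.normalize(torch.randn(600, 128), dim=1))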
+
+ comment: ICLR2024 +
+
+
+
+
+ + ♻ ☆ NEAT: Distilling 3D Wireframes from Neural Attraction Fields CVPR 2024 + + +
+ This paper studies the problem of structured 3D reconstruction using wireframes that consist of line segments and junctions, focusing on the computation of structured boundary geometries of scenes. Instead of leveraging matching-based solutions from 2D wireframes (or line segments) for 3D wireframe reconstruction as done in prior art, we present NEAT, a rendering-distilling formulation using neural fields to represent 3D line segments with 2D observations, and bipartite matching for perceiving and distilling a sparse set of 3D global junctions. The proposed NEAT enjoys the joint optimization of the neural fields and the global junctions from scratch, using view-dependent 2D observations without precomputed cross-view feature matching. Comprehensive experiments on the DTU and BlendedMVS datasets demonstrate our NEAT's superiority over state-of-the-art alternatives for 3D wireframe reconstruction. Moreover, the distilled 3D global junctions by NEAT are a better initialization than SfM points for the recently-emerged 3D Gaussian Splatting for high-fidelity novel view synthesis, using about 20 times fewer initial 3D points. Project page: https://xuenan.net/neat.
+
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ RadEdit: stress-testing biomedical vision models via diffusion image + editing + + +
+ Biomedical imaging datasets are often small and biased, meaning that +real-world performance of predictive models can be substantially lower than +expected from internal testing. This work proposes using generative image +editing to simulate dataset shifts and diagnose failure modes of biomedical +vision models; this can be used in advance of deployment to assess readiness, +potentially reducing cost and patient harm. Existing editing methods can +produce undesirable changes, with spurious correlations learned due to the +co-occurrence of disease and treatment interventions, limiting practical +applicability. To address this, we train a text-to-image diffusion model on +multiple chest X-ray datasets and introduce a new editing method RadEdit that +uses multiple masks, if present, to constrain changes and ensure consistency in +the edited images. We consider three types of dataset shifts: acquisition +shift, manifestation shift, and population shift, and demonstrate that our +approach can diagnose failures and quantify model robustness without additional +data collection, complementing more qualitative tools for explainable AI. + +
+
+
+
+
+ + ♻ ☆ DriftRec: Adapting diffusion models to blind JPEG restoration + + +
+ In this work, we utilize the high-fidelity generation abilities of diffusion +models to solve blind JPEG restoration at high compression levels. We propose +an elegant modification of the forward stochastic differential equation of +diffusion models to adapt them to this restoration task and name our method +DriftRec. Comparing DriftRec against an $L_2$ regression baseline with the same +network architecture and state-of-the-art techniques for JPEG restoration, we +show that our approach can escape the tendency of other methods to generate +blurry images, and recovers the distribution of clean images significantly more +faithfully. For this, only a dataset of clean/corrupted image pairs and no +knowledge about the corruption operation is required, enabling wider +applicability to other restoration tasks. In contrast to other conditional and +unconditional diffusion models, we utilize the idea that the distributions of +clean and corrupted images are much closer to each other than each is to the +usual Gaussian prior of the reverse process in diffusion models. Our approach +therefore requires only low levels of added noise and needs comparatively few +sampling steps even without further optimizations. We show that DriftRec +naturally generalizes to realistic and difficult scenarios such as unaligned +double JPEG compression and blind restoration of JPEGs found online, without +having encountered such examples during training. + +
+
+ comment: (C) 2024 IEEE. Personal use of this material is permitted. Permission + from IEEE must be obtained for all other uses, in any current or future + media, including reprinting/republishing this material for advertising or + promotional purposes, creating new collective works, for resale or + redistribution to servers or lists, or reuse of any copyrighted component of + this work in other works +
+
+
+
+
+ + ♻ ☆ Repurposing Diffusion-Based Image Generators for Monocular Depth + Estimation CVPR 2024 + + +
+ Monocular depth estimation is a fundamental computer vision task. Recovering +3D depth from a single image is geometrically ill-posed and requires scene +understanding, so it is not surprising that the rise of deep learning has led +to a breakthrough. The impressive progress of monocular depth estimators has +mirrored the growth in model capacity, from relatively modest CNNs to large +Transformer architectures. Still, monocular depth estimators tend to struggle +when presented with images with unfamiliar content and layout, since their +knowledge of the visual world is restricted by the data seen during training, +and challenged by zero-shot generalization to new domains. This motivates us to +explore whether the extensive priors captured in recent generative diffusion +models can enable better, more generalizable depth estimation. We introduce +Marigold, a method for affine-invariant monocular depth estimation that is +derived from Stable Diffusion and retains its rich prior knowledge. The +estimator can be fine-tuned in a couple of days on a single GPU using only +synthetic training data. It delivers state-of-the-art performance across a wide +range of datasets, including over 20% performance gains in specific cases. +Project page: https://marigoldmonodepth.github.io. + +
+
+ comment: CVPR 2024 camera ready +
+
+
+
+
+ + ♻ ☆ Learnable Weight Initialization for Volumetric Medical Image + Segmentation + + +
+ Hybrid volumetric medical image segmentation models, combining the advantages +of local convolution and global attention, have recently received considerable +attention. While mainly focusing on architectural modifications, most existing +hybrid approaches still use conventional data-independent weight initialization +schemes which restrict their performance due to ignoring the inherent +volumetric nature of the medical data. To address this issue, we propose a +learnable weight initialization approach that utilizes the available medical +training data to effectively learn the contextual and structural cues via the +proposed self-supervised objectives. Our approach is easy to integrate into any +hybrid model and requires no external training data. Experiments on multi-organ +and lung cancer segmentation tasks demonstrate the effectiveness of our +approach, leading to state-of-the-art segmentation performance. Our proposed +data-dependent initialization approach performs favorably as compared to the +Swin-UNETR model pretrained using large-scale datasets on multi-organ +segmentation task. Our source code and models are available at: +https://github.com/ShahinaKK/LWI-VMS. + +
+
+ comment: Accepted at Elsevier AI in Medicine Journal +
+
+
+
+
+ + ♻ ☆ Evaluating GPT-4 with Vision on Detection of Radiological Findings on + Chest Radiographs + + +
+ The study examines the application of GPT-4V, a multi-modal large language +model equipped with visual recognition, in detecting radiological findings from +a set of 100 chest radiographs and suggests that GPT-4V is currently not ready +for real-world diagnostic usage in interpreting chest radiographs. + +
+
+
+
+
+ + ♻ ☆ ResNet with Integrated Convolutional Block Attention Module for Ship + Classification Using Transfer Learning on Optical Satellite Imagery + + +
+ This study proposes a novel transfer learning framework for effective ship classification using high-resolution optical remote sensing satellite imagery. The framework is based on the deep convolutional neural network model ResNet50 and incorporates the Convolutional Block Attention Module (CBAM) to enhance performance. CBAM enables the model to attend to salient features in the images, allowing it to better discriminate subtle differences between ships and backgrounds. Furthermore, this study adopts a transfer learning approach tailored for accurately classifying diverse types of ships by fine-tuning a pre-trained model for the specific task. Experimental results demonstrate the efficacy of the proposed framework in ship classification using optical remote sensing imagery, achieving a high classification accuracy of 94% across 5 classes, outperforming existing methods. This research holds potential applications in maritime surveillance and management, illegal fishing detection, and maritime traffic monitoring.
+
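For concreteness, a compact sketch of a CBAM block in its commonly published form (channel attention followed by spatial attention); the reduction ratio and kernel size are the usual defaults and may differ from this paper's configuration.

import torch
import torch.nn as nn

class CBAM(nn.Module):
    def __init__(self, channels, reduction=16, spatial_kernel=7):
        super().__init__()
        self.mlp = nn.Sequential(nn.Linear(channels, channels // reduction), nn.ReLU(),
                                 nn.Linear(channels // reduction, channels))
        self.spatial = nn.Conv2d(2, 1, spatial_kernel, padding=spatial_kernel // 2)

    def forward(self, x):
        b, c, _, _ = x.shape
        avg = self.mlp(x.mean(dim=(2, 3)))                         # channel attention from avg pool
        mx = self.mlp(x.amax(dim=(2, 3)))                          # and from max pool
        x = x * torch.sigmoid(avg + mx).view(b, c, 1, 1)
        s = torch.cat([x.mean(1, keepdim=True), x.amax(1, keepdim=True)], dim=1)
        return x * torch.sigmoid(self.spatial(s))                  # spatial attention

y = CBAM(64)(torch.randn(2, 64, 32, 32))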
+
+
+
+
+ + ♻ ☆ ReCoRe: Regularized Contrastive Representation Learning of World Model CVPR 2024 + + +
+ While recent model-free Reinforcement Learning (RL) methods have demonstrated human-level effectiveness in gaming environments, their success in everyday tasks like visual navigation has been limited, particularly under significant appearance variations. This limitation arises from (i) poor sample efficiency and (ii) over-fitting to training scenarios. To address these challenges, we present a world model that learns invariant features using (i) contrastive unsupervised learning and (ii) an intervention-invariant regularizer. Learning an explicit representation of the world dynamics, i.e., a world model, improves sample efficiency, while contrastive learning implicitly enforces learning of invariant features, which improves generalization. However, the naïve integration of contrastive loss into world models is not good enough, as world-model-based RL methods independently optimize representation learning and agent policy. To overcome this issue, we propose an intervention-invariant regularizer in the form of an auxiliary task such as depth prediction, image denoising, image segmentation, etc., that explicitly enforces invariance to style interventions. Our method outperforms current state-of-the-art model-based and model-free RL methods and significantly improves on out-of-distribution point navigation tasks evaluated on the iGibson benchmark. With only visual observations, we further demonstrate that our approach outperforms recent language-guided foundation models for point navigation, which is essential for deployment on robots with limited computation capabilities. Finally, we demonstrate that our proposed model excels at the sim-to-real transfer of its perception module on the Gibson benchmark.
+
+
+ comment: Accepted at CVPR 2024. arXiv admin note: text overlap with + arXiv:2209.14932 +
+
+
+
+
+ + ♻ ☆ AGFSync: Leveraging AI-Generated Feedback for Preference Optimization in + Text-to-Image Generation + + +
+ Text-to-Image (T2I) diffusion models have achieved remarkable success in image generation. Despite this progress, challenges remain in prompt-following ability, image quality, and the lack of high-quality datasets, which are essential for refining these models. As acquiring labeled data is costly, we introduce AGFSync, a framework that enhances T2I diffusion models through Direct Preference Optimization (DPO) in a fully AI-driven approach. AGFSync utilizes Vision-Language Models (VLM) to assess image quality across style, coherence, and aesthetics, generating feedback data within an AI-driven loop. By applying AGFSync to leading T2I models such as SD v1.4, v1.5, and SDXL, our extensive experiments on the TIFA dataset demonstrate notable improvements in VQA scores, aesthetic evaluations, and performance on the HPSv2 benchmark, consistently outperforming the base models. AGFSync's method of refining T2I diffusion models paves the way for scalable alignment techniques.
+
+
+
+
+
+ + ♻ ☆ FreeMan: Towards Benchmarking 3D Human Pose Estimation under Real-World + Conditions CVPR2024 + + +
+ Estimating the 3D structure of the human body from natural scenes is a fundamental aspect of visual perception. 3D human pose estimation is a vital step in advancing fields like AIGC and human-robot interaction, serving as a crucial technique for understanding and interacting with human actions in real-world settings. However, the current datasets, often collected under single laboratory conditions using complex motion capture equipment and unvarying backgrounds, are insufficient. The absence of datasets covering variable conditions is stalling the progress of this crucial task. To facilitate the development of 3D pose estimation, we present FreeMan, the first large-scale, multi-view dataset collected under real-world conditions. FreeMan was captured by synchronizing 8 smartphones across diverse scenarios. It comprises 11M frames from 8000 sequences, viewed from different perspectives. These sequences cover 40 subjects across 10 different scenarios, each with varying lighting conditions. We have also established a semi-automated pipeline containing error detection to reduce the workload of manual checking and ensure precise annotation. We provide comprehensive evaluation baselines for a range of tasks, underlining the significant challenges posed by FreeMan. Further evaluations of standard indoor/outdoor human sensing datasets reveal that FreeMan offers robust representation transferability in real and complex scenes. Code and data are available at https://wangjiongw.github.io/freeman.
+
+
+ comment: CVPR2024 camera ready version. 19 pages, 16 figures. Project page: + https://wangjiongw.github.io/freeman/ ; API: + https://github.com/wangjiongw/FreeMan_API +
+
+
+
+
+ + ♻ ☆ eWand: A calibration framework for wide baseline frame-based and + event-based camera systems ICRA 2024 + + +
+ Accurate calibration is crucial for using multiple cameras to triangulate the +position of objects precisely. However, it is also a time-consuming process +that needs to be repeated for every displacement of the cameras. The standard +approach is to use a printed pattern with known geometry to estimate the +intrinsic and extrinsic parameters of the cameras. The same idea can be applied +to event-based cameras, though it requires extra work. By using frame +reconstruction from events, a printed pattern can be detected. A blinking +pattern can also be displayed on a screen. Then, the pattern can be directly +detected from the events. Such calibration methods can provide accurate +intrinsic calibration for both frame- and event-based cameras. However, using +2D patterns has several limitations for multi-camera extrinsic calibration, +with cameras possessing highly different points of view and a wide baseline. +The 2D pattern can only be detected from one direction and needs to be of +significant size to compensate for its distance to the camera. This makes the +extrinsic calibration time-consuming and cumbersome. To overcome these +limitations, we propose eWand, a new method that uses blinking LEDs inside +opaque spheres instead of a printed or displayed pattern. Our method provides a +faster, easier-to-use extrinsic calibration approach that maintains high +accuracy for both event- and frame-based cameras. + +
+
+ comment: Accepted for 2024 IEEE International Conference on Robotics and + Automation (ICRA 2024). Project web page: + https://cogsys-tuebingen.github.io/ewand/ +
+
+
+
+
+ + ♻ ☆ Hallucination Benchmark in Medical Visual Question Answering ICLR 2024 + + +
+ The recent success of large language and vision models (LLVMs) on vision +question answering (VQA), particularly their applications in medicine +(Med-VQA), has shown a great potential of realizing effective visual assistants +for healthcare. However, these models are not extensively tested on the +hallucination phenomenon in clinical settings. Here, we created a hallucination +benchmark of medical images paired with question-answer sets and conducted a +comprehensive evaluation of the state-of-the-art models. The study provides an +in-depth analysis of current models' limitations and reveals the effectiveness +of various prompting strategies. + +
+
+ comment: Accepted to ICLR 2024 Tiny Papers(Notable) +
+
+
+
+
+ + ♻ ☆ LLaFS: When Large Language Models Meet Few-Shot Segmentation CVPR2024 + + +
+ This paper proposes LLaFS, the first attempt to leverage large language +models (LLMs) in few-shot segmentation. In contrast to the conventional +few-shot segmentation methods that only rely on the limited and biased +information from the annotated support images, LLaFS leverages the vast prior +knowledge gained by LLM as an effective supplement and directly uses the LLM to +segment images in a few-shot manner. To enable the text-based LLM to handle +image-related tasks, we carefully design an input instruction that allows the +LLM to produce segmentation results represented as polygons, and propose a +region-attribute table to simulate the human visual mechanism and provide +multi-modal guidance. We also synthesize pseudo samples and use curriculum +learning for pretraining to augment data and achieve better optimization. LLaFS +achieves state-of-the-art results on multiple datasets, showing the potential +of using LLMs for few-shot computer vision tasks. + +
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ♻ ☆ DETRs Beat YOLOs on Real-time Object Detection + + +
+ The YOLO series has become the most popular framework for real-time object +detection due to its reasonable trade-off between speed and accuracy. However, +we observe that the speed and accuracy of YOLOs are negatively affected by the +NMS. Recently, end-to-end Transformer-based detectors (DETRs) have provided an +alternative to eliminating NMS. Nevertheless, the high computational cost +limits their practicality and hinders them from fully exploiting the advantage +of excluding NMS. In this paper, we propose the Real-Time DEtection TRansformer +(RT-DETR), the first real-time end-to-end object detector to our best knowledge +that addresses the above dilemma. We build RT-DETR in two steps, drawing on the +advanced DETR: first we focus on maintaining accuracy while improving speed, +followed by maintaining speed while improving accuracy. Specifically, we design +an efficient hybrid encoder to expeditiously process multi-scale features by +decoupling intra-scale interaction and cross-scale fusion to improve speed. +Then, we propose the uncertainty-minimal query selection to provide +high-quality initial queries to the decoder, thereby improving accuracy. In +addition, RT-DETR supports flexible speed tuning by adjusting the number of +decoder layers to adapt to various scenarios without retraining. Our +RT-DETR-R50 / R101 achieves 53.1% / 54.3% AP on COCO and 108 / 74 FPS on T4 +GPU, outperforming previously advanced YOLOs in both speed and accuracy. We +also develop scaled RT-DETRs that outperform the lighter YOLO detectors (S and +M models). Furthermore, RT-DETR-R50 outperforms DINO-R50 by 2.2% AP in accuracy +and about 21 times in FPS. After pre-training with Objects365, RT-DETR-R50 / +R101 achieves 55.3% / 56.2% AP. The project page: +https://zhao-yian.github.io/RTDETR. + +
+
+
+
+
+ + ♻ ☆ Creating Ensembles of Classifiers through UMDA for Aerial Scene + Classification + + +
+ Aerial scene classification, which aims to semantically label remote sensing images in a set of predefined classes (e.g., agricultural, beach, and harbor), is a very challenging task in remote sensing due to high intra-class variability and the different scales and orientations of the objects present in the dataset images. In the remote sensing area, the use of CNN architectures as an alternative solution is also a reality for scene classification tasks. Generally, these CNNs are used to perform the traditional image classification task. However, another, less used, way to classify remote sensing images is to use deep metric learning (DML) approaches. In this sense, this work proposes to employ six DML approaches for aerial scene classification tasks, analysing their behaviour with four different pre-trained CNNs as well as combining them through the use of an evolutionary computation algorithm (UMDA). In the performed experiments, it is possible to observe that DML approaches can achieve better classification results than traditional pre-trained CNNs on three well-known remote sensing aerial scene datasets. In addition, the UMDA algorithm proved to be a promising strategy for combining DML approaches when there is diversity among them, improving classification accuracy by at least 5.6% while using almost 50% of the available classifiers to construct the final ensemble.
+
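A hedged sketch of how UMDA (Univariate Marginal Distribution Algorithm) could select a binary subset of classifiers for the final ensemble, as the abstract describes at a high level; the majority-vote fitness function and all hyper-parameters are assumptions.

import numpy as np

def umda_select(preds, labels, pop=40, elite=10, gens=30, seed=0):
    """preds: (n_classifiers, n_samples) hard predictions; returns a 0/1 selection mask."""
    rng = np.random.default_rng(seed)
    n = preds.shape[0]
    p = np.full(n, 0.5)                                   # independent Bernoulli marginals

    def fitness(mask):
        if mask.sum() == 0:
            return 0.0
        votes = preds[mask.astype(bool)]
        maj = np.array([np.bincount(col).argmax() for col in votes.T])
        return (maj == labels).mean()                     # majority-vote accuracy on validation data

    for _ in range(gens):
        population = (rng.random((pop, n)) < p).astype(int)
        scores = np.array([fitness(ind) for ind in population])
        best = population[np.argsort(scores)[-elite:]]    # keep the elite individuals
        p = best.mean(0).clip(0.05, 0.95)                 # re-estimate marginals from the elites
    return (p > 0.5).astype(int)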
+
+ comment: 9 pages, 4 figures, accepted for presentation at the GECCO2024 +
+
+
+
+
+ + ♻ ☆ LYT-Net: Lightweight YUV Transformer-based Network for Low-Light Image + Enhancement + + +
+ In recent years, deep learning-based solutions have proven successful in the +domains of image enhancement. This paper introduces LYT-Net, or Lightweight YUV +Transformer-based Network, as a novel approach for low-light image enhancement. +The proposed architecture, distinct from conventional Retinex-based models, +leverages the YUV color space's natural separation of luminance (Y) and +chrominance (U and V) to simplify the intricate task of disentangling light and +color information in images. By utilizing the strengths of transformers, known +for their capability to capture long-range dependencies, LYT-Net ensures a +comprehensive contextual understanding of the image while maintaining reduced +model complexity. By employing a novel hybrid loss function, our proposed +method achieves state-of-the-art results on low-light image enhancement +datasets, all while being considerably more compact than its counterparts. The +source code and pre-trained models are available at +https://github.com/albrateanu/LYT-Net + +
+
+ comment: 10 pages, 6 figures, submitted to ICIP +
+
+
+
+
+ + ♻ ☆ Mind The Edge: Refining Depth Edges in Sparsely-Supervised Monocular + Depth Estimation CVPR24 + + +
+ Monocular Depth Estimation (MDE) is a fundamental problem in computer vision with numerous applications. Recently, LIDAR-supervised methods have achieved remarkable per-pixel depth accuracy in outdoor scenes. However, significant errors are typically found in the proximity of depth discontinuities, i.e., depth edges, which often hinder the performance of depth-dependent applications that are sensitive to such inaccuracies, e.g., novel view synthesis and augmented reality. Since direct supervision for the location of depth edges is typically unavailable in sparse LIDAR-based scenes, encouraging the MDE model to produce correct depth edges is not straightforward. To the best of our knowledge, this paper is the first attempt to address the depth edges issue for LIDAR-supervised scenes. In this work we propose to learn to detect the location of depth edges from densely-supervised synthetic data, and use it to generate supervision for the depth edges in the MDE training. To quantitatively evaluate our approach, and due to the lack of depth edge ground truth in LIDAR-based scenes, we manually annotated subsets of the KITTI and the DDAD datasets with depth edge ground truth. We demonstrate significant gains in the accuracy of the depth edges with comparable per-pixel depth accuracy on several challenging datasets. Code and datasets are available at https://github.com/liortalker/MindTheEdge.
+
+
+ comment: Appears in CVPR'24
+
+
+
+
+ + ♻ ☆ CV-Attention UNet: Attention-based UNet for 3D Cerebrovascular + Segmentation of Enhanced TOF-MRA Images + + +
+ Due to the lack of automated methods, time-of-flight magnetic resonance angiography (TOF-MRA) is assessed visually to diagnose cerebrovascular disease, making the process time-consuming. The commonly used encoder-decoder architectures for cerebrovascular segmentation utilize redundant features, eventually leading to the extraction of low-level features multiple times. Additionally, convolutional neural networks (CNNs) suffer from performance degradation when the batch size is small, and deeper networks experience the vanishing gradient problem. Methods: In this paper, we attempt to solve these limitations and propose the 3D cerebrovascular attention UNet method, named CV-AttentionUNet, for precise extraction of brain vessel images. We propose a sequence of preprocessing techniques followed by a deeply supervised UNet to improve the accuracy of segmentation of the brain vessels leading to a stroke. To combine the low and high semantics, we apply the attention mechanism. This mechanism focuses on relevant associations and neglects irrelevant anatomical information. Furthermore, the inclusion of deep supervision incorporates different levels of features that prove to be beneficial for network convergence. Results: We demonstrate the efficiency of the proposed method by cross-validating with an unlabeled dataset, which was further labeled by us. We believe that the novelty of this algorithm lies in its ability to perform well on both labeled and unlabeled data with image processing-based enhancement. The results indicate that our method performed better than the existing state-of-the-art methods on the TubeTK dataset. Conclusion: The proposed method will help in the accurate segmentation of cerebrovascular structures related to stroke.
+
+
+
+
+
+ + ♻ ☆ From Isolated Islands to Pangea: Unifying Semantic Space for Human + Action Understanding CVPR 2024 + + +
+ Action understanding has attracted long-term attention. It can be formed as +the mapping from the physical space to the semantic space. Typically, +researchers built datasets according to idiosyncratic choices to define classes +and push the envelope of benchmarks respectively. Datasets are incompatible +with each other like "Isolated Islands" due to semantic gaps and various class +granularities, e.g., do housework in dataset A and wash plate in dataset B. We +argue that we need a more principled semantic space to concentrate the +community efforts and use all datasets together to pursue generalizable action +learning. To this end, we design a structured action semantic space given verb +taxonomy hierarchy and covering massive actions. By aligning the classes of +previous datasets to our semantic space, we gather (image/video/skeleton/MoCap) +datasets into a unified database in a unified label system, i.e., bridging +"isolated islands" into a "Pangea". Accordingly, we propose a novel model +mapping from the physical space to semantic space to fully use Pangea. In +extensive experiments, our new system shows significant superiority, especially +in transfer learning. Our code and data will be made public at +https://mvig-rhos.com/pangea. + +
+
+ comment: CVPR 2024, Project Webpage: https://mvig-rhos.com/pangea +
+
+
+
+
+ + ♻ ☆ RDumb: A simple approach that questions our progress in continual + test-time adaptation + + +
+ Test-Time Adaptation (TTA) allows updating pre-trained models to changing data distributions at deployment time. While early work tested these algorithms for individual fixed distribution shifts, recent work proposed and applied methods for continual adaptation over long timescales. To examine the reported progress in the field, we propose the Continually Changing Corruptions (CCC) benchmark to measure the asymptotic performance of TTA techniques. We find that eventually all but one of the state-of-the-art methods collapse and perform worse than a non-adapting model, including models specifically proposed to be robust to performance collapse. In addition, we introduce a simple baseline, "RDumb", that periodically resets the model to its pretrained state. RDumb performs better than or on par with the previously proposed state-of-the-art in all considered benchmarks. Our results show that previous TTA approaches are neither effective at regularizing adaptation to avoid collapse nor able to outperform a simplistic resetting strategy.
+
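The resetting baseline is simple enough to sketch directly: run any test-time-adaptation update on the incoming stream, but restore the pretrained weights every fixed number of batches. Here tta_update is a stand-in for whichever adaptation rule is used (e.g. entropy minimization), and the reset interval is an assumption.

import copy

def rdumb_style_loop(model, stream, tta_update, reset_every=1000):
    pristine = copy.deepcopy(model.state_dict())     # snapshot of the pretrained model
    for step, batch in enumerate(stream):
        if step > 0 and step % reset_every == 0:
            model.load_state_dict(pristine)          # periodic reset guards against collapse
        preds = tta_update(model, batch)             # adapt on the current batch and predict
        yield preds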
+
+
+
+
+ + ♻ ☆ Task-conditioned adaptation of visual features in multi-task policy + learning + + +
+ Successfully addressing a wide variety of tasks is a core ability of +autonomous agents, requiring flexibly adapting the underlying decision-making +strategies and, as we argue in this work, also adapting the perception modules. +An analogical argument would be the human visual system, which uses top-down +signals to focus attention determined by the current task. Similarly, we adapt +pre-trained large vision models conditioned on specific downstream tasks in the +context of multi-task policy learning. We introduce task-conditioned adapters +that do not require finetuning any pre-trained weights, combined with a single +policy trained with behavior cloning and capable of addressing multiple tasks. +We condition the visual adapters on task embeddings, which can be selected at +inference if the task is known, or alternatively inferred from a set of example +demonstrations. To this end, we propose a new optimization-based estimator. We +evaluate the method on a wide variety of tasks from the CortexBench benchmark +and show that, compared to existing work, it can be addressed with a single +policy. In particular, we demonstrate that adapting visual features is a key +design choice and that the method generalizes to unseen tasks given a few +demonstrations. + +
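A sketch of one way task-conditioned adapters can modulate frozen visual features, here as FiLM-style scale and shift parameters predicted from a task embedding; the actual adapter design used in the paper may differ.

import torch
import torch.nn as nn

class TaskConditionedAdapter(nn.Module):
    def __init__(self, feat_dim, task_dim):
        super().__init__()
        self.to_scale_shift = nn.Linear(task_dim, 2 * feat_dim)

    def forward(self, feats, task_emb):
        scale, shift = self.to_scale_shift(task_emb).chunk(2, dim=-1)
        return feats * (1 + scale) + shift            # frozen backbone output, modulated per task

adapter = TaskConditionedAdapter(feat_dim=768, task_dim=32)
out = adapter(torch.randn(4, 768), torch.randn(4, 32))   # task_emb can be selected or inferred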
+
+
+
+
+ + ♻ ☆ Robustness Assessment of a Runway Object Classifier for Safe Aircraft + Taxiing + + +
+ As deep neural networks (DNNs) are becoming the prominent solution for many computational problems, the aviation industry seeks to explore their potential in alleviating pilot workload and in improving operational safety. However, the use of DNNs in this type of safety-critical application requires a thorough certification process. This need can be addressed through formal verification, which provides rigorous assurances -- e.g., by proving the absence of certain mispredictions. In this case-study paper, we demonstrate this process using an image-classifier DNN currently under development at Airbus and intended for use during the aircraft taxiing phase. We use formal methods to assess this DNN's robustness to three common image perturbation types: noise, brightness and contrast, and some of their combinations. This process entails multiple invocations of the underlying verifier, which might be computationally expensive; we therefore propose a method that leverages the monotonicity of these robustness properties, as well as the results of past verification queries, in order to reduce the overall number of verification queries required by nearly 60%. Our results provide an indication of the level of robustness achieved by the DNN classifier under study, and indicate that it is considerably more vulnerable to noise than to brightness or contrast perturbations.
+
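A sketch of how monotonicity can cut down verification queries: if the network is robust at some perturbation level, it is robust at all smaller levels, so a binary search over a discretized range finds the certified threshold with a logarithmic number of verifier calls. verify_robust stands in for the (expensive) formal query; this illustrates the general idea, not the paper's exact procedure.

def max_certified_level(levels, verify_robust):
    """levels: sorted perturbation magnitudes; returns the largest certified one (or None)."""
    lo, hi, best = 0, len(levels) - 1, None
    while lo <= hi:
        mid = (lo + hi) // 2
        if verify_robust(levels[mid]):     # robust here => robust for all smaller levels
            best = levels[mid]
            lo = mid + 1
        else:                              # not robust here => not robust for any larger level
            hi = mid - 1
    return best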
+
+ comment: This is a preprint version of the paper in the proceedings of 43rd + Digital Avionics Systems Conference (DASC) +
+
+
+
+
+ + ♻ ☆ Long-term Frame-Event Visual Tracking: Benchmark Dataset and Baseline + + +
+ Current event-/frame-event based trackers are evaluated on short-term tracking datasets; however, tracking in real-world scenarios involves long-term tracking, and the performance of existing tracking algorithms in these scenarios remains unclear. In this paper, we first propose a new long-term and large-scale frame-event single object tracking dataset, termed FELT. It contains 742 videos and 1,594,474 RGB frame and event stream pairs, and is the largest frame-event tracking dataset to date. We re-train and evaluate 15 baseline trackers on our dataset for future works to compare against. More importantly, we find that the RGB frames and event streams are naturally incomplete due to the influence of challenging factors and spatially sparse event flow. In response, we propose a novel associative memory Transformer network as a unified backbone by introducing modern Hopfield layers into multi-head self-attention blocks to fuse both RGB and event data. Extensive experiments on RGB-Event (FELT), RGB-Thermal (RGBT234, LasHeR), and RGB-Depth (DepthTrack) datasets fully validate the effectiveness of our model. The dataset and source code can be found at https://github.com/Event-AHU/FELT_SOT_Benchmark.
+
+
+ comment: In Peer Review +
+
+
+
+
+ + ♻ ☆ Advancing Ante-Hoc Explainable Models through Generative Adversarial + Networks + + +
+ This paper presents a novel concept learning framework for enhancing model +interpretability and performance in visual classification tasks. Our approach +appends an unsupervised explanation generator to the primary classifier network +and makes use of adversarial training. During training, the explanation module +is optimized to extract visual concepts from the classifier's latent +representations, while the GAN-based module aims to discriminate images +generated from concepts, from true images. This joint training scheme enables +the model to implicitly align its internally learned concepts with +human-interpretable visual properties. Comprehensive experiments demonstrate +the robustness of our approach, while producing coherent concept activations. +We analyse the learned concepts, showing their semantic concordance with object +parts and visual attributes. We also study how perturbations in the adversarial +training protocol impact both classification and concept acquisition. In +summary, this work presents a significant step towards building inherently +interpretable deep vision models with task-aligned concept representations - a +key enabler for developing trustworthy AI for real-world perception tasks. + +
+
+ comment: Paper accepted in Human-Centric Representation Learning workshop at + AAAI 2024 (https://hcrl-workshop.github.io/2024/). Paper accepted and + presented at Deployable AI Workshop at AAAI-2024 + (https://sites.google.com/view/dai-2024/home) +
+
+
+
+
+ + ♻ ☆ RAVE: Residual Vector Embedding for CLIP-Guided Backlit Image + Enhancement + + +
+ In this paper we propose a novel modification of Contrastive Language-Image +Pre-Training (CLIP) guidance for the task of unsupervised backlit image +enhancement. Our work builds on the state-of-the-art CLIP-LIT approach, which +learns a prompt pair by constraining the text-image similarity between a prompt +(negative/positive sample) and a corresponding image (backlit image/well-lit +image) in the CLIP embedding space. Learned prompts then guide an image +enhancement network. Based on the CLIP-LIT framework, we propose two novel +methods for CLIP guidance. First, we show that instead of tuning prompts in the +space of text embeddings, it is possible to directly tune their embeddings in +the latent space without any loss in quality. This accelerates training and +potentially enables the use of additional encoders that do not have a text +encoder. Second, we propose a novel approach that does not require any prompt +tuning. Instead, based on CLIP embeddings of backlit and well-lit images from +training data, we compute the residual vector in the embedding space as a +simple difference between the mean embeddings of the well-lit and backlit +images. This vector then guides the enhancement network during training, +pushing a backlit image towards the space of well-lit images. This approach +further dramatically reduces training time, stabilizes training and produces +high quality enhanced images without artifacts, both in supervised and +unsupervised training regimes. Additionally, we show that residual vectors can +be interpreted, revealing biases in training data, and thereby enabling +potential bias correction. + +
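A minimal sketch of the residual-vector idea described above: the difference of mean CLIP embeddings of well-lit and backlit training images gives a guidance direction, and a simple cosine-style loss (an assumption here, not the paper's exact objective) can push enhanced outputs along it.

import torch
import torch.nn.functional as F

def residual_vector(clip_embed_well_lit, clip_embed_backlit):
    """Both inputs: (N, D) image embeddings; returns a unit guidance direction."""
    r = clip_embed_well_lit.mean(0) - clip_embed_backlit.mean(0)
    return F.normalize(r, dim=0)

def guidance_loss(clip_embed_enhanced, clip_embed_input, residual):
    # encourage the enhancement to move the embedding along the residual direction
    delta = F.normalize(clip_embed_enhanced - clip_embed_input, dim=-1)
    return (1 - (delta * residual).sum(-1)).mean()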
+
+
+
+
+ + ♻ ☆ MeciFace: Mechanomyography and Inertial Fusion-based Glasses for Edge + Real-Time Recognition of Facial and Eating Activities + + +
+ The increasing prevalence of stress-related eating behaviors and their impact +on overall health highlights the importance of effective and ubiquitous +monitoring systems. In this paper, we present MeciFace, an innovative wearable +technology designed to monitor facial expressions and eating activities in +real-time on-the-edge (RTE). MeciFace aims to provide a low-power, +privacy-conscious, and highly accurate tool for promoting healthy eating +behaviors and stress management. We employ lightweight convolutional neural +networks as backbone models for facial expression and eating monitoring +scenarios. The MeciFace system ensures efficient data processing with a tiny +memory footprint, ranging from 11KB to 19 KB. During RTE evaluation, the system +achieves an F1-score of < 86% for facial expression recognition and 94% for +eating/drinking monitoring, for the RTE of unseen users (user-independent +case). + +
+
+ comment: Submitted to IEEE Transactions on Consumer Electronics +
+
+
+
+
+ + ♻ ☆ Language Guided Domain Generalized Medical Image Segmentation + + +
+ Single source domain generalization (SDG) holds promise for more reliable and +consistent image segmentation across real-world clinical settings particularly +in the medical domain, where data privacy and acquisition cost constraints +often limit the availability of diverse datasets. Depending solely on visual +features hampers the model's capacity to adapt effectively to various domains, +primarily because of the presence of spurious correlations and domain-specific +characteristics embedded within the image features. Incorporating text features +alongside visual features is a potential solution to enhance the model's +understanding of the data, as it goes beyond pixel-level information to provide +valuable context. Textual cues describing the anatomical structures, their +appearances, and variations across various imaging modalities can guide the +model in domain adaptation, ultimately contributing to more robust and +consistent segmentation. In this paper, we propose an approach that explicitly +leverages textual information by incorporating a contrastive learning mechanism +guided by the text encoder features to learn a more robust feature +representation. We assess the effectiveness of our text-guided contrastive +feature alignment technique in various scenarios, including cross-modality, +cross-sequence, and cross-site settings for different segmentation tasks. Our +approach achieves favorable performance against existing methods in literature. +Our code and model weights are available at +https://github.com/ShahinaKK/LG_SDG.git. + +
+
+ comment: Accepted at ISBI2024 +
+
+
+
+
+ + ♻ ☆ Scaling Up to Excellence: Practicing Model Scaling for Photo-Realistic + Image Restoration In the Wild CVPR 2024 + + +
+ We introduce SUPIR (Scaling-UP Image Restoration), a groundbreaking image +restoration method that harnesses generative prior and the power of model +scaling up. Leveraging multi-modal techniques and advanced generative prior, +SUPIR marks a significant advance in intelligent and realistic image +restoration. As a pivotal catalyst within SUPIR, model scaling dramatically +enhances its capabilities and demonstrates new potential for image restoration. +We collect a dataset comprising 20 million high-resolution, high-quality images +for model training, each enriched with descriptive text annotations. SUPIR +provides the capability to restore images guided by textual prompts, broadening +its application scope and potential. Moreover, we introduce negative-quality +prompts to further improve perceptual quality. We also develop a +restoration-guided sampling method to suppress the fidelity issue encountered +in generative-based restoration. Experiments demonstrate SUPIR's exceptional +restoration effects and its novel capacity to manipulate restoration through +textual prompts. + +
+
+ comment: This paper has been accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Analysis of Video Quality Datasets via Design of Minimalistic Video + Quality Models + + +
+ Blind video quality assessment (BVQA) plays an indispensable role in +monitoring and improving the end-users' viewing experience in various +real-world video-enabled media applications. As an experimental field, the +improvements of BVQA models have been measured primarily on a few human-rated +VQA datasets. Thus, it is crucial to gain a better understanding of existing +VQA datasets in order to properly evaluate the current progress in BVQA. +Towards this goal, we conduct a first-of-its-kind computational analysis of VQA +datasets via designing minimalistic BVQA models. By minimalistic, we restrict +our family of BVQA models to build only upon basic blocks: a video preprocessor +(for aggressive spatiotemporal downsampling), a spatial quality analyzer, an +optional temporal quality analyzer, and a quality regressor, all with the +simplest possible instantiations. By comparing the quality prediction +performance of different model variants on eight VQA datasets with realistic +distortions, we find that nearly all datasets suffer from the easy dataset +problem of varying severity, some of which even admit blind image quality +assessment (BIQA) solutions. We additionally justify our claims by contrasting +our model generalizability on these VQA datasets, and by ablating a dizzying +set of BVQA design choices related to the basic building blocks. Our results +cast doubt on the current progress in BVQA, and meanwhile shed light on good +practices of constructing next-generation VQA datasets and models. + +
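+ The minimalistic family can be pictured as the skeleton below: aggressive temporal subsampling, a frozen ImageNet backbone as the spatial quality analyzer, mean pooling in place of the optional temporal analyzer, and a linear quality regressor. This is a generic sketch, not the authors' exact instantiation:
+
+ import torch
+ import torch.nn as nn
+ from torchvision.models import resnet50, ResNet50_Weights
+
+ class MinimalBVQA(nn.Module):
+     def __init__(self):
+         super().__init__()
+         backbone = resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)
+         self.spatial = nn.Sequential(*list(backbone.children())[:-1])  # global-pooled features
+         self.regressor = nn.Linear(2048, 1)
+
+     def forward(self, video):                     # video: (T, 3, H, W), spatially downsampled
+         frames = video[::8]                       # aggressive temporal subsampling
+         feats = self.spatial(frames).flatten(1)   # (T', 2048) per-frame quality features
+         return self.regressor(feats.mean(0, keepdim=True))  # scalar quality score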
+
+
+
+
+ + ♻ ☆ A Robust Ensemble Algorithm for Ischemic Stroke Lesion Segmentation: + Generalizability and Clinical Utility Beyond the ISLES Challenge + + +
+ Diffusion-weighted MRI (DWI) is essential for stroke diagnosis, treatment +decisions, and prognosis. However, image and disease variability hinder the +development of generalizable AI algorithms with clinical value. We address this +gap by presenting a novel ensemble algorithm derived from the 2022 Ischemic +Stroke Lesion Segmentation (ISLES) challenge. ISLES'22 provided 400 patient +scans with ischemic stroke from various medical centers, facilitating the +development of a wide range of cutting-edge segmentation algorithms by the +research community. Through collaboration with leading teams, we combined +top-performing algorithms into an ensemble model that overcomes the limitations +of individual solutions. Our ensemble model achieved superior ischemic lesion +detection and segmentation accuracy on our internal test set compared to +individual algorithms. This accuracy generalized well across diverse image and +disease variables. Furthermore, the model excelled in extracting clinical +biomarkers. Notably, in a Turing-like test, neuroradiologists consistently +preferred the algorithm's segmentations over manual expert efforts, +highlighting increased comprehensiveness and precision. Validation using a +real-world external dataset (N=1686) confirmed the model's generalizability. +The algorithm's outputs also demonstrated strong correlations with clinical +scores (admission NIHSS and 90-day mRS) on par with or exceeding expert-derived +results, underlining its clinical relevance. This study offers two key +findings. First, we present an ensemble algorithm +(https://github.com/Tabrisrei/ISLES22_Ensemble) that detects and segments +ischemic stroke lesions on DWI across diverse scenarios on par with expert +(neuro)radiologists. Second, we show the potential for biomedical challenge +outputs to extend beyond the challenge's initial objectives, demonstrating +their real-world clinical applicability. + +
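+ Conceptually, the fusion step can be as simple as voxel-wise voting over the member models' lesion masks. The plain majority-vote sketch below is illustrative only; the released ensemble may weight or post-process its members differently:
+
+ import numpy as np
+
+ def ensemble_masks(member_masks, threshold=0.5):
+     # member_masks: list of binary DWI lesion masks of identical shape, one per model.
+     stacked = np.stack(member_masks).astype(np.float32)
+     return (stacked.mean(axis=0) >= threshold).astype(np.uint8)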
+
+
+
+
+ + ♻ ☆ Object-level Copy-Move Forgery Image Detection based on Inconsistency + Mining + + +
+ In copy-move tampering operations, perpetrators often employ techniques such as blurring to conceal tampering traces, posing significant challenges to the detection of object-level targets with intact structures. Focusing on these challenges, this paper proposes an Object-level Copy-Move Forgery Image Detection method based on Inconsistency Mining (IMNet). To obtain complete object-level targets, we customize prototypes for both the source and tampered regions and dynamically update them. Additionally, we extract inconsistent regions between coarse similar regions obtained through self-correlation calculations and regions composed of prototypes. The detected inconsistent regions are used as supplements to the coarse similar regions to refine pixel-level detection. We conduct experiments on three public datasets, which validate the effectiveness and robustness of the proposed IMNet.
+
+ comment: 4 pages, 2 figures, Accepted to WWW 2024 +
+
+
+
+
+ + ♻ ☆ InfLoRA: Interference-Free Low-Rank Adaptation for Continual Learning CVPR 2024 + + +
+ Continual learning requires the model to learn multiple tasks sequentially. +In continual learning, the model should possess the ability to maintain its +performance on old tasks (stability) and the ability to adapt to new tasks +continuously (plasticity). Recently, parameter-efficient fine-tuning (PEFT), +which involves freezing a pre-trained model and injecting a small number of +learnable parameters to adapt to downstream tasks, has gained increasing +popularity in continual learning. Although existing continual learning methods +based on PEFT have demonstrated superior performance compared to those not +based on PEFT, most of them do not consider how to eliminate the interference +of the new task on the old tasks, which inhibits the model from making a good +trade-off between stability and plasticity. In this work, we propose a new PEFT +method, called interference-free low-rank adaptation (InfLoRA), for continual +learning. InfLoRA injects a small number of parameters to reparameterize the +pre-trained weights and shows that fine-tuning these injected parameters is +equivalent to fine-tuning the pre-trained weights within a subspace. +Furthermore, InfLoRA designs this subspace to eliminate the interference of the +new task on the old tasks, making a good trade-off between stability and +plasticity. Experimental results show that InfLoRA outperforms existing +state-of-the-art continual learning methods on multiple datasets. + +
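+ The low-rank reparameterization at the core of such PEFT methods looks roughly as follows; InfLoRA's specific contribution, constraining the subspace spanned by the injected matrices so that new tasks do not interfere with old ones, is not reproduced in this sketch:
+
+ import torch
+ import torch.nn as nn
+
+ class LoRALinear(nn.Module):
+     # y = x W^T + x (B A)^T, with the pre-trained W frozen and only A, B trainable.
+     def __init__(self, base: nn.Linear, rank: int = 8):
+         super().__init__()
+         self.base = base
+         for p in self.base.parameters():
+             p.requires_grad = False
+         self.A = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
+         self.B = nn.Parameter(torch.zeros(base.out_features, rank))  # zero init: no change at start
+
+     def forward(self, x):
+         return self.base(x) + x @ (self.B @ self.A).t()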
+
+ comment: Accepted by the 2024 IEEE/CVF Conference on Computer Vision and + Pattern Recognition (CVPR 2024) +
+
+
+
+
+ + ♻ ☆ RRWNet: Recursive Refinement Network for Effective Retinal Artery/Vein + Segmentation and Classification + + +
+ The caliber and configuration of retinal blood vessels serve as important +biomarkers for various diseases and medical conditions. A thorough analysis of +the retinal vasculature requires the segmentation of the blood vessels and +their classification into arteries and veins, typically performed on color +fundus images obtained by retinography. However, manually performing these +tasks is labor-intensive and prone to human error. While several automated +methods have been proposed to address this task, the current state of art faces +challenges due to manifest classification errors affecting the topological +consistency of segmentation maps. In this work, we introduce RRWNet, a novel +end-to-end deep learning framework that addresses this limitation. The +framework consists of a fully convolutional neural network that recursively +refines semantic segmentation maps, correcting manifest classification errors +and thus improving topological consistency. In particular, RRWNet is composed +of two specialized subnetworks: a Base subnetwork that generates base +segmentation maps from the input images, and a Recursive Refinement subnetwork +that iteratively and recursively improves these maps. Evaluation on three +different public datasets demonstrates the state-of-the-art performance of the +proposed method, yielding more topologically consistent segmentation maps with +fewer manifest classification errors than existing approaches. In addition, the +Recursive Refinement module within RRWNet proves effective in post-processing +segmentation maps from other methods, further demonstrating its potential. The +model code, weights, and predictions will be publicly available at +https://github.com/j-morano/rrwnet. + +
+
+
+
+
+ + ♻ ☆ ARS-DETR: Aspect Ratio Sensitive Oriented Object Detection with + Transformer + + +
+ Existing oriented object detection methods commonly use the metric AP$_{50}$ to measure the performance of the model. We argue that AP$_{50}$ is inherently unsuitable for oriented object detection due to its large tolerance in angle deviation. Therefore, we advocate using a high-precision metric, e.g. AP$_{75}$, to measure the performance of models. In this paper, we propose an Aspect Ratio Sensitive Oriented Object Detector with Transformer, termed ARS-DETR, which exhibits competitive performance in high-precision oriented object detection. Specifically, a new angle classification method, called Aspect Ratio aware Circle Smooth Label (AR-CSL), is proposed to smooth the angle label in a more reasonable way and discard the hyperparameter introduced by previous work (e.g. CSL). Then, a rotated deformable attention module is designed to rotate the sampling points with the corresponding angles and eliminate the misalignment between region features and sampling points. Moreover, a dynamic weight coefficient according to the aspect ratio is adopted to calculate the angle loss. Comprehensive experiments on several challenging datasets show that our method achieves competitive performance on the high-precision oriented object detection task.
+
+ comment: 10 pages, 8 figures, 8 tables, the source code is available at + https://github.com/httle/ARS-DETR +
+
+
+
+
+ + ♻ ☆ Discriminative Sample-Guided and Parameter-Efficient Feature Space + Adaptation for Cross-Domain Few-Shot Learning + + +
+ In this paper, we look at cross-domain few-shot classification which presents +the challenging task of learning new classes in previously unseen domains with +few labelled examples. Existing methods, though somewhat effective, encounter +several limitations, which we alleviate through two significant improvements. +First, we introduce a lightweight parameter-efficient adaptation strategy to +address overfitting associated with fine-tuning a large number of parameters on +small datasets. This strategy employs a linear transformation of pre-trained +features, significantly reducing the trainable parameter count. Second, we +replace the traditional nearest centroid classifier with a discriminative +sample-aware loss function, enhancing the model's sensitivity to the inter- and +intra-class variances within the training set for improved clustering in +feature space. Empirical evaluations on the Meta-Dataset benchmark showcase +that our approach not only improves accuracy up to 7.7\% and 5.3\% on +previously seen and unseen datasets, respectively, but also achieves the above +performance while being at least $\sim3\times$ more parameter-efficient than +existing methods, establishing a new state-of-the-art in cross-domain few-shot +learning. Our code is available at https://github.com/rashindrie/DIPA. + +
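+ The parameter-efficient adaptation amounts to learning a single linear map on top of frozen features and classifying against class prototypes, roughly as sketched below. Names are illustrative, and the paper's discriminative sample-aware loss is replaced by plain cross-entropy against prototypes:
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ class LinearFeatureAdapter(nn.Module):
+     def __init__(self, dim):
+         super().__init__()
+         self.map = nn.Linear(dim, dim, bias=False)
+         nn.init.eye_(self.map.weight)          # start from the identity transform
+
+     def forward(self, feats):
+         return F.normalize(self.map(feats), dim=-1)
+
+ def prototype_logits(adapter, query_feats, class_prototypes, tau=0.1):
+     q = adapter(query_feats)                   # (N, D) adapted query features
+     p = adapter(class_prototypes)              # (C, D) adapted class prototypes
+     return q @ p.t() / tau                     # train with cross-entropy on the support set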
+
+ comment: Code is available at this link: https://github.com/rashindrie/DIPA +
+
+
+
+
+ + ♻ ☆ MotionChain: Conversational Motion Controllers via Multimodal Prompts + + +
+ Recent advancements in language models have demonstrated their adeptness in conducting multi-turn dialogues and retaining conversational context. However, this proficiency remains largely unexplored in other multimodal generative models, particularly in human motion models. By integrating multi-turn conversations in controlling continuous virtual human movements, generative human motion models can achieve an intuitive and step-by-step process of human task execution for humanoid robotics, game agents, or other embodied systems. In this work, we present MotionChain, a conversational human motion controller to generate continuous and long-term human motion through multimodal prompts. Specifically, MotionChain consists of multi-modal tokenizers that transform various data types, such as text, image, and motion, into discrete tokens, coupled with a Vision-Motion-aware Language model. By leveraging large-scale language, vision-language, and vision-motion data to assist motion-related generation tasks, MotionChain thus comprehends each instruction in a multi-turn conversation and generates human motions that follow these prompts. Extensive experiments validate the efficacy of MotionChain, demonstrating state-of-the-art performance in conversational motion generation, as well as more intuitive manners of controlling and interacting with virtual humans.
+
+ comment: 14 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ The Audio-Visual Conversational Graph: From an Egocentric-Exocentric + Perspective + + +
+ In recent years, the thriving development of research related to egocentric videos has provided a unique perspective for the study of conversational interactions, where both visual and audio signals play a crucial role. While most prior work focuses on learning about behaviors that directly involve the camera wearer, we introduce the Ego-Exocentric Conversational Graph Prediction problem, marking the first attempt to infer exocentric conversational interactions from egocentric videos. We propose a unified multi-modal framework -- Audio-Visual Conversational Attention (AV-CONV) -- for the joint prediction of conversation behaviors -- speaking and listening -- for both the camera wearer and all other social partners present in the egocentric video. Specifically, we adopt the self-attention mechanism to model the representations across time, across subjects, and across modalities. To validate our method, we conduct experiments on a challenging egocentric video dataset that includes multi-speaker and multi-conversation scenarios. Our results demonstrate the superior performance of our method compared to a series of baselines. We also present detailed ablation studies to assess the contribution of each component in our model. Check our project page at https://vjwq.github.io/AV-CONV/.
+
+
+
+
+ + ♻ ☆ LPSNet: End-to-End Human Pose and Shape Estimation with Lensless Imaging CVPR 2024 + + +
+ Human pose and shape (HPS) estimation with lensless imaging is not only beneficial to privacy protection but can also be used in covert surveillance scenarios due to the small size and simple structure of this device. However, this task presents significant challenges due to the inherent ambiguity of the captured measurements and the lack of effective methods for directly estimating human pose and shape from lensless data. In this paper, we propose, to our knowledge, the first end-to-end framework to recover 3D human poses and shapes from lensless measurements. We specifically design a multi-scale lensless feature decoder to decode the lensless measurements through the optically encoded mask for efficient feature extraction. We also propose a double-head auxiliary supervision mechanism to improve the estimation accuracy of human limb ends. Besides, we establish a lensless imaging system and verify the effectiveness of our method on various datasets acquired by our lensless imaging system.
+
+ comment: Accepted to CVPR 2024. More results available at + https://cic.tju.edu.cn/faculty/likun/projects/LPSNet +
+
+
+
+
+ + ♻ ☆ Semantic Human Mesh Reconstruction with Textures CVPR 2024 + + +
+ The field of 3D detailed human mesh reconstruction has made significant +progress in recent years. However, current methods still face challenges when +used in industrial applications due to unstable results, low-quality meshes, +and a lack of UV unwrapping and skinning weights. In this paper, we present +SHERT, a novel pipeline that can reconstruct semantic human meshes with +textures and high-precision details. SHERT applies semantic- and normal-based +sampling between the detailed surface (e.g. mesh and SDF) and the corresponding +SMPL-X model to obtain a partially sampled semantic mesh and then generates the +complete semantic mesh by our specifically designed self-supervised completion +and refinement networks. Using the complete semantic mesh as a basis, we employ +a texture diffusion model to create human textures that are driven by both +images and texts. Our reconstructed meshes have stable UV unwrapping, +high-quality triangle meshes, and consistent semantic information. The given +SMPL-X model provides semantic information and shape priors, allowing SHERT to +perform well even with incorrect and incomplete inputs. The semantic +information also makes it easy to substitute and animate different body parts +such as the face, body, and hands. Quantitative and qualitative experiments +demonstrate that SHERT is capable of producing high-fidelity and robust +semantic meshes that outperform state-of-the-art methods. + +
+
+ comment: Accepted to CVPR 2024. Project page: + https://zhanxy.xyz/projects/shert/ +
+
+
+
+
+ + ♻ ☆ Optimizing Diffusion Noise Can Serve As Universal Motion Priors CVPR 2024 + + +
+ We propose Diffusion Noise Optimization (DNO), a new method that effectively +leverages existing motion diffusion models as motion priors for a wide range of +motion-related tasks. Instead of training a task-specific diffusion model for +each new task, DNO operates by optimizing the diffusion latent noise of an +existing pre-trained text-to-motion model. Given the corresponding latent noise +of a human motion, it propagates the gradient from the target criteria defined +on the motion space through the whole denoising process to update the diffusion +latent noise. As a result, DNO supports any use cases where criteria can be +defined as a function of motion. In particular, we show that, for motion +editing and control, DNO outperforms existing methods in both achieving the +objective and preserving the motion content. DNO accommodates a diverse range +of editing modes, including changing trajectory, pose, joint locations, or +avoiding newly added obstacles. In addition, DNO is effective in motion +denoising and completion, producing smooth and realistic motion from noisy and +partial inputs. DNO achieves these results at inference time without the need +for model retraining, offering great versatility for any defined reward or loss +function on the motion representation. + +
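+ In code, the idea reduces to treating the initial latent noise as the only optimization variable and backpropagating a motion-space objective through the frozen denoiser. Both `denoise` and `criterion` below are placeholders for the pre-trained reverse process and the task loss:
+
+ import torch
+
+ def diffusion_noise_optimization(denoise, criterion, noise_shape, steps=300, lr=0.05):
+     z = torch.randn(noise_shape, requires_grad=True)   # diffusion latent noise
+     opt = torch.optim.Adam([z], lr=lr)
+     for _ in range(steps):
+         motion = denoise(z)        # gradient flows through the whole denoising chain
+         loss = criterion(motion)   # e.g. trajectory, joint-location, or obstacle targets
+         opt.zero_grad()
+         loss.backward()
+         opt.step()
+     return denoise(z).detach()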
+
+ comment: CVPR 2024. Project page: https://korrawe.github.io/dno-project/ +
+
+
+
+
+ + ♻ ☆ Analytic-Splatting: Anti-Aliased 3D Gaussian Splatting via Analytic + Integration + + +
+ 3D Gaussian Splatting (3DGS) has recently gained popularity by combining the advantages of both primitive-based and volumetric 3D representations, resulting in improved quality and efficiency for 3D scene rendering. However, 3DGS is not alias-free, and its rendering at varying resolutions could produce severe blurring or jaggies. This is because 3DGS treats each pixel as an isolated, single point rather than as an area, causing insensitivity to changes in the footprints of pixels. Consequently, this discrete sampling scheme inevitably results in aliasing, owing to the restricted sampling bandwidth. In this paper, we derive an analytical solution to address this issue. More specifically, we use a conditioned logistic function as the analytic approximation of the cumulative distribution function (CDF) of a one-dimensional Gaussian signal and calculate the Gaussian integral by subtracting the CDFs. We then introduce this approximation into the two-dimensional pixel shading and present Analytic-Splatting, which analytically approximates the Gaussian integral within the 2D pixel window area to better capture the intensity response of each pixel. Moreover, we use the approximated response of the pixel window integral area in the transmittance calculation of volume rendering, making Analytic-Splatting sensitive to changes in pixel footprint at different resolutions. Experiments on various datasets validate that our approach has better anti-aliasing capability, giving more detail and better fidelity.
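+ The 1D building block is easy to state: the Gaussian's integral over a pixel window is the difference of two CDF values, each approximated by a logistic function. The constant 1.702 below is the textbook logistic approximation of the standard normal CDF and stands in for the paper's conditioned logistic:
+
+ import torch
+
+ def gaussian_pixel_integral(mu, sigma, x_lo, x_hi, k=1.702):
+     # Integral of N(mu, sigma^2) over [x_lo, x_hi], with Phi approximated by a sigmoid.
+     cdf = lambda x: torch.sigmoid(k * (x - mu) / sigma)
+     return cdf(x_hi) - cdf(x_lo)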
+
+ comment: 29 pages +
+
+
+
+
+ + ♻ ☆ MLLMReID: Multimodal Large Language Model-based Person Re-identification + + +
+ Multimodal large language models (MLLMs) have achieved satisfactory results in many tasks. However, their performance in the task of person re-identification (ReID) has not been explored to date. This paper investigates how to adapt them for the task of ReID. An intuitive idea is to fine-tune an MLLM with ReID image-text datasets and then use its visual encoder as a backbone for ReID. However, two apparent issues remain: (1) when designing instructions for ReID, MLLMs may overfit to specific instructions, and designing a variety of instructions leads to higher costs; (2) latent image feature vectors from LLMs are not involved in loss computation. Instructional learning, which aligns image-text features, results in indirect optimization and a learning objective that inadequately utilizes features, limiting effectiveness in person feature learning. To address these problems, this paper proposes MLLMReID: Multimodal Large Language Model-based ReID. First, we propose Common Instruction, a simple approach that leverages the inherent ability of LLMs to continue writing, avoiding complex and diverse instruction design. Second, we propose DirectReID, which effectively employs the latent image feature vectors output by LLMs in ReID tasks. The experimental results demonstrate the superiority of our method. We will open-source the code on GitHub.
+
+
+
+
+ + ♻ ☆ 3D Open-Vocabulary Panoptic Segmentation with 2D-3D Vision-Language + Distillation + + +
+ 3D panoptic segmentation is a challenging perception task, especially in +autonomous driving. It aims to predict both semantic and instance annotations +for 3D points in a scene. Although prior 3D panoptic segmentation approaches +have achieved great performance on closed-set benchmarks, generalizing these +approaches to unseen things and unseen stuff categories remains an open +problem. For unseen object categories, 2D open-vocabulary segmentation has +achieved promising results that solely rely on frozen CLIP backbones and +ensembling multiple classification outputs. However, we find that simply +extending these 2D models to 3D does not guarantee good performance due to poor +per-mask classification quality, especially for novel stuff categories. In this +paper, we propose the first method to tackle 3D open-vocabulary panoptic +segmentation. Our model takes advantage of the fusion between learnable LiDAR +features and dense frozen vision CLIP features, using a single classification +head to make predictions for both base and novel classes. To further improve +the classification performance on novel classes and leverage the CLIP model, we +propose two novel loss functions: object-level distillation loss and +voxel-level distillation loss. Our experiments on the nuScenes and +SemanticKITTI datasets show that our method outperforms the strong baseline by +a large margin. + +
+
+
+
+
+ + ♻ ☆ On-Device Training Under 256KB Memory NeurIPS 2022 + + +
+ On-device training enables the model to adapt to new data collected from the +sensors by fine-tuning a pre-trained model. Users can benefit from customized +AI models without having to transfer the data to the cloud, protecting the +privacy. However, the training memory consumption is prohibitive for IoT +devices that have tiny memory resources. We propose an algorithm-system +co-design framework to make on-device training possible with only 256KB of +memory. On-device training faces two unique challenges: (1) the quantized +graphs of neural networks are hard to optimize due to low bit-precision and the +lack of normalization; (2) the limited hardware resource does not allow full +back-propagation. To cope with the optimization difficulty, we propose +Quantization-Aware Scaling to calibrate the gradient scales and stabilize 8-bit +quantized training. To reduce the memory footprint, we propose Sparse Update to +skip the gradient computation of less important layers and sub-tensors. The +algorithm innovation is implemented by a lightweight training system, Tiny +Training Engine, which prunes the backward computation graph to support sparse +updates and offload the runtime auto-differentiation to compile time. Our +framework is the first solution to enable tiny on-device training of +convolutional neural networks under 256KB SRAM and 1MB Flash without auxiliary +memory, using less than 1/1000 of the memory of PyTorch and TensorFlow while +matching the accuracy on tinyML application VWW. Our study enables IoT devices +not only to perform inference but also to continuously adapt to new data for +on-device lifelong learning. A video demo can be found here: +https://youtu.be/0pUFZYdoMY8. + +
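+ The sparse-update idea, stripped of the system-level machinery, is to freeze everything except a small, pre-selected set of layers so that backward memory stays within the budget. The layer names below are placeholders; the paper selects the important layers and sub-tensors offline via contribution analysis:
+
+ import torch
+
+ def select_sparse_update_params(model, trainable_prefixes):
+     # Only parameters whose names match the chosen prefixes receive gradients.
+     for name, param in model.named_parameters():
+         param.requires_grad = any(name.startswith(p) for p in trainable_prefixes)
+     return [p for p in model.parameters() if p.requires_grad]
+
+ # e.g. optimizer = torch.optim.SGD(
+ #          select_sparse_update_params(model, ["classifier", "features.12"]), lr=0.01)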
+
+ comment: NeurIPS 2022 +
+
+
+
+
+ + ♻ ☆ Burst Super-Resolution with Diffusion Models for Improving Perceptual + Quality + + +
+ While burst LR images are useful for improving the SR image quality compared +with a single LR image, prior SR networks accepting the burst LR images are +trained in a deterministic manner, which is known to produce a blurry SR image. +In addition, it is difficult to perfectly align the burst LR images, making the +SR image more blurry. Since such blurry images are perceptually degraded, we +aim to reconstruct the sharp high-fidelity boundaries. Such high-fidelity +images can be reconstructed by diffusion models. However, prior SR methods +using the diffusion model are not properly optimized for the burst SR task. +Specifically, the reverse process starting from a random sample is not +optimized for image enhancement and restoration methods, including burst SR. In +our proposed method, on the other hand, burst LR features are used to +reconstruct the initial burst SR image that is fed into an intermediate step in +the diffusion model. This reverse process from the intermediate step 1) skips +diffusion steps for reconstructing the global structure of the image and 2) +focuses on steps for refining detailed textures. Our experimental results +demonstrate that our method can improve the scores of the perceptual quality +metrics. Code: https://github.com/placerkyo/BSRD + +
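+ The warm start can be sketched as diffusing the initial burst-SR estimate to an intermediate timestep and then sampling only the remaining steps. This is a generic DDPM-style forward noising; the construction of the initial estimate from burst LR features is not shown:
+
+ import torch
+
+ def warm_start_latent(x_init, t_mid, alphas_cumprod):
+     # Standard forward diffusion of x_init to timestep t_mid; the reverse sampler
+     # then runs only for t = t_mid, ..., 0 instead of starting from pure noise.
+     a_bar = alphas_cumprod[t_mid]
+     noise = torch.randn_like(x_init)
+     return a_bar.sqrt() * x_init + (1.0 - a_bar).sqrt() * noise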
+
+ comment: Accepted to IJCNN 2024 (International Joint Conference on Neural + Networks) +
+
+
+
+
+ + ♻ ☆ MI-NeRF: Learning a Single Face NeRF from Multiple Identities + + +
+ In this work, we introduce a method that learns a single dynamic neural +radiance field (NeRF) from monocular talking face videos of multiple +identities. NeRFs have shown remarkable results in modeling the 4D dynamics and +appearance of human faces. However, they require per-identity optimization. +Although recent approaches have proposed techniques to reduce the training and +rendering time, increasing the number of identities can be expensive. We +introduce MI-NeRF (multi-identity NeRF), a single unified network that models +complex non-rigid facial motion for multiple identities, using only monocular +videos of arbitrary length. The core premise in our method is to learn the +non-linear interactions between identity and non-identity specific information +with a multiplicative module. By training on multiple videos simultaneously, +MI-NeRF not only reduces the total training time compared to standard +single-identity NeRFs, but also demonstrates robustness in synthesizing novel +expressions for any input identity. We present results for both facial +expression transfer and talking face video synthesis. Our method can be further +personalized for a target identity given only a short video. + +
+
+ comment: Project page: https://aggelinacha.github.io/MI-NeRF/ +
+
+
+
+
+ + ♻ ☆ HAC: Hash-grid Assisted Context for 3D Gaussian Splatting Compression + + +
+ 3D Gaussian Splatting (3DGS) has emerged as a promising framework for novel +view synthesis, boasting rapid rendering speed with high fidelity. However, the +substantial Gaussians and their associated attributes necessitate effective +compression techniques. Nevertheless, the sparse and unorganized nature of the +point cloud of Gaussians (or anchors in our paper) presents challenges for +compression. To address this, we make use of the relations between the +unorganized anchors and the structured hash grid, leveraging their mutual +information for context modeling, and propose a Hash-grid Assisted Context +(HAC) framework for highly compact 3DGS representation. Our approach introduces +a binary hash grid to establish continuous spatial consistencies, allowing us +to unveil the inherent spatial relations of anchors through a carefully +designed context model. To facilitate entropy coding, we utilize Gaussian +distributions to accurately estimate the probability of each quantized +attribute, where an adaptive quantization module is proposed to enable +high-precision quantization of these attributes for improved fidelity +restoration. Additionally, we incorporate an adaptive masking strategy to +eliminate invalid Gaussians and anchors. Importantly, our work is the pioneer +to explore context-based compression for 3DGS representation, resulting in a +remarkable size reduction of over $75\times$ compared to vanilla 3DGS, while +simultaneously improving fidelity, and achieving over $11\times$ size reduction +over SOTA 3DGS compression approach Scaffold-GS. Our code is available here: +https://github.com/YihangChen-ee/HAC + +
+
+ comment: Project Page: https://yihangchen-ee.github.io/project_hac/ Code: + https://github.com/YihangChen-ee/HAC +
+
+
+
+
+ + ♻ ☆ Causal Intervention for Subject-Deconfounded Facial Action Unit + Recognition + + +
+ Subject-invariant facial action unit (AU) recognition remains challenging for +the reason that the data distribution varies among subjects. In this paper, we +propose a causal inference framework for subject-invariant facial action unit +recognition. To illustrate the causal effect existing in AU recognition task, +we formulate the causalities among facial images, subjects, latent AU semantic +relations, and estimated AU occurrence probabilities via a structural causal +model. By constructing such a causal diagram, we clarify the causal effect +among variables and propose a plug-in causal intervention module, CIS, to +deconfound the confounder \emph{Subject} in the causal diagram. Extensive +experiments conducted on two commonly used AU benchmark datasets, BP4D and +DISFA, show the effectiveness of our CIS, and the model with CIS inserted, +CISNet, has achieved state-of-the-art performance. + +
+
+ comment: Accepted by AAAI2022 +
+
+
+
+
+ + ♻ ☆ Improved Zero-Shot Classification by Adapting VLMs with Text + Descriptions + + +
+ The zero-shot performance of existing vision-language models (VLMs) such as +CLIP is limited by the availability of large-scale, aligned image and text +datasets in specific domains. In this work, we leverage two complementary +sources of information -- descriptions of categories generated by large +language models (LLMs) and abundant, fine-grained image classification datasets +-- to improve the zero-shot classification performance of VLMs across +fine-grained domains. On the technical side, we develop methods to train VLMs +with this "bag-level" image-text supervision. We find that simply using these +attributes at test-time does not improve performance, but our training +strategy, for example, on the iNaturalist dataset, leads to an average +improvement of 4-5% in zero-shot classification accuracy for novel categories +of birds and flowers. Similar improvements are observed in domains where a +subset of the categories was used to fine-tune the model. By prompting LLMs in +various ways, we generate descriptions that capture visual appearance, habitat, +and geographic regions and pair them with existing attributes such as the +taxonomic structure of the categories. We systematically evaluate their ability +to improve zero-shot categorization in natural domains. Our findings suggest +that geographic priors can be just as effective and are complementary to visual +appearance. Our method also outperforms prior work on prompt-based tuning of +VLMs. We release the benchmark, consisting of 14 datasets at +https://github.com/cvl-umass/AdaptCLIPZS , which will contribute to future +research in zero-shot recognition. + +
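+ At inference time the descriptions are used much like standard CLIP prompt ensembling: embed several LLM-generated descriptions per class, average them into a classifier weight, and score images by cosine similarity. This sketch uses placeholder encoders and omits the paper's bag-level fine-tuning:
+
+ import torch
+ import torch.nn.functional as F
+
+ def build_zero_shot_classifier(encode_text, class_descriptions):
+     # class_descriptions: list (one entry per class) of lists of description strings.
+     weights = []
+     for descs in class_descriptions:
+         emb = F.normalize(encode_text(descs), dim=-1)     # (K, D) per-description embeddings
+         weights.append(F.normalize(emb.mean(0), dim=-1))  # averaged class weight
+     return torch.stack(weights)                           # (C, D)
+
+ def classify(image_embedding, classifier, tau=0.01):
+     return (F.normalize(image_embedding, dim=-1) @ classifier.t() / tau).softmax(-1)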
+
+
+
+
+ + ♻ ☆ Cooperative Students: Navigating Unsupervised Domain Adaptation in + Nighttime Object Detection + + +
+ Unsupervised Domain Adaptation (UDA) has shown significant advancements in object detection under well-lit conditions; however, its performance degrades notably in low-visibility scenarios, especially at night, posing challenges not only for its adaptability in low signal-to-noise ratio (SNR) conditions but also for the reliability and efficiency of automated vehicles. To address this problem, we propose a \textbf{Co}operative \textbf{S}tudents (\textbf{CoS}) framework that innovatively employs global-local transformations (GLT) and a proxy-based target consistency (PTC) mechanism to effectively capture the spatial consistency in day- and night-time scenarios, and thus bridge the significant domain shift across contexts. Building upon this, we further devise an adaptive IoU-informed thresholding (AIT) module to gradually avoid overlooking potential true positives and enrich the latent information in the target domain. Comprehensive experiments show that CoS substantially enhances UDA performance in low-visibility conditions and surpasses current state-of-the-art techniques, achieving an increase in mAP of 3.0\%, 1.9\%, and 2.5\% on the BDD100K, SHIFT, and ACDC datasets, respectively. Code is available at https://github.com/jichengyuan/Cooperitive_Students.
+
+ comment: Code is available at + https://github.com/jichengyuan/Cooperitive_Students +
+
+
+
+
+ + ♻ ☆ Mirasol3B: A Multimodal Autoregressive model for time-aligned and + contextual modalities CVPR 2024 + + +
+ One of the main challenges of multimodal learning is the need to combine +heterogeneous modalities (e.g., video, audio, text). For example, video and +audio are obtained at much higher rates than text and are roughly aligned in +time. They are often not synchronized with text, which comes as a global +context, e.g., a title, or a description. Furthermore, video and audio inputs +are of much larger volumes, and grow as the video length increases, which +naturally requires more compute dedicated to these modalities and makes +modeling of long-range dependencies harder. + We here decouple the multimodal modeling, dividing it into separate, focused +autoregressive models, processing the inputs according to the characteristics +of the modalities. We propose a multimodal model, called Mirasol3B, consisting +of an autoregressive component for the time-synchronized modalities (audio and +video), and an autoregressive component for the context modalities which are +not necessarily aligned in time but are still sequential. To address the +long-sequences of the video-audio inputs, we propose to further partition the +video and audio sequences in consecutive snippets and autoregressively process +their representations. To that end, we propose a Combiner mechanism, which +models the audio-video information jointly within a timeframe. The Combiner +learns to extract audio and video features from raw spatio-temporal signals, +and then learns to fuse these features producing compact but expressive +representations per snippet. + Our approach achieves the state-of-the-art on well established multimodal +benchmarks, outperforming much larger models. It effectively addresses the high +computational demand of media inputs by both learning compact representations, +controlling the sequence length of the audio-video feature representations, and +modeling their dependencies in time. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ ViTamin: Designing Scalable Vision Models in the Vision-Language Era CVPR 2024 + + +
+ Recent breakthroughs in vision-language models (VLMs) start a new page in the vision community. The VLMs provide stronger and more generalizable feature embeddings compared to those from ImageNet-pretrained models, thanks to training on large-scale Internet image-text pairs. However, despite the amazing achievements of VLMs, vanilla Vision Transformers (ViTs) remain the default choice for the image encoder. Although the pure transformer has proven its effectiveness in the text encoding area, it remains questionable whether this also holds for image encoding, especially considering that various types of networks have been proposed on the ImageNet benchmark which, unfortunately, are rarely studied in VLMs. Due to the small data/model scale, the original conclusions of model design on ImageNet can be limited and biased. In this paper, we aim at building an evaluation protocol of vision models in the vision-language era under the contrastive language-image pretraining (CLIP) framework. We provide a comprehensive way to benchmark different vision models, covering their zero-shot performance and scalability in both model and training data sizes. To this end, we introduce ViTamin, a new vision model tailored for VLMs. ViTamin-L significantly outperforms ViT-L by 2.0% ImageNet zero-shot accuracy when using the same publicly available DataComp-1B dataset and the same OpenCLIP training scheme. ViTamin-L presents promising results on 60 diverse benchmarks, including classification, retrieval, open-vocabulary detection and segmentation, and large multi-modal models. When further scaling up the model size, our ViTamin-XL with only 436M parameters attains 82.9% ImageNet zero-shot accuracy, surpassing the 82.0% achieved by EVA-E, which has ten times more parameters (4.4B).
+
+ comment: CVPR 2024; https://github.com/Beckschen/ViTamin +
+
+
+
+
+ + ♻ ☆ Generating Images with 3D Annotations Using Diffusion Models ICLR 2024 + + +
+ Diffusion models have emerged as a powerful generative method, capable of +producing stunning photo-realistic images from natural language descriptions. +However, these models lack explicit control over the 3D structure in the +generated images. Consequently, this hinders our ability to obtain detailed 3D +annotations for the generated images or to craft instances with specific poses +and distances. In this paper, we propose 3D Diffusion Style Transfer (3D-DST), +which incorporates 3D geometry control into diffusion models. Our method +exploits ControlNet, which extends diffusion models by using visual prompts in +addition to text prompts. We generate images of the 3D objects taken from 3D +shape repositories (e.g., ShapeNet and Objaverse), render them from a variety +of poses and viewing directions, compute the edge maps of the rendered images, +and use these edge maps as visual prompts to generate realistic images. With +explicit 3D geometry control, we can easily change the 3D structures of the +objects in the generated images and obtain ground-truth 3D annotations +automatically. This allows us to improve a wide range of vision tasks, e.g., +classification and 3D pose estimation, in both in-distribution (ID) and +out-of-distribution (OOD) settings. We demonstrate the effectiveness of our +method through extensive experiments on ImageNet-100/200, ImageNet-R, +PASCAL3D+, ObjectNet3D, and OOD-CV. The results show that our method +significantly outperforms existing methods, e.g., 3.8 percentage points on +ImageNet-100 using DeiT-B. + +
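+ The generation step itself can be reproduced with off-the-shelf components: render a 3D asset at a known pose, take its Canny edge map, and feed it to a ControlNet-conditioned diffusion pipeline. The checkpoints below are the public Canny ControlNet and Stable Diffusion 1.5 weights, used purely for illustration and not necessarily those used by the paper:
+
+ import cv2
+ import numpy as np
+ import torch
+ from PIL import Image
+ from diffusers import ControlNetModel, StableDiffusionControlNetPipeline
+
+ render = cv2.imread("rendered_view.png")        # pose-controlled render of a 3D shape
+ edges = Image.fromarray(np.stack([cv2.Canny(render, 100, 200)] * 3, axis=-1))
+
+ controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny",
+                                              torch_dtype=torch.float16)
+ pipe = StableDiffusionControlNetPipeline.from_pretrained(
+     "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
+ ).to("cuda")
+
+ # The edge prompt comes from a render whose 3D pose is known, so the generated
+ # image inherits that 3D annotation for free.
+ image = pipe("a photo of a sports car on a street", image=edges,
+              num_inference_steps=30).images[0]
+ image.save("car_with_3d_annotation.png")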
+
+ comment: ICLR 2024 Spotlight. Code: https://ccvl.jhu.edu/3D-DST/ +
+
+
+
+
+ + ♻ ☆ Effective Adapter for Face Recognition in the Wild + + +
+ In this paper, we tackle the challenge of face recognition in the wild, where images often suffer from low quality and real-world distortions. Traditional heuristic approaches - either training models directly on these degraded images or on their enhanced counterparts produced by face restoration techniques - have proven ineffective, primarily due to the degradation of facial features and the discrepancy in image domains. To overcome these issues, we propose an effective adapter for augmenting existing face recognition models trained on high-quality facial datasets. The key to our adapter is to process both the unrefined and enhanced images using two similar structures, one fixed and the other trainable. Such a design confers two benefits. First, the dual-input system minimizes the domain gap while providing varied perspectives for the face recognition model, where the enhanced image can be regarded as a complex non-linear transformation of the original one by the restoration model. Second, both similar structures can be initialized from pre-trained models without dropping past knowledge. Extensive experiments in zero-shot settings show the effectiveness of our method, surpassing baselines by about 3%, 4%, and 7% on three datasets. Our code will be publicly available.
+
+
+
+
+
+
+ + + +
+
+ +
+
+
+
diff --git a/index.js b/index.js
new file mode 100644
index 0000000..69f5da7
--- /dev/null
+++ b/index.js
@@ -0,0 +1,39 @@
+/* Expand/Collapse all entries with the TAB key */
+var expanded = false;
+document.onkeydown = function (e) {
+    if (e.keyCode === 9) {
+        expanded = !expanded;
+        document.querySelectorAll("details").forEach(detail => detail.open = expanded);
+        return false;
+    }
+};
+
+/* Switch Theme */
+const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]');
+
+function switchTheme(e) {
+    if (e.target.checked) {
+        document.documentElement.setAttribute('data-theme', 'light');
+        document.getElementById("theme-icon").className = "ri-sun-line";
+        localStorage.setItem('theme', 'light'); // persist the choice
+    } else {
+        document.documentElement.setAttribute('data-theme', 'dark');
+        document.getElementById("theme-icon").className = "ri-moon-line";
+        localStorage.setItem('theme', 'dark'); // persist the choice
+    }
+}
+
+toggleSwitch.addEventListener('change', switchTheme, false);
+const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null;
+if (currentTheme) {
+    document.documentElement.setAttribute('data-theme', currentTheme);
+    if (currentTheme === 'light') {
+        toggleSwitch.checked = true;
+    }
+}
+
+const timestamp = document.getElementById("build-timestamp");
+const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString();
+
+const badge = document.getElementById("build-timestamp-badge");
+// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`