openzim · kelson42 · Jan 10, 2026 · Jan 9, 2026 · Jan 9, 2026 · Jan 9, 2026
diff --git a/src/cluster.cpp b/src/cluster.cpp
@@ -73,37 +73,49 @@ getClusterReader(const Reader& zimReader, offset_t offset, Cluster::Compression*
 
 } // unnamed namespace
 
-  std::shared_ptr<Cluster> Cluster::read(const Reader& zimReader, offset_t clusterOffset)
+  std::shared_ptr<Cluster> Cluster::read(const Reader& zimReader, offset_t clusterOffset, size_t maxBlobCount)
   {
     Compression comp;
     bool extended;
     auto reader = getClusterReader(zimReader, clusterOffset, &comp, &extended);
-    return std::make_shared<Cluster>(std::move(reader), comp, extended);
+    return std::make_shared<Cluster>(std::move(reader), comp, extended, maxBlobCount); // the constructor parses the header and checks it against maxBlobCount
   }
 
-  Cluster::Cluster(std::unique_ptr<IStreamReader> reader_, Compression comp, bool isExtended)
+  Cluster::Cluster(std::unique_ptr<IStreamReader> reader_, Compression comp, bool isExtended, size_t maxBlobCount)
     : compression(comp),
       isExtended(isExtended),
       m_reader(std::move(reader_))
   {
     if (isExtended) {
-      read_header<uint64_t>();
+      read_header<uint64_t>(maxBlobCount); // extended clusters: header offsets are read as 64-bit values
     } else {
-      read_header<uint32_t>();
+      read_header<uint32_t>(maxBlobCount); // non-extended clusters: header offsets are read as 32-bit values
     }
   }
 
   Cluster::~Cluster() = default;
 
   /* This return the number of char read */
   template<typename OFFSET_TYPE>
-  void Cluster::read_header()
+  void Cluster::read_header(size_t maxBlobCount)
   {
     // read first offset, which specifies, how many offsets we need to read
     OFFSET_TYPE offset = m_reader->read<OFFSET_TYPE>();
 
+    if ( offset < 2 * sizeof(OFFSET_TYPE) ) { // the offset table needs at least two entries
+        throw zim::ZimFileFormatError("Error parsing cluster. Offset of the first blob is too small.");
+    }
+
     size_t n_offset = offset / sizeof(OFFSET_TYPE);
 
+    if ( n_offset * sizeof(OFFSET_TYPE) != offset ) { // offset must be a whole number of OFFSET_TYPE entries
+        throw zim::ZimFileFormatError("Error parsing cluster. Offset of the first blob is not properly aligned.");
+    }
+
+    if ( n_offset - 1 > maxBlobCount ) { // n_offset >= 2 here; subtracting avoids wrap-around of maxBlobCount + 1
+        throw zim::ZimFileFormatError("Error parsing cluster. Offset of the first blob is too large.");
+    }
+
     // read offsets
     m_blobOffsets.clear();
     m_blobOffsets.reserve(n_offset);

diff --git a/src/cluster.h b/src/cluster.h
@@ -73,11 +73,11 @@ namespace zim
 
 
       template<typename OFFSET_TYPE>
-      void read_header();
+      void read_header(size_t maxBlobCount);
       const Reader& getReader(blob_index_t n) const;
 
     public:
-      Cluster(std::unique_ptr<IStreamReader> reader, Compression comp, bool isExtended);
+      Cluster(std::unique_ptr<IStreamReader> reader, Compression comp, bool isExtended, size_t maxBlobCount);
       ~Cluster();
       Compression getCompression() const   { return compression; }
       bool isCompressed() const                { return compression != Compression::None; }
@@ -92,7 +92,7 @@ namespace zim
 
       size_t getMemorySize() const;
 
-      static std::shared_ptr<Cluster> read(const Reader& zimReader, offset_t clusterOffset);
+      static std::shared_ptr<Cluster> read(const Reader& zimReader, offset_t clusterOffset, size_t maxBlobCount);
   };
 
   struct ClusterMemorySize {

diff --git a/src/fileimpl.cpp b/src/fileimpl.cpp
@@ -500,11 +500,17 @@ class Grouping
     return entry_index_t(m_articleListByCluster[idx.v]);
   }
 
+  size_t FileImpl::getMaxBlobCountInCluster(cluster_index_t /*idx*/) const
+  {
+    return getCountArticles().v; // same coarse bound for every cluster: the archive's article count (idx is not consulted)
+  }
+
   ClusterHandle FileImpl::readCluster(cluster_index_t idx) const
   {
     offset_t clusterOffset(getClusterOffset(idx));
     log_debug("read cluster " << idx << " from offset " << clusterOffset);
-    return Cluster::read(*zimReader, clusterOffset);
+    const auto maxBlobCountInCluster = getMaxBlobCountInCluster(idx); // upper bound handed to the cluster header parser
+    return Cluster::read(*zimReader, clusterOffset, maxBlobCountInCluster);
   }
 
   ClusterHandle FileImpl::getCluster(cluster_index_t idx) const

diff --git a/src/fileimpl.h b/src/fileimpl.h
@@ -192,6 +192,7 @@ namespace zim
       offset_type getMimeListEndUpperLimit() const;
       void readMimeTypes();
       void quickCheckForCorruptFile();
+      size_t getMaxBlobCountInCluster(cluster_index_t idx) const;
 
       bool checkChecksum();
       bool checkDirentPtrs();

diff --git a/test/archive.cpp b/test/archive.cpp
@@ -791,6 +791,41 @@ TEST_F(ZimArchive, validate)
     "Invalid cluster pointer\n"
   );
 
+  TEST_BROKEN_ZIM_NAME(
+    "invalid.too_small_offset_of_first_blob_in_cluster_0.zim",
+     "Error parsing cluster. Offset of the first blob is too small.\n"
+  )
+
+  TEST_BROKEN_ZIM_NAME(
+    "invalid.too_small_offset_of_first_blob_in_cluster_4.zim",
+     "Error parsing cluster. Offset of the first blob is too small.\n"
+  )
+
+  TEST_BROKEN_ZIM_NAME(
+    "invalid.too_small_offset_of_first_blob_in_cluster_7.zim",
+     "Error parsing cluster. Offset of the first blob is too small.\n"
+  )
+
+  TEST_BROKEN_ZIM_NAME(
+    "invalid.misaligned_offset_of_first_blob_in_cluster_9.zim",
+     "Error parsing cluster. Offset of the first blob is not properly aligned.\n"
+  )
+
+  TEST_BROKEN_ZIM_NAME(
+    "invalid.misaligned_offset_of_first_blob_in_cluster_10.zim",
+     "Error parsing cluster. Offset of the first blob is not properly aligned.\n"
+  )
+
+  TEST_BROKEN_ZIM_NAME(
+    "invalid.misaligned_offset_of_first_blob_in_cluster_11.zim",
+     "Error parsing cluster. Offset of the first blob is not properly aligned.\n"
+  )
+
+  TEST_BROKEN_ZIM_NAME(
+    "invalid.too_large_offset_of_first_blob_in_cluster.zim",
+     "Error parsing cluster. Offset of the first blob is too large.\n"
+  )
+
   TEST_BROKEN_ZIM_NAME(
     "invalid.offset_in_cluster.zim",
      "Error parsing cluster. Offsets are not ordered.\n"