Quick detection of too large offset of 1st blob in cluster

veloman-yunkan · kelson42 · commit d2e8f88d51de · 2026-01-10T12:31:00.000+01:00
The offset of the first blob in a cluster (which also determines how many
blob offsets are read from the cluster header) is considered corrupted if
it implies that the cluster contains more blobs than there are articles in
the ZIM file.

Note that a stronger check could be performed if a tighter upper limit
on the count of blobs in the given cluster were known. In particular, a
precise check would be possible if the count of blobs in the cluster were
available; however, the only quick source of that information is the very
value that has to be checked.
diff --git a/src/cluster.cpp b/src/cluster.cpp
@@ -73,31 +73,31 @@ getClusterReader(const Reader& zimReader, offset_t offset, Cluster::Compression*
 
 } // unnamed namespace
 
-  std::shared_ptr<Cluster> Cluster::read(const Reader& zimReader, offset_t clusterOffset)
+  std::shared_ptr<Cluster> Cluster::read(const Reader& zimReader, offset_t clusterOffset, size_t maxBlobCount)
   {
     Compression comp;
     bool extended;
     auto reader = getClusterReader(zimReader, clusterOffset, &comp, &extended);
-    return std::make_shared<Cluster>(std::move(reader), comp, extended);
+    return std::make_shared<Cluster>(std::move(reader), comp, extended, maxBlobCount);
   }
 
-  Cluster::Cluster(std::unique_ptr<IStreamReader> reader_, Compression comp, bool isExtended)
+  Cluster::Cluster(std::unique_ptr<IStreamReader> reader_, Compression comp, bool isExtended, size_t maxBlobCount)
     : compression(comp),
       isExtended(isExtended),
       m_reader(std::move(reader_))
   {
     if (isExtended) {
-      read_header<uint64_t>();
+      read_header<uint64_t>(maxBlobCount);
     } else {
-      read_header<uint32_t>();
+      read_header<uint32_t>(maxBlobCount);
     }
   }
 
   Cluster::~Cluster() = default;
 
   /* This return the number of char read */
   template<typename OFFSET_TYPE>
-  void Cluster::read_header()
+  void Cluster::read_header(size_t maxBlobCount)
   {
     // read first offset, which specifies, how many offsets we need to read
     OFFSET_TYPE offset = m_reader->read<OFFSET_TYPE>();
@@ -112,6 +112,10 @@ getClusterReader(const Reader& zimReader, offset_t offset, Cluster::Compression*
         throw zim::ZimFileFormatError("Error parsing cluster. Offset of the first blob is not properly aligned.");
     }
 
+    if ( n_offset > maxBlobCount + 1 ) {
+        throw zim::ZimFileFormatError("Error parsing cluster. Offset of the first blob is too large.");
+    }
+
     // read offsets
     m_blobOffsets.clear();
     m_blobOffsets.reserve(n_offset);
diff --git a/src/cluster.h b/src/cluster.h
@@ -73,11 +73,11 @@ namespace zim
 
 
       template<typename OFFSET_TYPE>
-      void read_header();
+      void read_header(size_t maxBlobCount);
       const Reader& getReader(blob_index_t n) const;
 
     public:
-      Cluster(std::unique_ptr<IStreamReader> reader, Compression comp, bool isExtended);
+      Cluster(std::unique_ptr<IStreamReader> reader, Compression comp, bool isExtended, size_t maxBlobCount);
       ~Cluster();
       Compression getCompression() const   { return compression; }
       bool isCompressed() const                { return compression != Compression::None; }
@@ -92,7 +92,7 @@ namespace zim
 
       size_t getMemorySize() const;
 
-      static std::shared_ptr<Cluster> read(const Reader& zimReader, offset_t clusterOffset);
+      static std::shared_ptr<Cluster> read(const Reader& zimReader, offset_t clusterOffset, size_t maxBlobCount);
   };
 
   struct ClusterMemorySize {
diff --git a/src/fileimpl.cpp b/src/fileimpl.cpp
@@ -500,11 +500,17 @@ class Grouping
     return entry_index_t(m_articleListByCluster[idx.v]);
   }
 
+  size_t FileImpl::getMaxBlobCountInCluster(cluster_index_t idx) const
+  {
+    return getCountArticles().v;
+  }
+
   ClusterHandle FileImpl::readCluster(cluster_index_t idx) const
   {
     offset_t clusterOffset(getClusterOffset(idx));
     log_debug("read cluster " << idx << " from offset " << clusterOffset);
-    return Cluster::read(*zimReader, clusterOffset);
+    const auto maxBlobCountInCluster = getMaxBlobCountInCluster(idx);
+    return Cluster::read(*zimReader, clusterOffset, maxBlobCountInCluster);
   }
 
   ClusterHandle FileImpl::getCluster(cluster_index_t idx) const
diff --git a/src/fileimpl.h b/src/fileimpl.h
@@ -192,6 +192,7 @@ namespace zim
       offset_type getMimeListEndUpperLimit() const;
       void readMimeTypes();
       void quickCheckForCorruptFile();
+      size_t getMaxBlobCountInCluster(cluster_index_t idx) const;
 
       bool checkChecksum();
       bool checkDirentPtrs();
diff --git a/test/archive.cpp b/test/archive.cpp
@@ -821,6 +821,11 @@ TEST_F(ZimArchive, validate)
      "Error parsing cluster. Offset of the first blob is not properly aligned.\n"
   )
 
+  TEST_BROKEN_ZIM_NAME(
+    "invalid.too_large_offset_of_first_blob_in_cluster.zim",
+     "Error parsing cluster. Offset of the first blob is too large.\n"
+  )
+
   TEST_BROKEN_ZIM_NAME(
     "invalid.offset_in_cluster.zim",
      "Error parsing cluster. Offsets are not ordered.\n"

Original file line number	Diff line number	Diff line change
`@@ -500,11 +500,17 @@ class Grouping`
`500`	`500`	`return entry_index_t(m_articleListByCluster[idx.v]);`
`501`	`501`	`}`
`502`	`502`
	`503`	`+ size_t FileImpl::getMaxBlobCountInCluster(cluster_index_t idx) const`
	`504`	`+ {`
	`505`	`+ return getCountArticles().v;`
	`506`	`+ }`
	`507`	`+`
`503`	`508`	`ClusterHandle FileImpl::readCluster(cluster_index_t idx) const`
`504`	`509`	`{`
`505`	`510`	`offset_t clusterOffset(getClusterOffset(idx));`
`506`	`511`	`log_debug("read cluster " << idx << " from offset " << clusterOffset);`
`507`		`- return Cluster::read(*zimReader, clusterOffset);`
	`512`	`+ const auto maxBlobCountInCluster = getMaxBlobCountInCluster(idx);`
	`513`	`+ return Cluster::read(*zimReader, clusterOffset, maxBlobCountInCluster);`
`508`	`514`	`}`
`509`	`515`
`510`	`516`	`ClusterHandle FileImpl::getCluster(cluster_index_t idx) const`