Skip to content

Commit 0787eb0

Browse files
authored
Merge pull request #958 from openzim/xapian_preload
Preload Xapian database
2 parents 55e7458 + f971c71 commit 0787eb0

File tree

20 files changed

+651
-260
lines changed

20 files changed

+651
-260
lines changed

docs/usage.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@ which is always derived from std::exception.
1212

1313
All classes are defined in the namespace zim.
1414
Copying is allowed and is made as cheap as possible.
15-
The reading part of the libzim is most of the time thread safe.
16-
Searching and creating part are not. You have to serialize access to the class yourself.
15+
The reading part of libzim (including search) is thread safe most of the time.
16+
The creating part is not. You have to serialize access to the Creator class yourself.
1717

1818
The main class, which accesses an archive, is |Archive|.
1919
It actually holds a reference to an implementation, so that copies of the class just reference the same file.

include/zim/archive.h

Lines changed: 145 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,75 @@ namespace zim
4242
efficientOrder
4343
};
4444

45+
/**
46+
* Configuration to pass to archive constructors.
47+
*
48+
* Configuration options specifying how to open a zim archive.
49+
* For now, it is only related to preloading data but it may change in the future.
50+
*
51+
* Archive may preload some data to speed up future accesses.
52+
* However, this preloading can itself take time.
53+
*
54+
* OpenConfig allows the user to define how Archive should preload data.
55+
*/
56+
struct LIBZIM_API OpenConfig {
57+
/**
58+
* Default configuration.
59+
*
60+
* - Dirent ranges preloading is activated.
61+
* - Xapian preloading is activated.
62+
*/
63+
OpenConfig();
64+
65+
/**
66+
* Configure xapian preloading.
67+
*
68+
* This method modifies the configuration in place and returns a reference to itself.
69+
*/
70+
OpenConfig& preloadXapianDb(bool load) { m_preloadXapianDb = load; return *this; }
71+
72+
/**
73+
* Configure xapian preloading.
74+
*
75+
* This method leaves this configuration untouched and returns a copy holding the new value.
76+
*/
77+
OpenConfig preloadXapianDb(bool load) const {
78+
auto other = *this;
79+
other.m_preloadXapianDb = load;
80+
return other;
81+
}
82+
83+
/**
84+
* Configure direntRanges preloading.
85+
*
86+
* libzim will load `nbRanges + 1` dirents to create `nbRanges` dirent ranges.
87+
* This will be used to speed up dirent lookup. This is an extra layer on top of
88+
* the classic dirent cache.
89+
*
90+
* This method modifies the configuration in place and returns a reference to itself.
91+
*/
92+
OpenConfig& preloadDirentRanges(int nbRanges) { m_preloadDirentRanges = nbRanges; return *this; }
93+
94+
/**
95+
* Configure direntRanges preloading.
96+
*
97+
* libzim will load `nbRanges + 1` dirents to create `nbRanges` dirent ranges.
98+
* This will be used to speed up dirent lookup. This is an extra layer on top of
99+
* the classic dirent cache.
100+
*
101+
* This method leaves this configuration untouched and returns a copy holding the new value.
102+
*/
103+
OpenConfig preloadDirentRanges(int nbRanges) const {
104+
auto other = *this;
105+
other.m_preloadDirentRanges = nbRanges;
106+
return other;
107+
}
108+
109+
bool m_preloadXapianDb; ///< Xapian preloading flag, as set by preloadXapianDb().
110+
int m_preloadDirentRanges; ///< Number of dirent ranges, as set by preloadDirentRanges().
111+
};
112+
113+
45114
/**
46115
* The Archive class to access content in a zim file.
47116
*
@@ -93,6 +162,20 @@ namespace zim
93162
*/
94163
explicit Archive(const std::string& fname);
95164

165+
/** Archive constructor.
166+
*
167+
* Construct an archive from a filename.
168+
* The file is opened read-only.
169+
*
170+
* The filename is the "logical" path.
171+
* So if you want to open a split zim file (foo.zimaa, foo.zimab, ...)
172+
* you must pass the `foo.zim` path.
173+
*
174+
* @param fname The filename to the file to open (utf8 encoded)
175+
* @param openConfig The open configuration to use.
176+
*/
177+
Archive(const std::string& fname, OpenConfig openConfig);
178+
96179
#ifndef _WIN32
97180
/** Archive constructor.
98181
*
@@ -106,6 +189,19 @@ namespace zim
106189
*/
107190
explicit Archive(int fd);
108191

192+
/** Archive constructor.
193+
*
194+
* Construct an archive from a file descriptor.
195+
* Fd is used only at Archive creation.
196+
* Ownership of the fd is not taken and it must be closed by caller.
197+
*
198+
* Note: This function is not available under Windows.
199+
*
200+
* @param fd The descriptor of a seekable file representing a ZIM archive
201+
* @param openConfig The open configuration to use.
202+
*/
203+
Archive(int fd, OpenConfig openConfig);
204+
109205
/** Archive constructor.
110206
*
111207
* Construct an archive from a descriptor of a file with an embedded ZIM
@@ -123,6 +219,24 @@ namespace zim
123219
*/
124220
Archive(int fd, offset_type offset, size_type size);
125221

222+
/** Archive constructor.
223+
*
224+
* Construct an archive from a descriptor of a file with an embedded ZIM
225+
* archive inside.
226+
* Fd is used only at Archive creation.
227+
* Ownership of the fd is not taken and it must be closed by caller.
228+
*
229+
* Note: This function is not available under Windows.
230+
*
231+
* @param fd The descriptor of a seekable file with a continuous segment
232+
* representing a complete ZIM archive.
233+
* @param offset The offset of the ZIM archive relative to the beginning
234+
* of the file (rather than the current position associated with fd).
235+
* @param size The size of the ZIM archive.
236+
* @param openConfig The open configuration to use.
237+
*/
238+
Archive(int fd, offset_type offset, size_type size, OpenConfig openConfig);
239+
126240
/** Archive constructor.
127241
*
128242
* Construct an archive from a descriptor of a file with an embedded ZIM
@@ -137,6 +251,21 @@ namespace zim
137251
*/
138252
explicit Archive(FdInput fd);
139253

254+
/** Archive constructor.
255+
*
256+
* Construct an archive from a descriptor of a file with an embedded ZIM
257+
* archive inside.
258+
* Fd is used only at Archive creation.
259+
* Ownership of the fd is not taken and it must be closed by caller.
260+
*
261+
* Note: This function is not available under Windows.
262+
*
263+
* @param fd A FdInput (tuple) containing the fd (int), offset (offset_type) and size (size_type)
264+
* referencing a continuous segment representing a complete ZIM archive.
265+
* @param openConfig The open configuration to use.
266+
*/
267+
Archive(FdInput fd, OpenConfig openConfig);
268+
140269
/** Archive constructor.
141270
*
142271
* Construct an archive from several file descriptors.
@@ -151,6 +280,22 @@ namespace zim
151280
* referencing a series of segments representing a complete ZIM archive.
152281
*/
153282
explicit Archive(const std::vector<FdInput>& fds);
283+
284+
/** Archive constructor.
285+
*
286+
* Construct an archive from several file descriptors.
287+
* Each part may be embedded in a file.
288+
* Fds are used only at Archive creation.
289+
* Ownership of the fds is not taken and they must be closed by caller.
290+
* Fds (int) can be the same between FdInput if the parts belong to the same file.
291+
*
292+
* Note: This function is not available under Windows.
293+
*
294+
* @param fds A vector of FdInput (tuple) containing the fd (int), offset (offset_type) and size (size_type)
295+
* referencing a series of segments representing a complete ZIM archive.
296+
* @param openConfig The open configuration to use.
297+
*/
298+
Archive(const std::vector<FdInput>& fds, OpenConfig openConfig);
154299
#endif
155300

156301
/** Return the filename of the zim file.
@@ -576,28 +721,6 @@ namespace zim
576721
*/
577722
void setDirentCacheMaxSize(size_t nbDirents);
578723

579-
/** Get the size of the dirent lookup cache.
580-
*
581-
* The returned size returns the default size or the last set size.
582-
* This may not correspond to the actual size of the dirent lookup cache.
583-
* See `set_dirent_lookup_cache_max_size` for more information.
584-
*
585-
* @return The maximum number of sub ranges created in the lookup cache.
586-
*/
587-
size_t getDirentLookupCacheMaxSize() const;
588-
589-
/** Set the size of the dirent lookup cache.
590-
*
591-
* Contrary to other `set_<foo>_cache_max_size`, this method is useless once
592-
* the lookup cache is created.
593-
* The lookup cache is created at first access to a entry in the archive.
594-
* So this method must be called before any access to content (including metadata).
595-
* It is best to call this method first, just after the archive creation.
596-
*
597-
* @param nbRanges The maximum number of sub ranges created in the lookup cache.
598-
*/
599-
void setDirentLookupCacheMaxSize(size_t nbRanges);
600-
601724
#ifdef ZIM_PRIVATE
602725
cluster_index_type getClusterCount() const;
603726
offset_type getClusterOffset(cluster_index_type idx) const;

include/zim/entry.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ namespace zim
3939
class LIBZIM_API Entry
4040
{
4141
public:
42-
explicit Entry(std::shared_ptr<FileImpl> file_, entry_index_type idx_);
42+
explicit Entry(std::shared_ptr<const FileImpl> file_, entry_index_type idx_);
4343

4444
bool isRedirect() const;
4545
std::string getTitle() const;
@@ -84,7 +84,7 @@ namespace zim
8484
entry_index_type getIndex() const { return m_idx; }
8585

8686
protected: // so that Item can be implemented as a wrapper over Entry
87-
std::shared_ptr<FileImpl> m_file;
87+
std::shared_ptr<const FileImpl> m_file;
8888
entry_index_type m_idx;
8989
std::shared_ptr<const Dirent> m_dirent;
9090
};

include/zim/item.h

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,6 @@ namespace zim
3838
*/
3939
class LIBZIM_API Item : private Entry
4040
{
41-
public: // types
42-
typedef std::pair<std::string, offset_type> DirectAccessInfo;
43-
4441
public: // functions
4542
std::string getTitle() const { return Entry::getTitle(); }
4643
std::string getPath() const { return Entry::getPath(); }
@@ -84,7 +81,7 @@ namespace zim
8481
* If it is not possible to have direct access for this item,
8582
* return a pair of `{"", 0}`
8683
*/
87-
DirectAccessInfo getDirectAccessInformation() const;
84+
zim::ItemDataDirectAccessInfo getDirectAccessInformation() const;
8885

8986
entry_index_type getIndex() const { return Entry::getIndex(); }
9087

include/zim/search.h

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626
#include "archive.h"
2727
#include <vector>
2828
#include <string>
29-
#include <map>
3029

3130
namespace Xapian {
3231
class Enquire;
@@ -48,10 +47,8 @@ class SearchResultSet;
4847
* A Searcher is mainly used to create new `Search`
4948
* Internally, this is mainly a wrapper around a Xapian database.
5049
*
51-
* You should consider that all search operations are NOT threadsafe.
52-
* It is up to you to protect your calls to avoid race competition.
53-
* However, Searcher (and subsequent classes) do not maintain a global/share state.
54-
* You can create several Searchers and use them in different threads.
50+
* All search operations (with the exception of SearchIterator) are thread safe.
51+
* You can freely create several Search objects from one Searcher and use them in different threads.
5552
*/
5653
class LIBZIM_API Searcher
5754
{

include/zim/search_iterator.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@
2525
#include <memory>
2626
#include <iterator>
2727
#include "entry.h"
28-
#include "archive.h"
2928
#include "uuid.h"
3029

3130
namespace zim
@@ -35,6 +34,12 @@ class SearchResultSet;
3534
/**
3635
* An iterator on search results (each result is an Entry)
3736
*
37+
* SearchIterators are mostly thread safe:
38+
* - Manipulating the iterator itself (increment it, ...) is not thread safe.
39+
* You should not share an iterator between different threads (and you probably don't have a use case for that)
40+
* - Reading from two iterators (getPath, ...) from two different threads is OK.
41+
* (i.e. you can pass an iterator from one thread to another)
42+
*
3843
* Be aware that the referenced/pointed Entry is generated and stored
3944
* in the iterator itself.
4045
* Once the iterator is destructed or incremented/decremented, you must NOT

include/zim/zim.h

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#define ZIM_ZIM_H
2424

2525
#include <cstdint>
26+
#include <string>
2627

2728
#ifdef __GNUC__
2829
#define DEPRECATED __attribute__((deprecated))
@@ -135,6 +136,43 @@ namespace zim
135136
*/
136137
COUNT
137138
};
139+
140+
/**
141+
* Information needed to directly access an item's data, bypassing the libzim library.
142+
*
143+
* Some items may have their data stored uncompressed in the zim archive.
144+
* In such a case, a user can read the item data directly by (re)opening the file and
145+
* seeking to the right offset.
146+
*/
147+
struct ItemDataDirectAccessInfo {
148+
149+
/**
150+
* The filename to open.
151+
*/
152+
std::string filename;
153+
154+
/**
155+
* The offset to seek to before reading.
156+
*/
157+
offset_type offset;
158+
159+
explicit ItemDataDirectAccessInfo() // Builds an invalid info (empty filename, value-initialized offset). NOTE(review): `explicit` forbids `= {}` / `return {};` copy-list-initialization - confirm this is intended.
160+
: filename(),
161+
offset()
162+
{}
163+
164+
ItemDataDirectAccessInfo(const std::string& filename, offset_type offset) // Builds an info from the given filename and offset.
165+
: filename(filename),
166+
offset(offset)
167+
{}
168+
169+
/**
170+
* Return whether the ItemDataDirectAccessInfo is valid, i.e. whether its filename is not empty.
171+
*/
172+
bool isValid() const {
173+
return !filename.empty();
174+
}
175+
};
138176
}
139177

140178
#endif // ZIM_ZIM_H

0 commit comments

Comments
 (0)