You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
/** The number of (re)tries for loading the RocksDB JNI library. */// Rocksdb loadLib的重试次数privatestaticfinalintROCKSDB_LIB_LOADING_ATTEMPTS = 3;
/** Flag whether the native library has been loaded. */// 标识rocksdb是否已经初始化,保证一个jvm内rocksdb只会初始化依次privatestaticbooleanrocksDbInitialized = false;
// 默认用于传输(下载和上传)文件的线程数(每个有状态操作符)privatestaticfinalintUNDEFINED_NUMBER_OF_TRANSFER_THREADS = -1;
// 默认write batch的大小 -1标识模式privatestaticfinallongUNDEFINED_WRITE_BATCH_SIZE = -1;
// ------------------------------------------------------------------------// -- configuration values, set in the application / configuration/** * Base paths for RocksDB directory, as configured. Null if not yet set, in which case the * configuration values will be used. The configuration defaults to the TaskManager's temp * directories. * RocksDB 目录的基本路径,如配置。 如果尚未设置,则为 Null, * 在这种情况下,将使用配置值。 配置默认为 TaskManager 的临时目录。 */@NullableprivateFile[] localRocksDbDirectories;
/** The pre-configured option settings. */// rocksdb的预先 option配置,包含dboption和columnFamilyOption的配置@NullableprivatePredefinedOptionspredefinedOptions;
/** The options factory to create the RocksDB options in the cluster. */// 集群中的rocksdb的db配置和列族配置,包含compaction、操作db线程数等@NullableprivateRocksDBOptionsFactoryrocksDbOptionsFactory;
/** This determines if incremental checkpointing is enabled. */privatefinalTernaryBooleanenableIncrementalCheckpointing;
/** Thread number used to transfer (download and upload) state, default value: 1. */// rocksdb文件传输线程数privateintnumberOfTransferThreads;
/** The configuration for memory settings (pool sizes, etc.). */// rocksdb内存配置包含manageMemory state、固定内存privatefinalRocksDBMemoryConfigurationmemoryConfiguration;
/** This determines the type of priority queue state. */// 决定timer服务存储使用的实现,是ROCKSDB还是HEAP,HEAP会存在OOM@NullableprivateEmbeddedRocksDBStateBackend.PriorityQueueStateTypepriorityQueueStateType;
/** The default rocksdb metrics options. */privatefinalRocksDBNativeMetricOptionsdefaultMetricOptions;
// -- runtime values, set on TaskManager when initializing / using the backend/** Base paths for RocksDB directory, as initialized. */// 初始化rocksdb的默认pathprivatetransientFile[] initializedDbBasePaths;
/** JobID for uniquifying backup paths. */privatetransientJobIDjobId;
/** The index of the next directory to be used from {@link #initializedDbBasePaths}. */privatetransientintnextDirectory;
/** Whether we already lazily initialized our local storage directories. */privatetransientbooleanisInitialized;
/** * Max consumed memory size for one batch in {@link RocksDBWriteBatchWrapper}, default value * 2mb. */privatelongwriteBatchSize;
privateabstractclassRocksDBMapIterator<T> implementsIterator<T> {
privatestaticfinalintCACHE_SIZE_LIMIT = 128;
/** The db where data resides. */privatefinalRocksDBdb;
/** * The prefix bytes of the key being accessed. All entries under the same key have the same * prefix, hence we can stop iterating once coming across an entry with a different prefix. */@Nonnullprivatefinalbyte[] keyPrefixBytes;
/** * True if all entries have been accessed or the iterator has come across an entry with a * different prefix. */privatebooleanexpired = false;
/** A in-memory cache for the entries in the rocksdb. */privateArrayList<RocksDBMapEntry> cacheEntries = newArrayList<>();
/** * The entry pointing to the current position which is last returned by calling {@link * #nextEntry()}. */privateRocksDBMapEntrycurrentEntry;
privateintcacheIndex = 0;
privatefinalTypeSerializer<UK> keySerializer;
privatefinalTypeSerializer<UV> valueSerializer;
privatefinalDataInputDeserializerdataInputView;
RocksDBMapIterator(
finalRocksDBdb,
finalbyte[] keyPrefixBytes,
finalTypeSerializer<UK> keySerializer,
finalTypeSerializer<UV> valueSerializer,
DataInputDeserializerdataInputView) {
this.db = db;
this.keyPrefixBytes = keyPrefixBytes;
this.keySerializer = keySerializer;
this.valueSerializer = valueSerializer;
this.dataInputView = dataInputView;
}
@OverridepublicbooleanhasNext() {
// 加载cacheloadCache();
return (cacheIndex < cacheEntries.size());
}
@Overridepublicvoidremove() {
if (currentEntry == null || currentEntry.deleted) {
thrownewIllegalStateException(
"The remove operation must be called after a valid next operation.");
}
currentEntry.remove();
}
finalRocksDBMapEntrynextEntry() {
loadCache();
if (cacheIndex == cacheEntries.size()) {
if (!expired) {
thrownewIllegalStateException();
}
returnnull;
}
// 移动指正this.currentEntry = cacheEntries.get(cacheIndex);
cacheIndex++;
returncurrentEntry;
}
privatevoidloadCache() {
if (cacheIndex > cacheEntries.size()) {
thrownewIllegalStateException();
}
// Load cache entries only when the cache is empty and there still exist unread entries// 不满足条件if (cacheIndex < cacheEntries.size() || expired) {
return;
}
// use try-with-resources to ensure RocksIterator can be release even some runtime// exception// occurred in the below code block.// 创建iteatortry (RocksIteratorWrapperiterator =
RocksDBOperationUtils.getRocksIterator(
db, columnFamily, backend.getReadOptions())) {
/* * The iteration starts from the prefix bytes at the first loading. After #nextEntry() is called, * the currentEntry points to the last returned entry, and at that time, we will start * the iterating from currentEntry if reloading cache is needed. */byte[] startBytes =
(currentEntry == null ? keyPrefixBytes : currentEntry.rawKeyBytes);
cacheEntries.clear();
cacheIndex = 0;
// 设置迭代器起点iterator.seek(startBytes);
/* * If the entry pointing to the current position is not removed, it will be the first entry in the * new iterating. Skip it to avoid redundant access in such cases. */if (currentEntry != null && !currentEntry.deleted) {
iterator.next();
}
while (true) {
// 如果迭代器不可用或者key前缀字节长度校验if (!iterator.isValid()
|| !startWithKeyPrefix(keyPrefixBytes, iterator.key())) {
expired = true;
break;
}
if (cacheEntries.size() >= CACHE_SIZE_LIMIT) {
break;
}
RocksDBMapEntryentry =
newRocksDBMapEntry(
db,
keyPrefixBytes.length,
iterator.key(),
iterator.value(),
keySerializer,
valueSerializer,
dataInputView);
// 放入cachecacheEntries.add(entry);
iterator.next();
}
}
}
}
RocksDBValueState
@OverridepublicVvalue() {
try {
byte[] valueBytes =
backend.db.get(columnFamily, serializeCurrentKeyWithGroupAndNamespace());
if (valueBytes == null) {
returngetDefaultValue();
}
dataInputView.setBuffer(valueBytes);
returnvalueSerializer.deserialize(dataInputView);
} catch (IOException | RocksDBExceptione) {
thrownewFlinkRuntimeException("Error while retrieving data from RocksDB.", e);
}
}
@Overridepublicvoidupdate(Vvalue) {
if (value == null) {
clear();
return;
}
try {
backend.db.put(
columnFamily,
writeOptions,
serializeCurrentKeyWithGroupAndNamespace(),
serializeValue(value));
} catch (Exceptione) {
thrownewFlinkRuntimeException("Error while adding data to RocksDB", e);
}
}
RocksDBReducingState
classRocksDBReducingState<K, N, V> extendsAbstractRocksDBAppendingState<K, N, V, V, V>
implementsInternalReducingState<K, N, V> {
/** User-specified reduce function. */privatefinalReduceFunction<V> reduceFunction;
/** * Creates a new {@code RocksDBReducingState}. * * @param columnFamily The RocksDB column family that this state is associated to. * @param namespaceSerializer The serializer for the namespace. * @param valueSerializer The serializer for the state. * @param defaultValue The default value for the state. * @param reduceFunction The reduce function used for reducing state. * @param backend The backend for which this state is bind to. */privateRocksDBReducingState(
ColumnFamilyHandlecolumnFamily,
TypeSerializer<N> namespaceSerializer,
TypeSerializer<V> valueSerializer,
VdefaultValue,
ReduceFunction<V> reduceFunction,
RocksDBKeyedStateBackend<K> backend) {
super(columnFamily, namespaceSerializer, valueSerializer, defaultValue, backend);
this.reduceFunction = reduceFunction;
}
@OverridepublicTypeSerializer<K> getKeySerializer() {
returnbackend.getKeySerializer();
}
@OverridepublicTypeSerializer<N> getNamespaceSerializer() {
returnnamespaceSerializer;
}
@OverridepublicTypeSerializer<V> getValueSerializer() {
returnvalueSerializer;
}
@OverridepublicVget() {
returngetInternal();
}
@Overridepublicvoidadd(Vvalue) throwsException {
byte[] key = getKeyBytes();
VoldValue = getInternal(key);
VnewValue = oldValue == null ? value : reduceFunction.reduce(oldValue, value);
updateInternal(key, newValue);
}
@OverridepublicvoidmergeNamespaces(Ntarget, Collection<N> sources) {
if (sources == null || sources.isEmpty()) {
return;
}
try {
Vcurrent = null;
// merge the sources to the targetfor (Nsource : sources) {
if (source != null) {
setCurrentNamespace(source);
finalbyte[] sourceKey = serializeCurrentKeyWithGroupAndNamespace();
finalbyte[] valueBytes = backend.db.get(columnFamily, sourceKey);
if (valueBytes != null) {
backend.db.delete(columnFamily, writeOptions, sourceKey);
dataInputView.setBuffer(valueBytes);
Vvalue = valueSerializer.deserialize(dataInputView);
if (current != null) {
current = reduceFunction.reduce(current, value);
} else {
current = value;
}
}
}
}
// if something came out of merging the sources, merge it or write it to the targetif (current != null) {
// create the target full-binary-keysetCurrentNamespace(target);
finalbyte[] targetKey = serializeCurrentKeyWithGroupAndNamespace();
finalbyte[] targetValueBytes = backend.db.get(columnFamily, targetKey);
if (targetValueBytes != null) {
dataInputView.setBuffer(targetValueBytes);
// target also had a value, mergeVvalue = valueSerializer.deserialize(dataInputView);
current = reduceFunction.reduce(current, value);
}
// serialize the resulting valuedataOutputView.clear();
valueSerializer.serialize(current, dataOutputView);
// write the resulting valuebackend.db.put(
columnFamily, writeOptions, targetKey, dataOutputView.getCopyOfBuffer());
}
} catch (Exceptione) {
thrownewFlinkRuntimeException("Error while merging state in RocksDB", e);
}
}
@SuppressWarnings("unchecked")
static <K, N, SV, SextendsState, ISextendsS> IScreate(
StateDescriptor<S, SV> stateDesc,
Tuple2<ColumnFamilyHandle, RegisteredKeyValueStateBackendMetaInfo<N, SV>>
registerResult,
RocksDBKeyedStateBackend<K> backend) {
return (IS)
newRocksDBReducingState<>(
registerResult.f0,
registerResult.f1.getNamespaceSerializer(),
registerResult.f1.getStateSerializer(),
stateDesc.getDefaultValue(),
((ReducingStateDescriptor<SV>) stateDesc).getReduceFunction(),
backend);
}
}
RocksDB工具类
OpenDB
/** * 指定对应列族**/publicstaticRocksDBopenDB(
Stringpath,
List<ColumnFamilyDescriptor> stateColumnFamilyDescriptors,
List<ColumnFamilyHandle> stateColumnFamilyHandles,
ColumnFamilyOptionscolumnFamilyOptions,
DBOptionsdbOptions)
throwsIOException {
List<ColumnFamilyDescriptor> columnFamilyDescriptors =
newArrayList<>(1 + stateColumnFamilyDescriptors.size());
// we add the required descriptor for the default CF in FIRST position, see// https://github.com/facebook/rocksdb/wiki/RocksJava-Basics#opening-a-database-with-column-families// 创建默认列族columnFamilyDescriptors.add(
newColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, columnFamilyOptions));
// 加入用户提供的状态列族columnFamilyDescriptors.addAll(stateColumnFamilyDescriptors);
RocksDBdbRef;
try {
dbRef =
RocksDB.open(
Preconditions.checkNotNull(dbOptions),
Preconditions.checkNotNull(path),
columnFamilyDescriptors,
stateColumnFamilyHandles);
} catch (RocksDBExceptione) {
IOUtils.closeQuietly(columnFamilyOptions);
columnFamilyDescriptors.forEach((cfd) -> IOUtils.closeQuietly(cfd.getOptions()));
// improve error reporting on WindowsthrowExceptionIfPathLengthExceededOnWindows(path, e);
thrownewIOException("Error while opening RocksDB instance.", e);
}
// requested + default CFPreconditions.checkState(
1 + stateColumnFamilyDescriptors.size() == stateColumnFamilyHandles.size(),
"Not all requested column family handles have been created");
returndbRef;
}