diff --git a/.github/bin/sync_docs.sh b/.github/bin/sync_docs.sh new file mode 100755 index 0000000..0e59a2f --- /dev/null +++ b/.github/bin/sync_docs.sh @@ -0,0 +1,107 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +set -o pipefail +set -e + +function exit_with_usage { + echo "Usage: ./github/bin/sync_docs.sh " + echo "" + echo "This script synchronizes documentation from the apache/amoro.git repository." + echo "" + echo "ref_name can be:" + echo " - 'master' for the master branch" + echo " - A version number like '0.8.1' which will be converted to tag v0.8.1 or v0.8.1-incubating" + echo " (Script will try both v0.8.1-incubating and v0.8.1 tags automatically)" + echo " - A branch name like 'feature-branch'" + echo "" + echo "target_path is the path where the docs will be copied to, typically 'amoro-docs/content'" + exit 1 +} + +if [ $# -ne 2 ]; then + exit_with_usage +fi + +REF_NAME="$1" +TARGET_PATH="$2" + +# Create temp directory for downloaded content +TEMP_DIR=$(mktemp -d) +trap 'rm -rf ${TEMP_DIR}' EXIT + +# Check if REF_NAME is a version number (e.g., 0.8.1) +if [[ "${REF_NAME}" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then + # First try with v{version} tag + DOWNLOAD_REF="tags/v${REF_NAME}.tar.gz" + DIR_NAME="amoro-${REF_NAME}" + + echo "Trying to download ${DOWNLOAD_REF} from Apache Amoro repository..." + if ! wget -q "https://github.com/apache/amoro/archive/refs/${DOWNLOAD_REF}" -O "${TEMP_DIR}/download.tar.gz" 2>/dev/null; then + echo "v${REF_NAME} tag not found, trying v${REF_NAME}-incubating tag..." + # Try with v{version}-incubating tag + DOWNLOAD_REF="tags/v${REF_NAME}-incubating.tar.gz" + DIR_NAME="amoro-${REF_NAME}-incubating" + + if ! wget -q "https://github.com/apache/amoro/archive/refs/${DOWNLOAD_REF}" -O "${TEMP_DIR}/download.tar.gz" 2>/dev/null; then + echo "v${REF_NAME}-incubating tag not found, falling back to branch..." + # Fall back to branch + DOWNLOAD_REF="heads/${REF_NAME}.tar.gz" + DIR_NAME="amoro-${REF_NAME}" + wget -q "https://github.com/apache/amoro/archive/refs/${DOWNLOAD_REF}" -O "${TEMP_DIR}/download.tar.gz" || { + echo "Failed to download docs for ${REF_NAME}" + exit 1 + } + fi + fi +elif [ "${REF_NAME}" == "master" ]; then + DOWNLOAD_REF="heads/master.tar.gz" + DIR_NAME="amoro-master" + + echo "Downloading ${DOWNLOAD_REF} from Apache Amoro repository..." + wget -q "https://github.com/apache/amoro/archive/refs/${DOWNLOAD_REF}" -O "${TEMP_DIR}/download.tar.gz" || { + echo "Failed to download docs for master branch" + exit 1 + } +else + # For other branches, use branch name + DOWNLOAD_REF="heads/${REF_NAME}.tar.gz" + DIR_NAME="amoro-${REF_NAME}" + + echo "Downloading ${DOWNLOAD_REF} from Apache Amoro repository..." 
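+  # If the requested branch does not exist in apache/amoro, the download below fails and the script exits with an error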
+ wget -q "https://github.com/apache/amoro/archive/refs/${DOWNLOAD_REF}" -O "${TEMP_DIR}/download.tar.gz" || { + echo "Failed to download docs for branch ${REF_NAME}" + exit 1 + } +fi + +echo "Extracting archive..." +tar -xzf "${TEMP_DIR}/download.tar.gz" -C "${TEMP_DIR}" + +echo "Syncing docs to ${TARGET_PATH}..." +# Make sure the target directory exists +mkdir -p "${TARGET_PATH}" +# Clean the target directory +rm -rf "${TARGET_PATH:?}"/* +# Copy the docs to the target directory +cp -r "${TEMP_DIR}/${DIR_NAME}/docs/"* "${TARGET_PATH}/" + +echo "Cleanup..." +rm -rf "${TEMP_DIR}" + +echo "Sync completed successfully!" diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index d2e77f9..57f710b 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -34,6 +34,9 @@ jobs: steps: - uses: actions/checkout@v2 + - name: Sync latest docs from Amoro repository + run: .github/bin/sync_docs.sh master amoro-docs/content + - name: Setup Hugo uses: peaceiris/actions-hugo@v2 with: @@ -58,10 +61,13 @@ jobs: continue-on-error: true steps: - uses: actions/checkout@v2 - + - name: Set output id: vars - run: echo ::set-output name=branch_name::${GITHUB_REF#refs/*/} + run: echo "branch_name=$(echo ${GITHUB_REF#refs/*/})" >> $GITHUB_OUTPUT + + - name: Sync docs from Amoro repository for branch + run: .github/bin/sync_docs.sh ${{ steps.vars.outputs.branch_name }} amoro-docs/content - name: Setup Hugo uses: peaceiris/actions-hugo@v2 @@ -79,3 +85,4 @@ jobs: publish_dir: ./output/docs/${{ steps.vars.outputs.branch_name }} publish_branch: asf-site destination_dir: ./output/docs/${{ steps.vars.outputs.branch_name }} + keep_files: true diff --git a/amoro-docs/content/README.md b/amoro-docs/content/README.md deleted file mode 100644 index 7c40992..0000000 --- a/amoro-docs/content/README.md +++ /dev/null @@ -1,9 +0,0 @@ -## Amoro Docs - -This directory contains the documentation content of Amoro. - -* The documentation is written in Markdown format. -* The images referenced in the documentation are saved in the `images` folder. - -### Style -* Proper nouns should start with a capital letter, like Hadoop、Hive、Iceberg、Amoro \ No newline at end of file diff --git a/amoro-docs/content/_index.md b/amoro-docs/content/_index.md deleted file mode 100644 index b219e33..0000000 --- a/amoro-docs/content/_index.md +++ /dev/null @@ -1,92 +0,0 @@ ---- -title: "Introduction" -menu: main -weight: 0 ---- - -# Introduction - -Amoro is a Lakehouse management system built on open data lake formats. Working with compute engines including Flink, Spark, and Trino, Amoro brings pluggable and self-managed features for Lakehouse to provide out-of-the-box data warehouse experience, and helps data platforms or products easily build infra-decoupled, stream-and-batch-fused and lake-native architecture. - -## Architecture - -The architecture of Amoro is as follows: -

-[Architecture diagram]

- -The core components of Amoro include: - -* AMS: Amoro Management Service provides Lakehouse management features, like self-optimizing, data expiration, etc. - It also provides a unified catalog service for all compute engines, which can also be combined with existing metadata services. -* Plugins: Amoro provides a wide selection of external plugins to meet different scenarios. - * Optimizers: The self-optimizing execution engine plugin asynchronously performs merging, sorting, deduplication, - layout optimization, and other operations on all type table format tables. - * Terminal: SQL command-line tools, provide various implementations like local Spark and Kyuubi. - * LogStore: Provide millisecond to second level SLAs for real-time data processing based on message queues like Kafka and Pulsar. - -## Supported table formats - -Amoro can manage tables of different table formats, similar to how MySQL/ClickHouse can choose different storage engines. -Amoro meets diverse user needs by using different table formats. Currently, Amoro supports three table formats: - -* Iceberg format: means using the native table format of the Apache Iceberg, which has all the features and characteristics of Iceberg. -* Mixed-Iceberg format: built on top of Iceberg format, which can accelerate data processing using LogStore - and provides more efficient query performance and streaming read capability in CDC scenarios. -* Mixed-Hive format: has the same features as the Mixed-Iceberg tables but is compatible with a Hive table. - Support upgrading Hive tables to Mixed-Hive tables, and allow Hive's native read and write methods after upgrading. - -## Supported engines - -### Iceberg format - -Iceberg format tables use the engine integration method provided by the Iceberg community. -For details, please refer to: [Iceberg Docs](https://iceberg.apache.org/docs/latest/). - -### Paimon format - -Paimon format tables use the engine integration method provided by the Paimon community. -For details, please refer to: [Paimon Docs](https://paimon.apache.org/docs/master/). - -### Mixed format - -Amoro support multiple processing engines for Mixed format as below: - -| Processing Engine | Version | Batch Read | Batch Write | Batch Overwrite | Streaming Read | Streaming Write | Create Table | Alter Table | -|-------------------|---------------------------|-------------|-------------|-----------------|----------------|-----------------|--------------|-------------| -| Flink | 1.15.x, 1.16.x and 1.17.x | ✔ | ✔ | ✖ | ✔ | ✔ | ✔ | ✖ | -| Spark | 3.2, 3.3, 3.5 | ✔ | ✔ | ✔ | ✖ | ✖ | ✔ | ✔ | -| Hive | 2.x, 3.x | ✔ | ✖ | ✔ | ✖ | ✖ | ✖ | ✔ | -| Trino | 406 | ✔ | ✖ | ✔ | ✖ | ✖ | ✖ | ✔ | - - -## User cases - -### Self-managed streaming Lakehouse - -Amoro makes it easier for users to handle the challenges of writing to a real-time data lake, such as ingesting append-only event logs or CDC data from databases. -In these scenarios, the rapid increase of fragment and redundant files cannot be ignored. -To address this issue, Amoro provides a pluggable streaming data self-optimizing mechanism that automatically compacts fragment files and removes expired data, ensuring high-quality table queries while reducing system costs. - -### Stream-and-batch-fused data pipeline - -Whether in the AI or BI business field , the requirement for real-time analysis is becoming increasingly high. The traditional approach of using one streaming job to complete all data processing from the source to the end is no longer applicable. 
There is an increasing demand for layered construction of streaming data pipeline, and the traditional layered construction approach based on message queues can cause a inconsistency problem between the streaming and batch data processing. Building a unified stream-and-batch-fused pipeline based on new data lake formats is the future direction for solving these problems. Amoro fully leverages the characteristics of the new data lake table formats about unified streaming and batch processing, not only ensuring the quality of data in the streaming pileline but also enhancing critical features such as incremental reading for CDC data and streaming dimension table association, helping users to build a stream-and-batch-fused data pipeline. - -### Cloud-native Lakehouse - -Currently, most data platforms and products are tightly coupled with their underlying infrastructure(such as the storage layer). The migration of infrastructure, such as switching to cloud-native OSS, may require extensive adaptation efforts or even be impossible. However, Amoro provides an infra-decoupled, lake-native architecture built on top of the infrastructure. This allows products based on Amoro to interact with the underlying infrastructure through a unified interface (Amoro Catalog service), protecting upper-layer products from the impact of infrastructure switch. \ No newline at end of file diff --git a/amoro-docs/content/admin-guides/deployment-on-kubernetes.md b/amoro-docs/content/admin-guides/deployment-on-kubernetes.md deleted file mode 100644 index 97f7951..0000000 --- a/amoro-docs/content/admin-guides/deployment-on-kubernetes.md +++ /dev/null @@ -1,283 +0,0 @@ ---- -title: "Deployment On Kubernetes" -url: deployment-on-kubernetes -aliases: - - "admin-guides/deployment-on-kubernetes" -menu: - main: - parent: Admin Guides - weight: 150 ---- - -# Deploy AMS On Kubernetes - -## Requirements - -If you want to deploy AMS on Kubernetes, you’d better get a sense of the following things. - -- Use AMS official docker image or build AMS docker image -- [An active Kubernetes cluster](https://kubernetes.io/docs/setup/) -- [Kubectl](https://kubernetes.io/docs/tasks/tools/#kubectl) -- [Helm3+](https://helm.sh/docs/intro/quickstart/) - -## Amoro Official Docker Image - -You can find the official docker image at [Amoro Docker Hub](https://hub.docker.com/u/apache). - -The following are images that can be used in a production environment. - -**apache/amoro** - -This is an image built based on the Amoro binary distribution package for deploying AMS. - -**apache/amoro-flink-optimizer** - -This is an image built based on the official version of Flink for deploying the Flink optimizer. - -**apache/amoro-spark-optimizer** - -This is an image built based on the official version of Spark for deploying the Spark optimizer. - -## Build AMS Docker Image - -If you want to build images locally, you can find the `build.sh` script in the docker folder of the project and pass the following command: - -```shell -./docker/build.sh amoro -``` - -or build the `amoro-flink-optimizer` image by: - -```shell -./docker/build.sh amoro-flink-optimizer --flink-version -``` - -or build the `amoro-spark-optimizer` image by: - -```shell -./docker/build.sh amoro-spark-optimizer --spark-version -``` - -## Get Helm Charts -You can find the latest charts directly from the Github source code. 
- -```shell -$ git clone https://github.com/apache/amoro.git -$ cd amoro/charts -$ helm dependency build ./amoro -``` - -## Install - -When you are ready, you can use the following helm command to start - -```shell -$ helm install ./amoro -``` - -After successful installation, you can access WebUI through the following command. - -```shell -$ kubectl port-forward services/-amoro-rest 1630:1630 -``` - -Open browser to go web: http://localhost:1630 - -## Access logs - -Then, use pod name to get logs: - -```shell -$ kubectl get pod -$ kubectl logs {amoro-pod-name} -``` - -## Uninstall - -```shell -$ helm uninstall -``` - - -## Configuring Helm application. - -Helm uses `/values.yaml` files as configuration files, and you can also copy this file for separate maintenance. - -```shell -$ cp amoro/values.yaml my-values.yaml -$ vim my-values.yaml -``` - -And deploy Helm applications using independent configuration files. - -```shell -$ helm install ./amoro -f my-values.yaml -``` - - -### Enable Ingress - -Ingress is not enabled by default. In production environments, it is recommended to enable Ingress to access the AMS Dashboard from outside the cluster. - -```yaml -ingress: - enabled: true - ingressClassName: "nginx" - hostname: minikube.amoro.com -``` - -### Configure the database. - -AMS uses embedded [Apache Derby](https://db.apache.org/derby/) as its backend storage by default. -In production environments, we recommend using a RDBMS(Relational Database Management System) with higher availability guarantees as the storage for system data, you can ref to [Database Configuration](/deployment/#configure-system-database) for more detail. - -```yaml -amoroConf: - database: - type: ${your_database_type} - driver: ${your_database_driver} - url: ${your_jdbc_url} - username: ${your_username} - password: ${your_password} -``` - - -### Configure the Images - -Helm charts deploy images by default using the latest tag. -If you need to modify the image address, such as using a private repository or building your own image - - -```yaml -image: - repository: - pullPolicy: IfNotPresent - tag: -imagePullSecrets: [ ] -``` - -### Configure the Flink Optimizer Container - -By default, the Flink Optimizer Container is enabled. -You can modify the container configuration by changing the `optimizer.flink` section. - -```yaml -optimizer: - flink: - enabled: true - ## container name, default is flink - name: ~ - image: - ## the image repository - repository: apache/amoro-flink-optimizer - ## the image tag, if not set, the default value is the same with amoro image tag. - tag: ~ - pullPolicy: IfNotPresent - ## the location of flink optimizer jar in image. - jobUri: "local:///opt/flink/usrlib/optimizer-job.jar" - properties: { - "flink-conf.taskmanager.memory.managed.size": "32mb", - "flink-conf.taskmanager.memory.network.max": "32mb", - "flink-conf.taskmanager.memory.network.min": "32mb" - } -``` -### Configure the Kubernetes Optimizer Container - -By default, the Kubernetes Optimizer Container is enabled. -You can modify the container configuration by changing the `optimizer.Kubernetes` section. - -```yaml -optimizer: - kubernetes: - # enable the kubernetes optimizer container - enabled: true - properties: - namespace: "default" - kube-config-path: "~/.kube/config" - image: "apache/amoro:latest" - pullPolicy: "IfNotPresent" - # configure additional parameters by using the extra. prefix - # extra.jvm.heap.ratio: "0.8" -``` - -To use PodTemplate, you need to copy and paste the following into the `kubernetes.properties`. 
- -This is the default podTemplate, and when the user doesn't specify any additional parameters, the default is to use the template's parameters - -Therefore, there will be a priority issue that needs to be elaborated: _Resource(WebUi) > Independent User Profile Configuration > PodTemplate_ - -```yaml -podTemplate: | - apiVersion: apps/v1 - kind: PodTemplate - metadata: - name: - template: - metadata: - labels: - app: - AmoroOptimizerGroup: - AmoroResourceId: - spec: - containers: - - name: optimizer - image: apache/amoro:0.6 - imagePullPolicy: IfNotPresent - command: [ "sh", "-c", "echo 'Hello, World!'" ] - resources: - limits: - memory: 2048Mi - cpu: 2 - requests: - memory: 2048Mi - cpu: 2 - -``` - - -### Configure the RBAC - -By default, Helm Chart creates a service account, role, and role bind for Amoro deploy. -You can also modify this configuration to use an existing account. - -```yaml -# ServiceAccount of Amoro to schedule optimizer. -serviceAccount: - # Specifies whether a service account should be created or using an existed account - create: true - # Annotations to add to the service account - annotations: { } - # Specifies ServiceAccount name to be used if `create: false` - name: ~ - # if `serviceAccount.create` == true. role and role-bind will be created - rbac: - # create a role/role-bind if automatically create service account - create: true - # create a cluster-role and cluster-role-bind if cluster := true - cluster: false -``` - -Notes: - -- If `serviceAccount.create` is false, you must provide a `serviceAccount.name` and create the `serviceAccount` beforehand. -- If `serviceAccount.rbac.create` is false, the role and role-bind will not be created automatically. -- You can set `serviceAccount.rbac.cluster` to true, which will create a `cluster-role` and `cluster-role-bind` instead of a `role` and `role-bind`. - -By default, the `serviceAccount` will be used to create the flink-optimizer. -Therefore, if you need to schedule the flink-optimizer across namespaces, -please create a `cluster-role` or use your own created `serviceAccount`. diff --git a/amoro-docs/content/admin-guides/deployment.md b/amoro-docs/content/admin-guides/deployment.md deleted file mode 100644 index 779e5d2..0000000 --- a/amoro-docs/content/admin-guides/deployment.md +++ /dev/null @@ -1,337 +0,0 @@ ---- -title: "Deployment" -url: deployment -aliases: - - "admin-guides/deployment" -menu: - main: - parent: Admin Guides - weight: 100 ---- - -# Deployment - -You can choose to download the stable release package from [download page](../../../download/), or the source code form [Github](https://github.com/apache/amoro) and compile it according to the README. - -## System requirements - -- Java 8 is required. -- Optional: A RDBMS (PostgreSQL 14.x or higher, MySQL 5.5 or higher) -- Optional: ZooKeeper 3.4.x or higher - -## Download the distribution - -All released package can be downloaded from [download page](../../../download/). -You can download apache-amoro-x.y.z-bin.tar.gz (x.y.z is the release number), and you can also download the runtime packages for each engine version according to the engine you are using. -Unzip it to create the amoro-x.y.z directory in the same directory, and then go to the amoro-x.y.z directory. - -## Source code compilation - -You can build based on the master branch without compiling Trino. 
The compilation method and the directory of results are described below: - -```shell -$ git clone https://github.com/apache/amoro.git -$ cd amoro -$ base_dir=$(pwd) -$ mvn clean package -DskipTests -$ cd dist/target/ -$ ls -amoro-x.y.z-bin.zip # AMS release package - -$ cd ${base_dir}/amoro-format-mixed/amoro-format-mixed-flink/v1.15/amoro-format-mixed-flink-runtime-1.15/target -$ ls -amoro-format-mixed-flink-runtime-1.15-x.y.z.jar # Flink 1.15 runtime package - -$ cd ${base_dir}/amoro-format-mixed/amoro-format-mixed-spark/v3.2/amoro-format-mixed-spark-runtime-3.2/target -$ ls -amoro-format-mixed-spark-runtime-3.2-x.y.z.jar # Spark v3.2 runtime package) -``` - -More build guide can be found in the project's [README](https://github.com/apache/amoro?tab=readme-ov-file#building). - -## Configuration - -If you want to use AMS in a production environment, it is recommended to modify `{AMORO_HOME}/conf/config.yaml` by referring to the following configuration steps. - -### Configure the service address - -- The `ams.server-bind-host` configuration specifies the host to which AMS is bound. The default value, `0.0.0.0,` indicates binding to all network interfaces. -- The `ams.server-expose-host` configuration specifies the host exposed by AMS that the compute engines and optimizers used to connect to AMS. You can configure a specific IP address on the machine, or an IP prefix. When AMS starts up, it will find the first host that matches this prefix. -- The `ams.thrift-server.table-service.bind-port` configuration specifies the binding port of the Thrift Server that provides the table service. The compute engines access AMS through this port, and the default value is 1260. -- The `ams.thrift-server.optimizing-service.bind-port` configuration specifies the binding port of the Thrift Server that provides the optimizing service. The optimizers access AMS through this port, and the default value is 1261. -- The `ams.http-server.bind-port` configuration specifies the port to which the HTTP service is bound. The Dashboard and Open API are bound to this port, and the default value is 1630. -- The `ams.http-server.rest-auth-type` configuration specifies the REST API auth type, which could be token(default) or basic. The basic auth would reuse `ams.admin-username` and `ams.admin-password` for authentication. - -```yaml -ams: - server-bind-host: "0.0.0.0" #The IP address for service listening, default is 0.0.0.0. - server-expose-host: "127.0.0.1" #The IP address for service external exposure, default is 127.0.0.1. - - thrift-server: - table-service: - bind-port: 1260 #The port for accessing AMS table service. - optimizing-service: - bind-port: 1261 #The port for accessing AMS optimizing service. - - http-server: - session-timeout: 7d #Re-login after 7days - bind-port: 1630 #The port for accessing AMS Dashboard. -``` - -{{< hint info >}} -Make sure the port is not used before configuring it. -{{< /hint >}} - -### Configure system database - -AMS uses embedded [Apache Derby](https://db.apache.org/derby/) as the backend storage by default, so you can use `Derby` directly without any additional configuration. - -You can also configure a relational backend storage as you needed. - -> If you would like to use MySQL as the system database, you need to manually download the [MySQL JDBC Connector](https://repo1.maven.org/maven2/com/mysql/mysql-connector-j/8.1.0/mysql-connector-j-8.1.0.jar) -and move it into the `${AMORO_HOME}/lib/` directory. 
- -You need to create an empty database in the RDBMS before to start the server, then AMS will automatically create tables in the database when it first started. - -One thing you need to do is adding configuration under `config.yaml` of Ams: - -```yaml -ams: - database: - type: ${database_type} # postgres or mysql - jdbc-driver-class: ${your_driver_name} - url: ${your_jdbc_url} - username: ${your_username} - password: ${your_password} - auto-create-tables: true -``` - -### Configure high availability - -To improve stability, AMS supports a one-master-multi-backup HA mode. Zookeeper is used to implement leader election, -and the AMS cluster name and Zookeeper address are specified. The AMS cluster name is used to bind different AMS clusters -on the same Zookeeper cluster to avoid mutual interference. - -```yaml -ams: - ha: - enabled: true #Enable HA - cluster-name: default # Differentiating binding multiple sets of AMS on the same ZooKeeper. - zookeeper-address: 127.0.0.1:2181,127.0.0.1:2182,127.0.0.1:2183 # ZooKeeper server address. -``` - -### Configure optimizer containers - -To scale out the optimizer through AMS, container configuration is required. -If you choose to manually start an external optimizer, no additional container configuration is required. AMS will initialize a container named `external` by default to store all externally started optimizers. -AMS provides implementations of `LocalContainer` and `FlinkContainer` by default. Configuration for both container types can be found below: - -```yaml -containers: - - name: localContainer - container-impl: org.apache.amoro.server.manager.LocalOptimizerContainer - properties: - export.JAVA_HOME: "/opt/java" # JDK environment - - - name: flinkContainer - container-impl: org.apache.amoro.server.manager.FlinkOptimizerContainer - properties: - flink-home: "/opt/flink/" # The installation directory of Flink - export.JVM_ARGS: "-Djava.security.krb5.conf=/opt/krb5.conf" # Submitting Flink jobs with Java parameters, such as Kerberos parameters. - export.HADOOP_CONF_DIR: "/etc/hadoop/conf/" # Hadoop configuration file directory - export.HADOOP_USER_NAME: "hadoop" # Hadoop user - export.FLINK_CONF_DIR: "/etc/hadoop/conf/" # Flink configuration file directory - - - name: sparkContainer - container-impl: org.apache.amoro.server.manager.SparkOptimizerContainer - properties: - spark-home: /opt/spark/ # Spark install home - master: yarn # The cluster manager to connect to. See the list of https://spark.apache.org/docs/latest/submitting-applications.html#master-urls. - deploy-mode: cluster # Spark deploy mode, client or cluster - export.JVM_ARGS: -Djava.security.krb5.conf=/opt/krb5.conf # Spark launch jvm args, like kerberos config when ues kerberos - export.HADOOP_CONF_DIR: /etc/hadoop/conf/ # Hadoop config dir - export.HADOOP_USER_NAME: hadoop # Hadoop user submit on yarn - export.SPARK_CONF_DIR: /opt/spark/conf/ # Spark config dir -``` - -More optimizer container configurations can be found in [managing optimizers](../managing-optimizers/). - -### Configure terminal - -The Terminal module in the AMS Dashboard allows users to execute SQL directly on the platform. Currently, the Terminal backend supports two implementations: `local` and `kyuubi`. -In local mode, an embedded Spark environment will be started in AMS. In kyuubi mode, an additional kyuubi service needs to be deployed. -The configuration for kyuubi mode can refer to: [Using Kyuubi with Terminal](../using-kyuubi/). 
Below is the configuration for the local mode: - -```yaml -ams: - terminal: - backend: local - local.spark.sql.iceberg.handle-timestamp-without-timezone: false - # When the catalog type is Hive, it automatically uses the Spark session catalog to access Hive tables. - local.using-session-catalog-for-hive: true -``` - -More properties the terminal supports including: - -| Key | Default | Description | -|--------------------------|---------|---------------------------------------------------------------------------------------------------| -| terminal.backend | local | Terminal backend implementation. local, kyuubi and custom are valid values. | -| terminal.factory | - | Session factory implement of terminal, `terminal.backend` must be `custom` if this is set. | -| terminal.result.limit | 1000 | Row limit of result-set | -| terminal.stop-on-error | false | When a statement fails to execute, stop execution or continue executing the remaining statements. | -| terminal.session.timeout | 30 | Session timeout in minutes. | - -### Configure metric reporter - -Amoro provides metric reporters by plugin mechanism to connect to external metric systems. - -All metric-reporter plugins are configured in `$AMORO_CONF_DIR/plugins/metric-repoters.yaml` . - -The configuration format of the plug-in is: - -```yaml - -metric-reporters: - - name: # the unified plugin name. - enabled: # if this plugin is enabled, default is true. - properties: # a map defines properties of plugin. -``` - -Currently, there is only one reporter is available. - -#### Prometheus Exporter - -By enable the `prometheus-exporter` plugin, the AMS will start a prometheus http exporter server. - -```yaml -metric-reporters: - - name: prometheus-exporter # configs for prometheus exporter - enabled: true - properties: - port: 9090 # the port that the prometheus-exporter listens on. -``` - -You can add a scrape job in your prometheus configs - -```yaml -# Your prometheus configs file. -scrape_configs: - - job_name: 'amoro-exporter' - scrape_interval: 15s - static_configs: - - targets: ['localhost:9090'] # The host and port that you configured in Amoro plugins configs file. -``` - -### Configure encrypted configuration items -For enhanced security, AMS supports encrypted values for sensitive configuration items such as passwords within `config.yaml`. This prevents plaintext passwords and other critical information from being directly exposed in the configuration file. -Currently, AMS provides built-in support for base64 decryption, and users can also implement custom decryption algorithms if needed (see [Using Customized Encryption Method for Configurations](../using-customized-encryption-method/)). - -To enable encrypted sensitive configuration items, add the following configurations under `config.yaml` of AMS: -- The `ams.shade.identifier` configuration specifies the encryption method used for the sensitive values. The default value is `default`, which means no encryption is applied. To enable encrypted values, set it to `base64` or another supported encryption method. -- The `ams.shade.sensitive-keywords` configuration specifies which configuration items under `ams` are encrypted. The default value is `admin-password;database.password`, and multiple keywords should be separated by semicolons (`;`). The values of these items must be replaced with their encrypted counterparts. 
- -Example Configuration (Partial): -```yaml -ams: - admin-username: admin - admin-password: YWRtaW4= # Ciphertext for "admin" - server-bind-host: "0.0.0.0" - server-expose-host: "127.0.0.1" - - shade: - identifier: base64 - sensitive-keywords: admin-password;database.password - - database: - type: mysql - jdbc-driver-class: com.mysql.cj.jdbc.Driver - url: jdbc:mysql://127.0.0.1:3306/amoro?useUnicode=true&characterEncoding=UTF8&autoReconnect=true&useAffectedRows=true&allowPublicKeyRetrieval=true&useSSL=false - username: root - password: cGFzc3dvcmQ= # Ciphertext for "password" -``` - - -### Environments variables - -The following environment variables take effect during the startup process of AMS, -you can set up those environments to overwrite the default value. - -| Environments variable name | Default value | Description | -|----------------------------|--------------------|--------------------------------------------| -| AMORO_CONF_DIR | ${AMORO_HOME}/conf | location where Amoro loading config files. | -| AMORO_LOG_DIR | ${AMORO_HOME}/logs | location where the logs files output | - -Note: `$AMORO_HOME` can't be overwritten from environment variable. It always points to the parent dir of `./bin`. - -### Configure AMS JVM - -The following JVM options could be set in `${AMORO_CONF_DIR}/jvm.properties`. - -| Property Name | Related Jvm option | Description | -|-----------------|------------------------------------------------|--------------------------| -| xms | "-Xms${value}m | Xms config for jvm | -| xmx | "-Xmx${value}m | Xmx config for jvm | -| jmx.remote.port | "-Dcom.sun.management.jmxremote.port=${value} | Enable remote debug | -| extra.options | "JAVA_OPTS="${JAVA_OPTS} ${JVM_EXTRA_CONFIG}" | The addition jvm options | - -## Start AMS - -Enter the directory amoro-x.y.z and execute bin/ams.sh start to start AMS. - -```shell -$ cd amoro-x.y.z -$ bin/ams.sh start -``` - -Then, access http://localhost:1630 through a browser to see the login page. If it appears, it means that the startup is -successful. The default username and password for login are both "admin". - -You can also restart/stop AMS with the following command: - -```shell -$ bin/ams.sh restart -$ bin/ams.sh stop -``` - -## Upgrade AMS - -### Upgrade system databases - -You can find all the upgrade SQL scripts under `${AMORO_HOME}/conf/${db_type}/` with name pattern `upgrade-a.b.c-to-x.y.z.sql`. -Execute the upgrade SQL scripts one by one to your system database based on your starting and target versions. - -### Replace all libs and plugins - -Replace all contents in the original `{AMORO_HOME}/lib` directory with the contents in the lib directory of the new installation package. -Replace all contents in the original `{AMORO_HOME}/plugin` directory with the contents in the plugin directory of the new installation package. - -{{< hint info >}} -Backup the old content before replacing it, so that you can roll back the upgrade operation if necessary. -{{< /hint >}} - -### Configure new properties - -The old configuration file `{AMORO_HOME}/conf/config.yaml` is usually compatible with the new version, but the new version may introduce new parameters. Try to compare the configuration files of the old and new versions, and reconfigure the parameters if necessary. 
- -### Restart AMS - -Restart AMS with the following commands: -```shell -bin/ams.sh restart -``` - diff --git a/amoro-docs/content/admin-guides/managing-catalogs.md b/amoro-docs/content/admin-guides/managing-catalogs.md deleted file mode 100644 index c29abb6..0000000 --- a/amoro-docs/content/admin-guides/managing-catalogs.md +++ /dev/null @@ -1,95 +0,0 @@ ---- -title: "Managing Catalogs" -url: managing-catalogs -aliases: - - "admin-guides/managing-catalogs" -menu: - main: - parent: Admin Guides - weight: 200 ---- - -# Managing Catalogs - -Users can import your test or online clusters through the catalog management function provided by the AMS Dashboard. Before adding a new Catalog, -please read the following guidelines and select the appropriate creation according to your actual needs. - -## Create catalog -In Amoro, the catalog is a namespace for a group of libraries and tables. Under the catalog, it is further divided into different databases, and under each database, there are different tables. The name of a table in Amoro is uniquely identified by the format `catalog.database.table`. In practical applications, a catalog generally corresponds to a metadata service, such as the commonly used Hive Metastore in big data. - -AMS can also serve as a metadata service. In order to differentiate the storage method of metadata, Amoro classifies the catalog type into `Internal Catalog` and `External Catalog`. Catalogs that use AMS as the metadata service are internal catalogs, while others are external catalogs. When creating an external catalog, you need to select the storage backend for its metadata, such as Hive, Hadoop, or Custom. - -In addition, when defining a catalog, you also need to select the table format used under it. Currently, Amoro supports the following table formats: -[Iceberg](../iceberg-format/) 、[Paimon](../paimon-format)、[Mixed-Hive](../mixed-hive-format/)、[Mixed-Iceberg](../mixed-iceberg-format/). - -You can create a catalog in the AMS frontend: -![create_catalog](../images/admin/create-catalog.png) - -### Configure basic information - -- name: catalog name, only numbers, letters, _, - , starting with letters are supported (lower case letters are recommended) -- type: Internal Catalog or External Catalog -- metastore: storage type for table metadata. Hive Metastore (for using HMS to store metadata), Hadoop (corresponding to iceberg's Hadoop catalog), Glue (for using AWS Glue to store metadata), Custom (other iceberg catalog implementations). -- table format: Iceberg 、Paimon、Mixed-Hive、Mixed-Iceberg. -- optimizer group: tables under the catalog will automatically perform self-optimizing within this group. 
- -### Configure storage -- Type: Hadoop or S3 -- core-site: the core-site.xml of the hadoop cluster -- hdfs-site: the hdfs-site.xml of the hadoop cluster -- hive-site: the hive-site.xml for Hive -- Region: region of the S3 bucket -- Endpoint: endpoint of the S3 bucket - -### Configure authentication -- Type: SIMPLE, KERBEROS, AK/SK or CUSTOM -- hadoop username: username of the hadoop cluster -- keytab: keytab file -- principal: principal of keytab -- krb5: Kerberos krb5.conf configuration file -- Access Key: Access Key for S3 -- Secret Key: Secret Access Key for S3 - -### Configure properties -Common properties include: -- warehouse: Warehouse **must be configured** for ams/hadoop/glue catalog, as it determines where our database and table files should be placed -- catalog-impl: when the metastore is **Custom**, an additional catalog-impl must be defined, and the user must put the jar package for the custom catalog implementation into the **{AMORO_HOME}/lib** directory, **and the service must be restarted to take effect** -- clients: Hive Catalog connection pool size for accessing HiveMetaStore, default configuration is 20, requires restarting Amoro to take effect. -- database-filter: Configure a regular expression to filter databases in the catalog. If not set then all databases will be displayed in table menu. -- table-filter: Configure a regular expression to filter tables in the catalog. The matching will be done in the format of `database.table`. For example, if it is set to `(A\.a)|(B\.b)`, it will ignore all tables except for table `a` in database `A` and table `b` in database `B` - -### Configure table properties -If you want to add the same table properties to all tables under a catalog, you can add these table properties here on the catalog level. If you also configure this property on the table level, the property on the table will take effect. - -## REST Catalog -When a user needs to create a Iceberg REST Catalog, they can choose **External Catalog Type**、**Custom Metastore Type**、**Iceberg Table Format**, configure properties include: -**catalog-impl=org.apache.iceberg.rest.RESTCatalog**, **uri=$restCatalog_uri**. - -After configuring the above parameters, the final result in the AMS frontend will look like this: -![create-rest_catalog](../images/admin/create-rest_catalog.png) - -### Nessie's REST Catalog -When a user needs to create a Nessie Rest Catalog, they can also set **catalog-impl=org.apache.iceberg.nessie.NessieCatalog** on top of the above parameters. - -## Delete catalog -When a user needs to delete a Catalog, they can go to the details page of the Catalog and click the Remove button at the bottom of the page to perform the deletion. - -{{< hint info >}} -Before deleting an internal catalog, AMS will verify whether there is metadata for tables under that Catalog. -If there are still tables under that Catalog, AMS will prompt that the deletion failed. -{{< /hint >}} diff --git a/amoro-docs/content/admin-guides/managing-optimizers.md b/amoro-docs/content/admin-guides/managing-optimizers.md deleted file mode 100644 index 61d86ab..0000000 --- a/amoro-docs/content/admin-guides/managing-optimizers.md +++ /dev/null @@ -1,380 +0,0 @@ ---- -title: "Managing Optimizers" -url: managing-optimizers -aliases: - - "admin-guides/managing-optimizers" -menu: - main: - parent: Admin Guides - weight: 300 ---- - -# Managing Optimizers - -The optimizer is the execution unit for performing self-optimizing tasks on a table. 
To isolate optimizing tasks on different tables and support the deployment of optimizers in different environments, Amoro has proposed the concepts of optimizer containers and optimizer groups: - -* Optimizer container: Encapsulate the deployment method of optimizers, there are four implementations for now: `flink container` based on Flink streaming job, `spark container` based on Spark job, `local container` based on Java Application, and `external container` based on manually started by users. -* Optimizer group: A collection of optimizers, where each table must select an optimizer group to perform optimizing tasks on it. Tables under the same optimizer group contribute resources to each other, and tables under different optimizer groups can be isolated in terms of optimizer resources. -* Optimizer: The specific unit that performs optimizing tasks, usually with multiple concurrent units. - -## Optimizer container -Before start exploring self-optimizing, you need to configure the container information in the configuration file. Optimizer container represents a specific set of runtime environment configuration. The supported container types include: local, kubernetes, flink, spark, and external. - -### Local container -Local container is a way to start Optimizer by local process and supports multi-threaded execution of Optimizer tasks. It is recommended to be used only in demo or local deployment scenarios. If the environment variable for jdk is not configured, the user can configure java_home to point to the jdk root directory. If already configured, this configuration item can be ignored. - -Local container support the following properties: - -| Property Name | Required | Default Value | Description | -|-----------------------|----------|---------------|---------------------------------------------------------------------------------------------------------------| -| ams-optimizing-uri | false | N/A | URI of AMS thrift self-optimizing endpoint. This could be used if the ams.server-expose-host is not available | -| export.JAVA_HOME | false | N/A | Java runtime location | - -```yaml -containers: - - name: localContainer - container-impl: org.apache.amoro.server.manager.LocalOptimizerContainer - properties: - export.JAVA_HOME: "/opt/java" # JDK environment -``` - -The format for optimizing URI is `thrift://{host}:{port}?parameter1=value2¶meter2=value2`. -The supported parameters include: - -| Parameter Name | Default Value | Description | -|----------------|-------------------|------------------------------------------------------------| -| autoReconnect | true | If reconnect the server when the connection is broken | -| maxReconnects | 5 | Retry times when reconnecting | -| connectTimeout | 0 (Forever) | Timeout in milliseconds when connecting the server | -| socketTimeout | 0 (Forever) | Timeout in milliseconds when communicating with the server | -| maxMessageSize | 104856600 (100MB) | Max message size when communicating with the server | -| maxMessageSize | 104856600 (100MB) | Max message size when communicating with the server | -| minIdle | 0 | Minimal idle clients in the pool | -| maxIdle | 5 | Maximal idle clients in the pool | - -### Kubernetes container - -Kubernetes container is a way to start Optimizer On K8s with standalone Optimizer. -To use Kubernetes container, you need to add a new container configuration. 
-with container-impl as `org.apache.amoro.server.manager.KubernetesOptimizerContainer` - -Kubernetes container support the following properties: - - -| Property Name | Required | Default Value | Description | -|---------------------------|----------|---------------|---------------------------------------------------------------------------------------------------------------| -| kube-config-path | true | N/A | Kubernetes config location | -| image | true | N/A | Optimizer Image name | -| pullPolicy | false | IfNotPresent | Specify the imagePullPolicy in the container spec | -| namespace | false | "default" | The namespace of optimizer to deploy | -| ams-optimizing-uri | false | N/A | URI of AMS thrift self-optimizing endpoint. This could be used if the ams.server-expose-host is not available | -| cpu.factor | false | "1.0" | CPU factor when request kubernetes resource. Default 1 Cpu pre thread | -| memory | true | N/A | Memory usage for pre thread | -| extra.jvm.heap.ratio | false | 0.8 | The ratio of JVM heap memory to total pod memory | - -```yaml -containers: - - name: KubernetesContainer - container-impl: org.apache.amoro.server.manager.KubernetesOptimizerContainer - properties: - kube-config-path: ~/.kube/config - image: apache/amoro:{version} - pullPolicy: IfNotPresent -``` - -### Flink container -Flink container is a way to start Optimizer through Flink jobs. With Flink, you can easily deploy Optimizer -on yarn clusters or kubernetes clusters to support large-scale data scenarios. To use flink container, -you need to add a new container configuration. with container-impl as `org.apache.amoro.server.manager.FlinkOptimizerContainer` - -Flink container support the following properties: - -| Property Name | Required | Default Value | Description | -|---------------------------|----------|------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| flink-home | true | N/A | Flink installation location | -| target | true | yarn-per-job | flink job deployed target, available values `yarn-per-job`, `yarn-application`, `kubernetes-application`, `session` | -| job-uri | false | N/A | The jar uri of flink optimizer job. This is required if target is application mode. | -| ams-optimizing-uri | false | N/A | uri of AMS thrift self-optimizing endpoint. This could be used if the ams.server-expose-host is not available | -| export.\ | false | N/A | environment variables will be exported during job submit | -| export.JAVA_HOME | false | N/A | Java runtime location | -| export.HADOOP_CONF_DIR | false | N/A | Direction which holds the configuration files for the hadoop cluster (including hdfs-site.xml, core-site.xml, yarn-site.xml ). 
If the hadoop cluster has kerberos authentication enabled, you need to prepare an additional krb5.conf and a keytab file for the user to submit tasks | -| export.JVM_ARGS | false | N/A | you can configure flink to run additional configuration parameters, here is an example of configuring krb5.conf, specify the address of krb5.conf to be used by Flink when committing via `-Djava.security.krb5.conf=/opt/krb5.conf` | -| export.HADOOP_USER_NAME | false | N/A | the username used to submit tasks to yarn, used for simple authentication | -| export.FLINK_CONF_DIR | false | N/A | the directory where flink_conf.yaml is located | -| flink-conf.\ | false | N/A | [Flink Configuration Options](https://nightlies.apache.org/flink/flink-docs-master/docs/deployment/config/) will be passed to cli by `-Dkey=value`, | - -{{< hint info >}} -To better utilize the resources of Flink Optimizer, it is recommended to add the following configuration to the Flink Optimizer Group: -* Set `flink-conf.taskmanager.memory.managed.size` to `32mb` as Flink optimizer does not have any computation logic, it does not need to occupy managed memory. -* Set `flink-conf.taskmanager.memory.network.max` to `32mb` as there is no need for communication between operators in Flink Optimizer. -* Set `flink-conf.taskmanager.memory.network.min` to `32mb` as there is no need for communication between operators in Flink Optimizer. -{{< /hint >}} - - -An example for yarn-per-job mode: - -```yaml -containers: - - name: flinkContainer - container-impl: org.apache.amoro.server.manager.FlinkOptimizerContainer - properties: - flink-home: /opt/flink/ #flink install home - export.HADOOP_CONF_DIR: /etc/hadoop/conf/ #hadoop config dir - export.HADOOP_USER_NAME: hadoop #hadoop user submit on yarn - export.JVM_ARGS: -Djava.security.krb5.conf=/opt/krb5.conf #flink launch jvm args, like kerberos config when ues kerberos - export.FLINK_CONF_DIR: /etc/hadoop/conf/ #flink config dir -``` - -An example for kubernetes-application mode: - -```yaml -containers: - - name: flinkContainer - container-impl: org.apache.amoro.server.manager.FlinkOptimizerContainer - properties: - flink-home: /opt/flink/ # Flink install home - target: kubernetes-application # Flink run as native kubernetes - pullPolicy: IfNotPresent # Specify the imagePullPolicy in the container spec - job-uri: "local:///opt/flink/usrlib/optimizer-job.jar" # Optimizer job main jar for kubernetes application - ams-optimizing-uri: thrift://ams.amoro.service.local:1261 # AMS optimizing uri - export.FLINK_CONF_DIR: /opt/flink/conf/ # Flink config dir - flink-conf.kubernetes.container.image: "apache/amoro-flink-optimizer:{version}" # Optimizer image ref - flink-conf.kubernetes.service-account: flink # Service account that is used within kubernetes cluster. 
-``` - -An example for flink session mode: - -```yaml -containers: - - name: flinkContainer - container-impl: org.apache.amoro.server.manager.FlinkOptimizerContainer - properties: - target: session # Flink run in session cluster - job-uri: "local:///opt/flink/usrlib/optimizer-job.jar" # Optimizer job main jar - ams-optimizing-uri: thrift://ams.amoro.service.local:1261 # AMS optimizing uri - export.FLINK_CONF_DIR: /opt/flink/conf/ # Flink config dir, flink-conf.yaml should e in this dir, contains the rest connection parameters of the session cluster - flink-conf.high-availability: zookeeper # Flink high availability mode, reference: https://nightlies.apache.org/flink/flink-docs-release-1.18/docs/deployment/config/#high-availability - flink-conf.high-availability.zookeeper.quorum: xxx:2181 - flink-conf.high-availability.zookeeper.path.root: /flink - flink-conf.high-availability.cluster-id: amoro-optimizer-cluster - flink-conf.high-availability.storageDir: hdfs://xxx/xxx/xxx - flink-conf.rest.address: localhost:8081 # If the session cluster is not high availability mode, please configure the restaddress of jobmanager -``` - - -### Spark container -Spark container is another way to start Optimizer through Spark jobs. With Spark, you can easily deploy Optimizer -on yarn clusters or kubernetes clusters to support large-scale data scenarios. To use spark container, -you need to add a new container configuration. with container-impl as `org.apache.amoro.server.manager.SparkOptimizerContainer` - -Spark container support the following properties: - -| Property Name | Required | Default Value | Description | -|-------------------------|----------|---------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| spark-home | true | N/A | Spark installation location | -| master | true | yarn | The cluster manager to connect to, available values `yarn`, `k8s://HOST:PORT` | -| deploy-mode | true | client | Spark job deploy mode, available values `client`, `cluster` | -| job-uri | false | N/A | The jar uri of spark optimizer job. This is required if deploy mode is cluster mode. | -| ams-optimizing-uri | false | N/A | uri of AMS thrift self-optimizing endpoint. This could be used if the ams.server-expose-host is not available | -| export.\ | false | N/A | Environment variables will be exported during job submit | -| export.JAVA_HOME | false | N/A | Java runtime location | -| export.HADOOP_CONF_DIR | false | N/A | Direction which holds the configuration files for the hadoop cluster (including hdfs-site.xml, core-site.xml, yarn-site.xml ). 
If the hadoop cluster has kerberos authentication enabled, you need to prepare an additional krb5.conf and a keytab file for the user to submit tasks | -| export.JVM_ARGS | false | N/A | You can configure spark to run additional configuration parameters, here is an example of configuring krb5.conf, specify the address of krb5.conf to be used by Spark when committing via `-Djava.security.krb5.conf=/opt/krb5.conf` | -| export.HADOOP_USER_NAME | false | N/A | The username used to submit tasks to yarn, used for simple authentication | -| export.SPARK_CONF_DIR | false | N/A | The directory where spark_conf.yaml is located | -| spark-conf.\ | false | N/A | [Spark Configuration Options](https://spark.apache.org/docs/latest/configuration.html) will be passed to cli by `-conf key=value`, | - -{{< hint info >}} -To better utilize the resources of Spark Optimizer, the DRA(Dynamic Resource Allocation) feature is switched on, and the optimizer parallelism equals `spark.dynamicAllocation.maxExecutors. -If you don't want this feature, you can use these settings: -* Set `spark-conf.spark.dynamicAllocation.enabled` to `false` as you need allocate proper driver/executor resources Using [Spark Configuration Options](https://spark.apache.org/docs/latest/configuration.html). -* Set `spark-conf.spark.dynamicAllocation.maxExecutors` to `10` as optimizer parallelism can only affect parallelism polling optimizing tasks from AMS. -{{< /hint >}} - -{{< hint info >}} -The spark optimizer may fail due to class conflicts sometimes, you can try to fix by following the steps below: -* Set `spark-conf.spark.driver.userClassPathFirst` to `true`. -* Set `spark-conf.spark.executor.userClassPathFirst` to `true`. -{{< /hint >}} - -An example for yarn client mode: - -```yaml -containers: - - name: sparkContainer - container-impl: org.apache.amoro.server.manager.SparkOptimizerContainer - properties: - spark-home: /opt/spark/ # Spark install home - master: yarn # The k8s cluster manager to connect to - deploy-mode: client # Spark run as client mode - export.HADOOP_CONF_DIR: /etc/hadoop/conf/ # Hadoop config dir - export.HADOOP_USER_NAME: hadoop # Hadoop user submits on yarn - export.JVM_ARGS: -Djava.security.krb5.conf=/opt/krb5.conf # Spark launch jvm args, like kerberos config when ues kerberos - export.SPARK_CONF_DIR: /etc/hadoop/conf/ # Spark config dir -``` - -An example for kubernetes cluster mode: - -```yaml -containers: - - name: sparkContainer - container-impl: org.apache.amoro.server.manager.SparkOptimizerContainer - properties: - spark-home: /opt/spark/ # Spark install home - master: k8s://https://: # The k8s cluster manager to connect to - deploy-mode: cluster # Spark deploy mode, client or cluster - pullPolicy: IfNotPresent # Specify the imagePullPolicy in the container spec - job-uri: "local:///opt/spark/usrlib/optimizer-job.jar" # Optimizer job main jar for kubernetes application - ams-optimizing-uri: thrift://ams.amoro.service.local:1261 # AMS optimizing uri - export.HADOOP_USER_NAME: hadoop # Hadoop user submits on yarn - export.HADOOP_CONF_DIR: /etc/hadoop/conf/ # Hadoop config dir - export.SPARK_CONF_DIR: /opt/spark/conf/ # Spark config dir - spark-conf.spark.kubernetes.container.image: "apache/amoro-spark-optimizer:{version}" # Optimizer image ref - spark-conf.spark.dynamicAllocation.enabled: "true" # Enabling DRA feature can make full use of computing resources - spark-conf.spark.shuffle.service.enabled: "false" # If spark DRA is used on kubernetes, we should set it false - 
spark-conf.spark.dynamicAllocation.shuffleTracking.enabled: "true" # Enables shuffle file tracking for executors, which allows dynamic allocation without the need for an ESS - spark-conf.spark.kubernetes.namespace: # Namespace that is used within kubernetes cluster - spark-conf.spark.kubernetes.authenticate.driver.serviceAccountName: # Service account that is used within kubernetes cluster -``` - - -### External container - -External container refers to the way in which the user manually starts the optimizer. The system has a built-in external container called `external`, so you don't need to configure it manually. - -## Optimizer group -Optimizer group (optimizer resource group) is a concept introduced to divide Optimizer resources. An Optimizer Group can -contain several optimizers with the same container implementation to facilitate the expansion and contraction of the resource group. - -### Add optimizer group - -You can add an optimizer group on the Amoro dashboard by following these steps: - -1.Click the "Add Group" button in the top left corner of the `Optimizer Groups` page. -![add-optimizer-group](../images/admin/add-optimizer-group.png) - -2.Configure the newly added Optimizer group. -![config-optimizer-group](../images/admin/config-optimizer-group.png) - -The following configuration needs to be filled in: - -- name: the name of the optimizer group, which can be seen in the list of optimizer groups on the front-end page. -- container: the name of a container configured in containers. -- properties: the default configuration under this group, is used as a configuration parameter for tasks when the optimize page is scaled out. Supports native parameters for `flink on yarn`, and users can set parameters using the `flink-conf.=` or use `flink-conf.yaml` to configure parameters. Supports native parameters for `spark on yarn`, and users can set parameters using the `spark-conf.=` or use `spark-defaults.conf` to configure parameters. - -The optimizer group supports the following properties: - -| Property | Container type | Required | Default | Description | -|--------------------------------|----------------|----------|---------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| scheduling-policy | All | No | quota | The scheduler group scheduling policy, the default value is `quota`, it will be scheduled according to the quota resources configured for each table, the larger the table quota is, the more optimizer resources it can take. There is also a configuration `balanced` that will balance the scheduling of each table, the longer the table has not been optimized, the higher the scheduling priority will be. | -| memory | Local | Yes | N/A | The max memory of JVM for local optimizer, in MBs. | -| max-input-file-size-per-thread | All | No | 536870912(512MB) | Max input file size per optimize thread. | -| ams-optimizing-uri | All | No | thrift://{ams.server-expose-host}:{ams.thrift-server.optimizing-service.binding-port} | Table optimizing service endpoint. This is used when the default service endpoint is not visitable. 
| -| flink-conf.\ | Flink | No | N/A | Any flink config options could be overwritten, priority is optimizing-group > optimizing-container > flink-conf.yaml. | -| spark-conf.\ | Spark | No | N/A | Any spark config options could be overwritten, priority is optimizing-group > optimizing-container > spark-defaults.conf. | - -{{< hint info >}} -To better utilize the resources of Flink Optimizer, it is recommended to add the following configuration to the Flink Optimizer Group: -* Set `flink-conf.taskmanager.memory.managed.size` to `32mb` as Flink optimizer does not have any computation logic, it does not need to occupy managed memory. -* Set `flink-conf.taskmanager.memory.network.max` to `32mb` as there is no need for communication between operators in Flink Optimizer. -* Set `flink-conf.taskmanager.memory.network.min` to `32mb` as there is no need for communication between operators in Flink Optimizer. -{{< /hint >}} - -### Edit optimizer group - -You can click the `edit` button on the `Optimizer Groups` page to modify the configuration of the Optimizer group. - -### Remove optimizer group - -You can click the `remove` button on the `Optimizer Groups` page to delete the optimizer group, but only if the group is -not referenced by any catalog or table and no optimizer belonging to this group is running. - -## Optimizer Create and Release - -### Create optimizer -You can click the `Create Optimizer` button on the `Optimizers` page to create the optimizer for the corresponding optimizer -group, and then click `OK` to start the optimizer for this optimizer group according to the parallelism configuration. -If the optimizer runs normally, you will see an optimizer with the status `RUNNING` on the `Optimizers` page. - -![optimize-create](../images/admin/optimizer_create.png) - -### Release optimizer - -You can click the `Release` button on the `Optimizer` page to release the optimizer. - -![release optimizer](../images/admin/optimizer_release.png) - -{{< hint info >}} -Currently, only optimizer scaled through the dashboard can be released on dashboard. -{{< /hint >}} - -### Deploy external optimizer - -You can submit optimizer in your own Flink task development platform or local Flink environment with the following configuration. The main parameters include: - -```shell -./bin/flink run-application -t yarn-application \ - -Djobmanager.memory.process.size=1024mb \ - -Dtaskmanager.memory.process.size=2048mb \ - -Dtaskmanager.memory.managed.size=32mb \ - -Dtaskmanager.memory.network.max=32mb \ - -Dtaskmanager.memory.network.min=32mb \ - -c org.apache.amoro.optimizer.flink.FlinkOptimizer \ - ${AMORO_HOME}/plugin/optimizer/flink/optimizer-job.jar \ - -a thrift://127.0.0.1:1261 \ - -g flinkGroup \ - -p 1 -``` -The description of the relevant parameters is shown in the following table: - -| Property | Required | Description | -|----------|----------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| -a | Yes | The address of the AMS thrift service, for example: thrift://127.0.0.1:1261, can be obtained from the config.yaml configuration. | -| -g | Yes | Group name created in advance under external container. | -| -p | Yes | Optimizer parallelism usage. 
| -| -hb | No | Heart beat interval with ams, should be smaller than configuration ams.optimizer.heart-beat-timeout in AMS configuration conf/config.yaml which is 60000 milliseconds by default, default 10000(ms). | -| -eds | No | Whether extend storage to disk, default false. | -| -dsp | No | Defines the directory where the storage files are saved, the default temporary-file directory is specified by the system property `java.io.tmpdir`. On UNIX systems the default value of this property is typically "/tmp" or "/var/tmp". | -| -msz | No | Memory storage size limit when extending disk storage(MB), default 512(MB). | - - -Or you can submit optimizer in your own Spark task development platform or local Spark environment with the following configuration. The main parameters include: - -```shell -./bin/spark-submit --master yarn --deploy-mode cluster \ - --conf "spark.driver.cores=1" \ - --conf "spark.driver.memory=g" \ - --conf "spark.executor.cores=1" \ - --conf "spark.executor.memory=2g" \ - --class org.apache.amoro.optimizer.spark.SparkOptimizer \ - ${AMORO_HOME}/plugin/optimizer/spark/optimizer-job.jar \ - -a thrift://127.0.0.1:1261 \ - -g sparkGroup \ - -p 1 -``` -The description of the relevant parameters is shown in the following table: - -| Property | Required | Description | -|----------|----------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| -a | Yes | The address of the AMS thrift service, for example: thrift://127.0.0.1:1261, can be obtained from the config.yaml configuration. | -| -g | Yes | Group name created in advance under external container. | -| -p | Yes | Optimizer parallelism usage. | -| -hb | No | Heart beat interval with ams, should be smaller than configuration ams.optimizer.heart-beat-timeout in AMS configuration conf/config.yaml which is 60000 milliseconds by default, default 10000(ms). | -| -eds | No | Whether extend storage to disk, default false. | -| -dsp | No | Defines the directory where the storage files are saved, the default temporary-file directory is specified by the system property `java.io.tmpdir`. On UNIX systems the default value of this property is typically "/tmp" or "/var/tmp". | -| -msz | No | Memory storage size limit when extending disk storage(MB), default 512(MB). | diff --git a/amoro-docs/content/admin-guides/using-customized-encryption-method-for-configurations.md b/amoro-docs/content/admin-guides/using-customized-encryption-method-for-configurations.md deleted file mode 100644 index 30235ab..0000000 --- a/amoro-docs/content/admin-guides/using-customized-encryption-method-for-configurations.md +++ /dev/null @@ -1,135 +0,0 @@ ---- -title: "Using Customized Encryption Method for Configurations" -url: using-customized-encryption-method -aliases: - - "admin-guides/using-customized-encryption-method" -menu: - main: - parent: Admin Guides - weight: 400 ---- - -# Using Customized Encryption Method for Configurations -To enhance security, AMS allows encrypted sensitive configuration items such as passwords. Currently, AMS only supports the built-in base64 encryption algorithm (see [Configure encrypted configuration items](../deployment/#configure-encrypted-sensitive-configuration-items) for details). If you require a stronger or customized encryption method, AMS also provides the flexibility to implement your own encryption algorithm. 
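For comparison, the built-in algorithm only requires Base64-encoding the sensitive values and declaring the shade in `config.yaml`. The sketch below is illustrative only: the encoded value `c2VjcmV0` is simply `secret` run through `base64`, and your real keys and values will differ.

```yaml
ams:
  shade:
    identifier: base64                                    # built-in Base64 support
    sensitive-keywords: admin-password;database.password  # keys whose values are stored encoded
# The matching configuration values elsewhere in config.yaml then hold the encoded text,
# produced for example with: echo -n 'secret' | base64   ->   c2VjcmV0
```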
-## Develop the Custom Implementation -To integrate a custom encryption algorithm, you need to create a Java class that implements the `ConfigShade` interface and package it as a service. -### Add Maven Dependency - If using a Maven project, add the following dependency to your `pom.xml`: -```xml - - org.apache.amoro - amoro-common - ${amoro.version} - provided - -``` - -### Implement the `ConfigShade` Interface -Create a Java class that implements the `ConfigShade` interface. This class will handle decryption for sensitive configuration values. - -```java -/** - * The interface that provides the ability to decrypt {@link - * org.apache.amoro.config.Configurations}. - */ -public interface ConfigShade { - /** - * Initializes the custom instance using the service configurations. - * - * This method can be useful when decryption requires an external file (e.g. a key file) - * defined in the service configs. - */ - default void initialize(Configurations serviceConfig) throws Exception {} - - /** - * The unique identifier of the current interface, used it to select the correct {@link - * ConfigShade}. - */ - String getIdentifier(); - - /** - * Decrypt the content. - * - * @param content The content to decrypt - */ - String decrypt(String content); -} -``` -In this interface: - -- `getIdentifier()`: Returns a **unique** identifier for your encryption algorithm, which is used to configure the `ams.shade.identifier`. Avoid using "default" (which disables encryption) or "base64" (which refers to AMS’s built-in Base64 support). -- `decrypt(String content)`: Implements the decryption logic for converting encrypted values back to plaintext. - -Here is an example implementation: -```java -package com.example.shade; - -import org.apache.amoro.config.Configurations; -import org.apache.amoro.config.shade.ConfigShade; -import java.util.Base64; - -/** - * Custom Base64 decryption implementation for AMS. - */ -public class Base64CustomConfigShade implements ConfigShade { - @Override - public String getIdentifier() { - return "base64-custom"; // Use this identifier in shade.identifier - } - - @Override - public String decrypt(String content) { - return new String(Base64.getDecoder().decode(content)); - } -} -``` - -### Register the Custom Implementation -Create a file named `org.apache.amoro.config.shade.ConfigShade` under `resources/META-INF/services/` and add the fully qualified class name of your implementation: -```j -com.example.shade.Base64CustomConfigShade -``` -### Build the JAR -Package your implementation into a JAR file using Maven: -```shell -mvn clean package -``` - -## Deploy the Custom Implementation -Once you've developed and packaged the custom encryption algorithm, you can deploy it to AMS by following these steps. - -### Copy the JAR to AMS -Move the generated JAR file to the `${AMORO_HOME}/lib/` directory. - -### Configure AMS to Use the Custom Encryption -Modify `${AMORO_HOME}/conf/config.yaml` to specify the custom encryption algorithm by setting the `ams.shade.identifier` to match the value returned by `getIdentifier()` in your Java class. -Then, replace sensitive configuration values with their encrypted versions. -```yaml -ams: - shade: - identifier: base64-custom # Use the custom encryption algorithm - sensitive-keywords: admin-password;database.password -``` - -### Restart AMS -Restart the AMS service to apply the new encryption settings. 
-```shell -bin/ams restart -``` - -By following these steps, you can successfully integrate and deploy a custom encryption algorithm in AMS, ensuring that sensitive information is securely stored in the configuration file. \ No newline at end of file diff --git a/amoro-docs/content/admin-guides/using-kyuubi.md b/amoro-docs/content/admin-guides/using-kyuubi.md deleted file mode 100644 index 44ccfde..0000000 --- a/amoro-docs/content/admin-guides/using-kyuubi.md +++ /dev/null @@ -1,64 +0,0 @@ ---- -title: "Using Kyuubi By Terminal" -url: using-kyuubi -aliases: - - "admin-guides/using-kyuubi" -menu: - main: - parent: Admin Guides - weight: 400 ---- - -# Using Kyuubi By Terminal -**Prerequisites**: -- There must be a running Kyuubi. To deploy and run Kyuubi, please refer to [Kyuubi doc](https://kyuubi.readthedocs.io/en/master/) - - -Terminal supports interfacing with Kyuubi to submit SQL to Kyuubi for execution. All you need to do is add the Kyuubi configuration as instructed below: -```shell -ams: - terminal: - backend: kyuubi - kyuubi.jdbc.url: jdbc:hive2://127.0.0.1:10009/ # kyuubi Connection Address -``` - -## Kerberos Authentication -Amoro terminal uses the Kerberos authentication information from the catalog to connect to Kyuubi. When configuring the Kyuubi JDBC URL, you only need to configure the connection information and do not need to configure Kerberos authentication information (e.g. principal). - -Without configuring Kyuubi, Terminal executes in memory in AMS. - -To execute SQL in Terminal, you can refer to the following steps:: - -- Please switch Catalog first -- Before writing SQL, you can use the provided SQL Shortcuts function to help you build SQL quickly. -- Writing SQL -- Click the Execute button to run the SQL; - -![terminal](../images/admin/terminal_introduce.png) - -## LDAP Authentication -Except for the configuration of Kerberos authentication, everything else is the same. You can integrate with LDAP using the following configuration: -set kyuubi.ldap.enabled to true, and then specify the username and password for LDAP in the URL. -```shell -ams: - terminal: - backend: kyuubi - kyuubi.ldap.enabled: true - kyuubi.jdbc.url: jdbc:hive2://127.0.0.1:10009/default;user=test;password=test # kyuubi Connection Address -``` - diff --git a/amoro-docs/content/concepts/catalogs.md b/amoro-docs/content/concepts/catalogs.md deleted file mode 100644 index e579957..0000000 --- a/amoro-docs/content/concepts/catalogs.md +++ /dev/null @@ -1,70 +0,0 @@ ---- -title: "Catalogs" -url: catalogs -aliases: - - "concept/catalogs" -menu: - main: - parent: Concepts - weight: 100 ---- - -# Catalogs - -## Introduce multi-catalog - -A catalog is a metadata namespace that stores information about databases, tables, views, indexes, users, and UDFs. It provides a higher-level -namespace for `table` and `database`. Typically, a catalog is associated with a specific type of data source or cluster. In Flink, Spark and Trino, -the multi-catalog feature can be used to support SQL across data sources, such as: - -```SQL -SELECT c.ID, c.NAME, c.AGE, o.AMOUNT -FROM ${CATALOG_A}.ONLINE.CUSTOMERS c JOIN ${CATALOG_B}.OFFLINE.ORDERS o -ON (c.ID = o.CUSTOMER_ID) -``` - -In the past, data lakes were managed using the Hive Metastore (HMS) to handle metadata. Unfortunately, HMS does not support multi-catalog, which -limits the capabilities of engines on the data lake. 
For example, some users may want to use Spark to perform federated computation across different -Hive clusters by specifying the catalog name, requiring them to develop a Hive catalog plugin in the upper layer. Additionally, data lake formats are -moving from a single Hive-centric approach to a landscape of competing formats such as Iceberg, Delta, and Hudi. These new data lake formats are more -cloud-friendly and will facilitate the migration of data lakes to the cloud. In this context, a management system that supports multi-catalog is -needed to help users govern data lakes with different environments and formats. - -Users can create catalogs in Amoro for different environments, clusters, and table formats, and leverage the multi-catalog feature in Flink, Spark -and Trino to enable federated computation across multiple clusters and formats. Additionally, properties configured in catalogs can be shared by all -tables and users, avoiding duplication. By leveraging the multi-catalog design, Amoro provides support for a metadata center in data platforms. - -When AMS and HMS are used together, HMS serves as the storage foundation for AMS. With the [Iceberg Format](../iceberg-format/), users can leverage the -multi-catalog management functionality of AMS without introducing any Amoro dependencies. - -## How to use - -Amoro v0.4 introduced the catalog management feature, where table creation is performed under a catalog. Users can create, edit, and delete catalogs -in the catalogs module, which requires configuration of metastore, table format, and environment information upon creation. For more information, -please refer to the documentation: [Managing catalogs](../managing-catalogs/). - -## Future work - -AMS will focus on two goals to enhance the value of the metadata center in the future: - -- Expand data sources: In addition to data lakes, message queues, databases, and data warehouses can all be managed as objects in the catalog. -Through metadata center and SQL-based federated computing of the computing engine, AMS will provide infrastructure solutions for data platforms -such as DataOps and DataFabric -- Automatic catalog detection: In compute engines like Spark and Flink, it is possible to automatically detect the creation and changes of a -catalog, enabling a one-time configuration for permanent scalability. - diff --git a/amoro-docs/content/concepts/self-optimizing.md b/amoro-docs/content/concepts/self-optimizing.md deleted file mode 100644 index 285fade..0000000 --- a/amoro-docs/content/concepts/self-optimizing.md +++ /dev/null @@ -1,175 +0,0 @@ ---- -title: "Self-Optimizing" -url: self-optimizing -aliases: - - "concept/self-optimizing" -menu: - main: - parent: Concepts - weight: 200 ---- - -# Self-optimizing - -## Introduction - -Lakehouse is characterized by its openness and loose coupling, with data and files maintained by users through various engines. While this -architecture appears to be well-suited for T+1 scenarios, as more attention is paid to applying Lakehouse to streaming data warehouses and real-time -analysis scenarios, challenges arise. For example: - -- Streaming writes bring a massive amount of fragment files -- CDC ingestion and streaming updates generate excessive redundant data -- Using the new data lake format leads to orphan files and expired snapshots. - -These issues can significantly affect the performance and cost of data analysis. 
Therefore, Amoro has introduced a Self-optimizing mechanism to -create an out-of-the-box Streaming Lakehouse management service that is as user-friendly as a traditional database or data warehouse. The new table -format is used for this purpose. Self-optimizing involves various procedures such as file compaction, deduplication, and sorting. - -The architecture and working mechanism of Self-optimizing are shown in the figure below: - -![Self-optimizing architecture](../images/concepts/self-optimizing_arch.png) - -The Optimizer is a component responsible for executing Self-optimizing tasks. It is a resident process managed by AMS. AMS is responsible for -detecting and planning Self-optimizing tasks for tables, and then scheduling them to Optimizers for distributed execution in real-time. Finally, AMS -is responsible for submitting the optimizing results. Amoro achieves physical isolation of Optimizers through the Optimizer Group. - -The core features of Amoro's Self-optimizing are: - -- Automated, Asynchronous and Transparent — Continuous background detecting of file changes, asynchronous distributed execution of optimizing tasks, -transparent and imperceptible to users -- Resource Isolation and Sharing — Allow resources to be isolated and shared at the table level, as well as setting resource quotas -- Flexible and Scalable Deployment — Optimizers support various deployment methods and convenient scaling - -## Self-optimizing mechanism - -During the process of writing data, there may be two types of amplification: read amplification and write amplification: - -- Read amplification — If an excessive amount of fragment files are generated during the writing process, or if there is an excessive mapping of -delete and insert files (which may be a familiar issue for users of the Iceberg v2 format), and the optimizing cannot keep up with the rate of -fragment file generation, it can significantly degrade reading performance. -- Write amplification — Frequently scheduling optimizing can lead to frequent compaction and rewriting of existing files, which causes resource -competition and waste of CPU/IO/Memory, slows down the optimization speed, and further intensify read amplification. - -Frequent execution of optimizing is necessary to alleviate read amplification, but it can lead to write amplification. The design of self-optimizing -needs trade off between read and write amplification. Amoro's Self-optimizing takes inspiration from the Generational Garbage Collection algorithm -in the JVM. Files are divided into Fragments and Segments based on their sizes, and different Self-optimizing processes executed on Fragments and -Segments are classified into two types: minor and major. Therefore, Amoro v0.4 introduces two parameters to define Fragments and Segments: - -```SQL --- Target file Size for Self-optimizing -self-optimizing.target-size = 134217728(128MB) --- The fragment file size threshold for Self-optimizing -self-optimizing.fragment-ratio = 8 -``` - -`self-optimizing.target-size` defines the target output size for major optimizing, which is set to 128 MB by default. `self-optimizing.fragment-ratio` -defines the ratio of the fragment file threshold to the target-size, with a value of 8 indicating that the default fragment threshold is 1/8 of the -target-size, or 16 MB for a default target-size of 128 MB. 
Files smaller than 16 MB are considered fragments, while files larger than 16 MB are -considered segments, as shown in the diagram below: - -![Minor optimizing](../images/concepts/minor_optimizing.png) - -The goal of Minor optimizing is to alleviate read amplification issues, which entails two tasks: - -* Compact fragment files into segment files as quickly as possible. Minor optimizing will be executed more frequently when fragment files are rapidly -generated. -* Converting from a write-friendly file format to a read-friendly file format, which involves transitioning ChangeStore to BaseStore for the -Mixed Format, and eq-delete files to pos-delete files for the Iceberg Format. - -After executing Minor optimizing multiple times, there will be many Segment files in the tablespace. Although in most cases, the read efficiency -of Segment files can meet performance requirements, however: - -* There may be a significant amount of accumulated delete data on each Segment file -* There may be a lot of duplicate data on primary keys between different Segment files - -At this stage, the reading performance problem is no longer caused by the read amplification issue resulting from small file size and file format. -Instead, it is due to the presence of excessive redundant data that needs to be merged and cleaned up during merge-on-read. To address this problem, -Amoro introduces major optimizing which merges Segment files to clean up redundant data and control its amount to a level that is favorable to -reading. Minor optimizing has already performed multiple rounds of deduplication, and major optimizing is not scheduled frequently to avoid write -amplification issues. Additionally, Full optimizing merges all files in the target space into a single file, which is a special case of -major optimizing. - -![Major optimizing](../images/concepts/major_optimizing.png) - -The design of Major optimizing and Minor optimizing takes inspiration from the Generational Garbage Collection algorithm of JVM. The execution -logic of both optimizing is consistent, as they both involve file compaction, data deduplication, and conversion from write-friendly format to -read-friendly format. The input-output relationships of Minor, Major, and Full optimizing are shown in the following table: - -| Self-optimizing type | Input space | Output space | Input file types | Output file types | -|:----------|:----------|:----------|:----------|:----------| -| minor | fragment | fragment/segment | insert, eq-delete, pos-delete | insert, pos-delete | -| major | fragment, segment | segment | insert, eq-delete, pos-delete | insert, pos-delete | -| full | fragment, segment | segment | insert, eq-delete, pos-delete | insert | - - -## Self-optimizing scheduling policy - -AMS determines the Scheduling Policy to sequence the self-optimization process for different tables. The actual resources allocated for -Self-optimizing for each table are determined based on the chosen Scheduling Policy. Quota is used by Amoro to define the expected resource usage for -each table, while Quota occupation represents the percentage of actual resource usage compared to the expected usage. The AMS page allows viewing of -the Quota and Quota occupation for each table's Self-optimizing: - -![quota_and_occupation](../images/concepts/quota-occupation.png) - -Different optimizer groups can be configured with different scheduling policies to meet various optimization requirements. 
-See: [Optimizer Group Configuration](../managing-optimizers#optimizer-group)。 - -Users can also disable the Self-optimizing for a table by configuring the following settings on the table, which will prevent it from being scheduled -for optimizing. - -```SQL -self-optimizing.enabled = false; -``` - -If you are working with non-updatable tables like logs or sensor data and are used to utilizing the Spark Rewrite Action offered by Iceberg, you can -turn off the Self-optimizing. - -However, if the table is configured with a primary key and supports CDC ingestion and streaming updates (e.g., database synchronization tables or -dimensionally aggregated tables), it is advisable to enable the Self-optimizing. - -Currently, there are two main scheduling policies available: `Quota` and `Balanced`. - -### Quota - -The `Quota` strategy is a scheduling policy that schedules based on resource usage. The Self-optimizing resource usage of a single table is managed -by configuring the quota configuration on the table: - -```SQL --- Quota for Self-optimizing, indicating the CPU resource the table can take up -self-optimizing.quota = 0.1; -``` -Quota defines the maximum CPU usage that a single table can use, but Self-optimizing is actually executed in a distributed manner, and actual resource -usage is dynamically managed based on actual execution time.In the optimizing management Web UI, the dynamic quota usage of a single table can be -viewed through the "Quota Occupy" metric. From a design perspective, the quota occupy metric should dynamically approach 100%. - -In a platform, two situations may occur: overselling and overbuying. - -- Overselling — If all optimizer configurations exceed the total quota of all table configurations, the quota occupy metric may dynamically approach -above 100% -- Overbuying — If all optimizer configurations are lower than the total quota of all table configurations, the quota occupy metric should dynamically -approach below 100% - -### Balanced - -The `Balanced` strategy is a scheduling strategy based on time progression, where tables that have not been Self-optimized for a longer time have a -higher scheduling priority. This strategy aims to keep the self-optimizing progress of each table at a similar level, which can avoid the situation -where tables with high resource consumption do not perform Self-optimizing for a long time, thus affecting the overall query efficiency in the quota -scheduling strategy. - -If there is no special requirement for resource usage among the tables in an optimizer group, and all tables are expected to have decent query -efficiency, then the `Balanced` strategy is a good choice. \ No newline at end of file diff --git a/amoro-docs/content/concepts/table-watermark.md b/amoro-docs/content/concepts/table-watermark.md deleted file mode 100644 index 899e4da..0000000 --- a/amoro-docs/content/concepts/table-watermark.md +++ /dev/null @@ -1,77 +0,0 @@ ---- -title: "Table Watermark" -url: table-watermark -aliases: - - "concept/table-watermark" -menu: - main: - parent: Concepts - weight: 400 ---- - -# Table Watermark - -## Table freshness - -Data freshness represents timeliness, and in many discussions, freshness is considered one of the important indicators of data quality. In traditional -offline data warehouses, higher cost typically means better performance, creating a typical binary paradox in terms of cost-performance trade-off. 
-However, in high-freshness streaming data warehouses, massive small files and frequent updates can lead to performance degradation. The higher the -freshness, the greater the impact on performance. To achieve the required performance, users must incur higher costs. Thus, for streaming data -warehouses, data freshness, query performance, and cost form a tripartite paradox. - -Freshness, cost and performance - -Amoro offers a resolution to the tripartite paradox for users by utilizing AMS management functionality and a self-optimizing mechanism. Unlike -traditional data warehouses, Lakehouse tables are utilized in a multitude of data pipelines, AI, and BI scenarios. Measuring data freshness is -crucially important for data developers, analysts, and administrators, and Amoro addresses this challenge by adopting the watermark concept in stream -computing to gauge table freshness. - -## Table watermark - -In the Mixed Format, data freshness is measured through table watermark. - -Strictly speaking, table watermark is used to describe the writing progress of a table. Specifically, it is a timestamp attribute on the table that -indicates that data with timestamps earlier than this watermark have been written to the table. It is typically used to monitor the progress of table -writes and can also serve as a trigger indicator for downstream batch computing tasks. - -Mixed Format uses the following configurations to configure watermark: - -```sql - 'table.event-time-field' = 'op_time', - 'table.watermark-allowed-lateness-second' = '60' -``` - -In the example above, `op_time` is set as the event time field for the table, and the watermark for the table is calculated using the `op_time` of the -data being written. To handle out-of-order writes, a maximum delay of 60 seconds is allowed for calculating the watermark. Unlike in stream -processing, data with event_time values smaller than the watermark will not be rejected, but they will not affect the advancement of the watermark -either. - -You can view the current watermark of a table in the AMS Dashboard's table details, or you can use the following SQL query in the terminal to query -the watermark of a table: - -```SQL -SHOW TBLPROPERTIES test_db.test_log_store ('watermark.table'); -``` -You can also query the table watermark of the BaseStore using the following command, which can be combined with native reads from Hive or Iceberg for -greater flexibility: - -```SQL -SHOW TBLPROPERTIES test_db.test_log_store ('watermark.base'); -``` - -You can learn about how to use Watermark in detail by referring to [Using tables](../using-tables/). \ No newline at end of file diff --git a/amoro-docs/content/engines/flink/flink-cdc-ingestion.md b/amoro-docs/content/engines/flink/flink-cdc-ingestion.md deleted file mode 100644 index c79e311..0000000 --- a/amoro-docs/content/engines/flink/flink-cdc-ingestion.md +++ /dev/null @@ -1,547 +0,0 @@ ---- -title: "Flink CDC Ingestion" -url: flink-cdc-ingestion -aliases: - - "flink/cdc-ingestion" -menu: - main: - parent: Flink - weight: 400 ---- - -# Apache CDC Ingestion -CDC stands for Change Data Capture, which is a broad concept, as long as it can capture the change data, it can be called CDC. [Flink CDC](https://github.com/apache/flink-cdc) is a Log message-based data capture tool, all the inventory and incremental data can be captured. Taking MySQL as an example, it can easily capture Binlog data through Debezium and process the calculations in real time to send them to the data lake. 
The data lake can then be queried by other engines. - -This section will show how to ingest one table or multiple tables into the data lake for both [Iceberg](../iceberg-format/) format and [Mixed-Iceberg](../mixed-iceberg-format/) format. -## Ingest into one table -### Iceberg format -The following example will show how [MySQL CDC](https://nightlies.apache.org/flink/flink-cdc-docs-release-3.1/docs/) data is written to an Iceberg table. - -**Requirements** - -Please add [Flink SQL Connector MySQL CDC](https://mvnrepository.com/artifact/org.apache.flink/flink-connector-mysql-cdc) and [Iceberg](https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-flink-1.18/1.6.0/iceberg-flink-1.18-1.6.0.jar) Jars to the lib directory of the Flink engine package. - -```sql -CREATE TABLE products ( - id INT, - name STRING, - description STRING, - PRIMARY KEY (id) NOT ENFORCED -) WITH ( - 'connector' = 'mysql-cdc', - 'hostname' = 'localhost', - 'port' = '3306', - 'username' = 'root', - 'password' = '123456', - 'database-name' = 'mydb', - 'table-name' = 'products' -); - -CREATE CATALOG iceberg_hadoop_catalog WITH ( - 'type'='iceberg', - 'catalog-type'='hadoop', - 'warehouse'='hdfs://nn:8020/warehouse/path', - 'property-version'='1' -); - -CREATE TABLE IF NOT EXISTS `iceberg_hadoop_catalog`.`default`.`sample` ( - id INT, - name STRING, - description STRING, - PRIMARY KEY (id) NOT ENFORCED -); - -INSERT INTO `iceberg_hadoop_catalog`.`default`.`sample` SELECT * FROM products; -``` - -### Mixed-Iceberg format -The following example will show how MySQL CDC data is written to a Mixed-Iceberg table. - -**Requirements** - -Please add [Flink SQL Connector MySQL CDC](https://mvnrepository.com/artifact/org.apache.flink/flink-connector-mysql-cdc/3.1.1) and [Amoro](../../../download/) Jars to the lib directory of the Flink engine package. - -```sql -CREATE TABLE products ( - id INT, - name STRING, - description STRING, - PRIMARY KEY (id) NOT ENFORCED -) WITH ( - 'connector' = 'mysql-cdc', - 'hostname' = 'localhost', - 'port' = '3306', - 'username' = 'root', - 'password' = '123456', - 'database-name' = 'mydb', - 'table-name' = 'products' -); - -CREATE CATALOG amoro_catalog WITH ( - 'type'='amoro', - 'metastore.url'='thrift://:/' -); - -CREATE TABLE IF NOT EXISTS `amoro_catalog`.`db`.`test_tb`( - id INT, - name STRING, - description STRING, - PRIMARY KEY (id) NOT ENFORCED -); - -INSERT INTO `amoro_catalog`.`db`.`test_tb` SELECT * FROM products; -``` - -## Ingest Into multiple tables -### Iceberg format -The following example will show how to write CDC data from multiple MySQL tables into the corresponding Iceberg table. - -**Requirements** - -Please add [Flink Connector MySQL CDC](https://mvnrepository.com/artifact/org.apache.flink/flink-connector-mysql-cdc/3.1.1) -and [Iceberg](https://mvnrepository.com/artifact/org.apache.iceberg/iceberg-flink-1.18/1.6.0) dependencies to your -Maven project's pom.xml file. 
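A sketch of the corresponding `pom.xml` entries is shown below. The coordinates and versions are taken from the links above (Flink CDC 3.1.1 and Iceberg Flink 1.18 / 1.6.0) and may need to be aligned with your own Flink release:

```xml
<!-- MySQL CDC connector for DataStream jobs (version from the link above) -->
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-mysql-cdc</artifactId>
    <version>3.1.1</version>
</dependency>
<!-- Iceberg Flink integration for Flink 1.18 (version from the link above) -->
<dependency>
    <groupId>org.apache.iceberg</groupId>
    <artifactId>iceberg-flink-1.18</artifactId>
    <version>1.6.0</version>
</dependency>
```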
- -```java -import static java.util.stream.Collectors.toMap; -import static org.apache.flink.cdc.connectors.mysql.table.MySqlReadableMetadata.DATABASE_NAME; -import static org.apache.flink.cdc.connectors.mysql.table.MySqlReadableMetadata.TABLE_NAME; - -import org.apache.amoro.shade.guava32.com.google.common.collect.Maps; -import org.apache.flink.api.common.eventtime.WatermarkStrategy; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.java.tuple.Tuple2; -import org.apache.flink.cdc.connectors.mysql.source.MySqlSource; -import org.apache.flink.cdc.connectors.mysql.table.MySqlDeserializationConverterFactory; -import org.apache.flink.cdc.debezium.DebeziumDeserializationSchema; -import org.apache.flink.cdc.debezium.table.MetadataConverter; -import org.apache.flink.cdc.debezium.table.RowDataDebeziumDeserializeSchema; -import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.streaming.api.functions.ProcessFunction; -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.api.Schema; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.catalog.CatalogTable; -import org.apache.flink.table.catalog.Column; -import org.apache.flink.table.catalog.ObjectPath; -import org.apache.flink.table.catalog.ResolvedCatalogTable; -import org.apache.flink.table.catalog.ResolvedSchema; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.conversion.RowRowConverter; -import org.apache.flink.table.data.utils.JoinedRowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.util.Collector; -import org.apache.flink.util.OutputTag; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.flink.CatalogLoader; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.sink.FlinkSink; -import org.apache.kafka.connect.data.Struct; -import org.apache.kafka.connect.source.SourceRecord; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -public class MySqlCDC2IcebergExample { - public static void main(String[] args) throws Exception { - List> pathAndTable = initSourceTables(); - Map debeziumDeserializeSchemas = getDebeziumDeserializeSchemas(pathAndTable); - MySqlSource mySqlSource = MySqlSource.builder() - .hostname("yourHostname") - .port(yourPort) - .databaseList("test_db") - // setting up tables to be captured - .tableList("test_db.user", "test_db.product") - .username("yourUsername") - .password("yourPassword") - .deserializer(new CompositeDebeziumDeserializationSchema(debeziumDeserializeSchemas)) - .build(); - - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - - // enable checkpoint - env.enableCheckpointing(60000); - - // Split CDC streams by table name - SingleOutputStreamOperator process = env - .fromSource(mySqlSource, WatermarkStrategy.noWatermarks(), "MySQL Source") - .setParallelism(4) - .process(new SplitCdcStreamFunction(pathAndTable.stream() - .collect(toMap(e -> e.f0.toString(), - e -> 
RowRowConverter.create(e.f1.getResolvedSchema().toPhysicalRowDataType()))))) - .name("split stream"); - - // create Iceberg sink and insert into CDC data - Map properties = Maps.newHashMap(); - properties.put(CatalogProperties.WAREHOUSE_LOCATION, "yourWarehouseLocation"); - properties.put(CatalogProperties.URI, "yourThriftUri"); - CatalogLoader catalogLoader = CatalogLoader.hadoop("hadoop_catalog", new Configuration(), properties); - Catalog icebergHadoopCatalog = catalogLoader.loadCatalog(); - Map sinkTableSchemas = new HashMap<>(); - sinkTableSchemas.put("user", TableSchema.builder().field("id", DataTypes.INT()) - .field("name", DataTypes.STRING()).field("op_time", DataTypes.TIMESTAMP()).build()); - sinkTableSchemas.put("product", TableSchema.builder().field("productId", DataTypes.INT()) - .field("price", DataTypes.DECIMAL(12, 6)).field("saleCount", DataTypes.INT()).build()); - - for (Map.Entry entry : sinkTableSchemas.entrySet()) { - TableIdentifier identifier = TableIdentifier.of(Namespace.of("test_db"), entry.getKey()); - Table table = icebergHadoopCatalog.loadTable(identifier); - TableLoader tableLoader = TableLoader.fromCatalog(catalogLoader, identifier); - - FlinkSink.forRowData(process.getSideOutput(new OutputTag(entry.getKey()){})) - .tableLoader(tableLoader) - .table(table) - .append(); - } - - env.execute("Sync MySQL to the Iceberg table"); - } - - static class CompositeDebeziumDeserializationSchema - implements DebeziumDeserializationSchema { - - private final Map deserializationSchemaMap; - - public CompositeDebeziumDeserializationSchema( - final Map deserializationSchemaMap) { - this.deserializationSchemaMap = deserializationSchemaMap; - } - - @Override - public void deserialize(final SourceRecord record, final Collector out) - throws Exception { - final Struct value = (Struct) record.value(); - final Struct source = value.getStruct("source"); - final String db = source.getString("db"); - final String table = source.getString("table"); - if (deserializationSchemaMap == null) { - throw new IllegalStateException("deserializationSchemaMap can not be null!"); - } - deserializationSchemaMap.get(db + "." + table).deserialize(record, out); - } - - @Override - public TypeInformation getProducedType() { - return TypeInformation.of(RowData.class); - } - } - - static class SplitCdcStreamFunction extends ProcessFunction { - private final Map converters; - - public SplitCdcStreamFunction(final Map converterMap) { - this.converters = converterMap; - } - - @Override - public void processElement(final RowData rowData, - final ProcessFunction.Context ctx, final Collector out) - throws Exception { - // JoinedRowData like +I{row1=+I(1,2.340000,3), row2=+I(product,test_db)} - // so rowData.getArity() - 2 is the tableName field index - final String tableName = rowData.getString(rowData.getArity() - 2).toString(); - ctx.output(new OutputTag(tableName) {}, - getField(JoinedRowData.class, (JoinedRowData) rowData, "row1")); - } - - private static V getField(Class clazz, O obj, String fieldName) { - try { - java.lang.reflect.Field field = clazz.getDeclaredField(fieldName); - field.setAccessible(true); - Object v = field.get(obj); - return v == null ? 
null : (V) v; - } catch (NoSuchFieldException | IllegalAccessException e) { - throw new RuntimeException(e); - } - } - } - - private static List> initSourceTables() { - List> pathAndTable = new ArrayList<>(); - // build table "user" - Schema userSchema = Schema.newBuilder() - .column("id", DataTypes.INT().notNull()) - .column("name", DataTypes.STRING()) - .column("op_time", DataTypes.TIMESTAMP()) - .primaryKey("id") - .build(); - List userTableCols = Stream.of( - Column.physical("id", DataTypes.INT().notNull()), - Column.physical("name", DataTypes.STRING()), - Column.physical("op_time", DataTypes.TIMESTAMP())).collect(Collectors.toList()); - Schema.UnresolvedPrimaryKey userPrimaryKey = userSchema.getPrimaryKey().orElseThrow(() -> new RuntimeException("table user required pk ")); - ResolvedSchema userResolvedSchema = new ResolvedSchema(userTableCols, Collections.emptyList(), UniqueConstraint.primaryKey( - userPrimaryKey.getConstraintName(), userPrimaryKey.getColumnNames())); - ResolvedCatalogTable userTable = new ResolvedCatalogTable( - CatalogTable.of(userSchema, "", Collections.emptyList(), new HashMap<>()), userResolvedSchema); - pathAndTable.add(Tuple2.of(new ObjectPath("test_db", "user"), userTable)); - - // build table "product" - Schema productSchema = Schema.newBuilder() - .column("productId", DataTypes.INT().notNull()) - .column("price", DataTypes.DECIMAL(12, 6)) - .column("saleCount", DataTypes.INT()) - .primaryKey("productId") - .build(); - List productTableCols = Stream.of( - Column.physical("productId", DataTypes.INT().notNull()), - Column.physical("price", DataTypes.DECIMAL(12, 6)), - Column.physical("saleCount", DataTypes.INT())).collect(Collectors.toList()); - Schema.UnresolvedPrimaryKey productPrimaryKey = productSchema.getPrimaryKey().orElseThrow(() -> new RuntimeException("table product required pk ")); - ResolvedSchema productResolvedSchema = new ResolvedSchema(productTableCols, Collections.emptyList(), UniqueConstraint.primaryKey( - productPrimaryKey.getConstraintName(), productPrimaryKey.getColumnNames())); - ResolvedCatalogTable productTable = new ResolvedCatalogTable( - CatalogTable.of(productSchema, "", Collections.emptyList(), new HashMap<>()), productResolvedSchema); - pathAndTable.add(Tuple2.of(new ObjectPath("test_db", "product"), productTable)); - return pathAndTable; - } - - private static Map getDebeziumDeserializeSchemas( - final List> pathAndTable) { - return pathAndTable.stream() - .collect(toMap(e -> e.f0.toString(), e -> RowDataDebeziumDeserializeSchema.newBuilder() - .setPhysicalRowType( - (RowType) e.f1.getResolvedSchema().toPhysicalRowDataType().getLogicalType()) - .setUserDefinedConverterFactory(MySqlDeserializationConverterFactory.instance()) - .setMetadataConverters( - new MetadataConverter[] {TABLE_NAME.getConverter(), DATABASE_NAME.getConverter()}) - .setResultTypeInfo(TypeInformation.of(RowData.class)).build())); - } -} -``` - -### Mixed-Iceberg format -The following example will show how to write CDC data from multiple MySQL tables into the corresponding Mixed-Iceberg table. - -**Requirements** - -Please add [Flink Connector MySQL CDC](https://mvnrepository.com/artifact/org.apache.flink/flink-connector-mysql-cdc/3.1.1) and [Amoro](https://mvnrepository.com/artifact/org.apache.amoro/amoro-format-mixed-flink-1.17/0.7.0-incubating) dependencies to your Maven project's pom.xml file. 
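As in the Iceberg example, a sketch of the `pom.xml` entries follows, with coordinates taken from the links in this section; adjust the versions to match your Flink and Amoro releases:

```xml
<!-- MySQL CDC connector for DataStream jobs (version from the link above) -->
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-mysql-cdc</artifactId>
    <version>3.1.1</version>
</dependency>
<!-- Mixed-Format Flink runtime for Flink 1.17 (version from the link above) -->
<dependency>
    <groupId>org.apache.amoro</groupId>
    <artifactId>amoro-format-mixed-flink-1.17</artifactId>
    <version>0.7.0-incubating</version>
</dependency>
```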
- -```java -import org.apache.amoro.flink.InternalCatalogBuilder; -import org.apache.amoro.flink.table.MixedFormatTableLoader; -import org.apache.amoro.flink.util.MixedFormatUtils; -import org.apache.amoro.flink.write.FlinkSink; -import org.apache.amoro.table.TableIdentifier; -import org.apache.flink.api.common.eventtime.WatermarkStrategy; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.java.tuple.Tuple2; -import org.apache.flink.cdc.connectors.mysql.source.MySqlSource; -import org.apache.flink.cdc.connectors.mysql.table.MySqlDeserializationConverterFactory; -import org.apache.flink.cdc.debezium.DebeziumDeserializationSchema; -import org.apache.flink.cdc.debezium.table.MetadataConverter; -import org.apache.flink.cdc.debezium.table.RowDataDebeziumDeserializeSchema; -import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.streaming.api.functions.ProcessFunction; -import org.apache.flink.table.api.*; -import org.apache.flink.table.catalog.*; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.conversion.RowRowConverter; -import org.apache.flink.table.data.utils.JoinedRowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.util.Collector; -import org.apache.flink.util.OutputTag; -import org.apache.kafka.connect.data.Struct; -import org.apache.kafka.connect.source.SourceRecord; - -import java.util.*; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -import static java.util.stream.Collectors.toMap; -import static org.apache.flink.cdc.connectors.mysql.table.MySqlReadableMetadata.DATABASE_NAME; -import static org.apache.flink.cdc.connectors.mysql.table.MySqlReadableMetadata.TABLE_NAME; - -public class MySqlCDC2MixedIcebergExample { - public static void main(String[] args) throws Exception { - List> pathAndTable = initSourceTables(); - Map debeziumDeserializeSchemas = getDebeziumDeserializeSchemas( - pathAndTable); - MySqlSource mySqlSource = MySqlSource.builder() - .hostname("yourHostname") - .port(3306) - .databaseList("test_db") - // setting up tables to be captured - .tableList("test_db.user", "test_db.product") - .username("yourUsername") - .password("yourPassword") - .deserializer(new CompositeDebeziumDeserializationSchema(debeziumDeserializeSchemas)) - .build(); - - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - - // enable checkpoint - env.enableCheckpointing(60000); - - // Split CDC streams by table name - SingleOutputStreamOperator process = env - .fromSource(mySqlSource, WatermarkStrategy.noWatermarks(), "MySQL Source").setParallelism(4) - .process(new SplitCdcStreamFunction(pathAndTable.stream() - .collect(toMap(e -> e.f0.toString(), - e -> RowRowConverter.create(e.f1.getResolvedSchema().toPhysicalRowDataType()))))) - .name("split stream"); - - // create Amoro sink and insert into cdc data - InternalCatalogBuilder catalogBuilder = InternalCatalogBuilder.builder().metastoreUrl( - "thrift://:/"); - Map sinkTableSchemas = new HashMap<>(); - sinkTableSchemas.put("user", TableSchema.builder().field("id", DataTypes.INT()) - .field("name", DataTypes.STRING()).field("op_time", DataTypes.TIMESTAMP()).build()); - sinkTableSchemas.put("product", TableSchema.builder().field("productId", DataTypes.INT()) - .field("price", DataTypes.DECIMAL(12, 6)).field("saleCount", DataTypes.INT()).build()); - - for (Map.Entry 
entry : sinkTableSchemas.entrySet()) { - TableIdentifier tableId = - TableIdentifier.of("yourCatalogName", "yourDatabaseName", entry.getKey()); - MixedFormatTableLoader tableLoader = MixedFormatTableLoader.of(tableId, catalogBuilder); - - FlinkSink.forRowData(process.getSideOutput(new OutputTag(entry.getKey()) { - })) - .flinkSchema(entry.getValue()) - .table(MixedFormatUtils.loadMixedTable(tableLoader)) - .tableLoader(tableLoader).build(); - } - - env.execute("Sync MySQL to Mixed-Iceberg table"); - } - - static class CompositeDebeziumDeserializationSchema - implements DebeziumDeserializationSchema { - - private final Map deserializationSchemaMap; - - public CompositeDebeziumDeserializationSchema( - final Map deserializationSchemaMap) { - this.deserializationSchemaMap = deserializationSchemaMap; - } - - @Override - public void deserialize(final SourceRecord record, final Collector out) - throws Exception { - final Struct value = (Struct) record.value(); - final Struct source = value.getStruct("source"); - final String db = source.getString("db"); - final String table = source.getString("table"); - if (deserializationSchemaMap == null) { - throw new IllegalStateException("deserializationSchemaMap can not be null!"); - } - deserializationSchemaMap.get(db + "." + table).deserialize(record, out); - } - - @Override - public TypeInformation getProducedType() { - return TypeInformation.of(RowData.class); - } - } - - static class SplitCdcStreamFunction extends ProcessFunction { - private final Map converters; - - public SplitCdcStreamFunction(final Map converterMap) { - this.converters = converterMap; - } - - @Override - public void processElement(final RowData rowData, - final ProcessFunction.Context ctx, final Collector out) - throws Exception { - // JoinedRowData like +I{row1=+I(1,2.340000,3), row2=+I(product,test_db)} - // so rowData.getArity() - 2 is the tableName field index - final String tableName = rowData.getString(rowData.getArity() - 2).toString(); - ctx.output(new OutputTag(tableName) { - }, - getField(JoinedRowData.class, (JoinedRowData) rowData, "row1")); - } - - private static V getField(Class clazz, O obj, String fieldName) { - try { - java.lang.reflect.Field field = clazz.getDeclaredField(fieldName); - field.setAccessible(true); - Object v = field.get(obj); - return v == null ? 
null : (V) v; - } catch (NoSuchFieldException | IllegalAccessException e) { - throw new RuntimeException(e); - } - } - } - - private static List> initSourceTables() { - List> pathAndTable = new ArrayList<>(); - // build table "user" - Schema userSchema = Schema.newBuilder() - .column("id", DataTypes.INT().notNull()) - .column("name", DataTypes.STRING()) - .column("op_time", DataTypes.TIMESTAMP()) - .primaryKey("id") - .build(); - List userTableCols = Stream.of( - Column.physical("id", DataTypes.INT().notNull()), - Column.physical("name", DataTypes.STRING()), - Column.physical("op_time", DataTypes.TIMESTAMP())).collect(Collectors.toList()); - Schema.UnresolvedPrimaryKey userPrimaryKey = userSchema.getPrimaryKey().orElseThrow(() -> new RuntimeException("table user required pk ")); - ResolvedSchema userResolvedSchema = new ResolvedSchema(userTableCols, Collections.emptyList(), UniqueConstraint.primaryKey( - userPrimaryKey.getConstraintName(), userPrimaryKey.getColumnNames())); - ResolvedCatalogTable userTable = new ResolvedCatalogTable( - CatalogTable.of(userSchema, "", Collections.emptyList(), new HashMap<>()), userResolvedSchema); - pathAndTable.add(Tuple2.of(new ObjectPath("test_db", "user"), userTable)); - - // build table "product" - Schema productSchema = Schema.newBuilder() - .column("productId", DataTypes.INT().notNull()) - .column("price", DataTypes.DECIMAL(12, 6)) - .column("saleCount", DataTypes.INT()) - .primaryKey("productId") - .build(); - List productTableCols = Stream.of( - Column.physical("productId", DataTypes.INT().notNull()), - Column.physical("price", DataTypes.DECIMAL(12, 6)), - Column.physical("saleCount", DataTypes.INT())).collect(Collectors.toList()); - Schema.UnresolvedPrimaryKey productPrimaryKey = productSchema.getPrimaryKey().orElseThrow(() -> new RuntimeException("table product required pk ")); - ResolvedSchema productResolvedSchema = new ResolvedSchema(productTableCols, Collections.emptyList(), UniqueConstraint.primaryKey( - productPrimaryKey.getConstraintName(), productPrimaryKey.getColumnNames())); - ResolvedCatalogTable productTable = new ResolvedCatalogTable( - CatalogTable.of(productSchema, "", Collections.emptyList(), new HashMap<>()), productResolvedSchema); - pathAndTable.add(Tuple2.of(new ObjectPath("test_db", "product"), productTable)); - return pathAndTable; - } - - private static Map getDebeziumDeserializeSchemas( - final List> pathAndTable) { - return pathAndTable.stream() - .collect(toMap(e -> e.f0.toString(), e -> RowDataDebeziumDeserializeSchema.newBuilder() - .setPhysicalRowType( - (RowType) e.f1.getResolvedSchema().toPhysicalRowDataType().getLogicalType()) - .setUserDefinedConverterFactory(MySqlDeserializationConverterFactory.instance()) - .setMetadataConverters( - new MetadataConverter[]{TABLE_NAME.getConverter(), DATABASE_NAME.getConverter()}) - .setResultTypeInfo(TypeInformation.of(RowData.class)).build())); - } -} -``` \ No newline at end of file diff --git a/amoro-docs/content/engines/flink/flink-ddl.md b/amoro-docs/content/engines/flink/flink-ddl.md deleted file mode 100644 index 3978e41..0000000 --- a/amoro-docs/content/engines/flink/flink-ddl.md +++ /dev/null @@ -1,246 +0,0 @@ ---- -title: "Flink DDL" -url: flink-ddl -aliases: - - "flink/ddl" -menu: - main: - parent: Flink - weight: 200 ---- - -# Flink DDL - -## Create catalogs - -### Flink SQL -The following statement can be executed to create a Flink catalog: - -```sql -CREATE CATALOG WITH ( - 'type'='mixed_iceberg', - ``=`` -); -``` - -Where `` is the user-defined name of the Flink 
catalog, and ``=`` has the following configurations: - -| Key | Default Value | Type | Required | Description | -|------------------|---------------|---------|----------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| type | N/A | String | Yes | Catalog type, validate values are mixed_iceberg and mixed_hive | -| metastore.url | (none) | String | No | The URL for Amoro Metastore is thrift://``:``/``.
If high availability is enabled for AMS, it can also be specified in the form of zookeeper://{zookeeper-server}/{cluster-name}/{catalog-name}. | -| default-database | default | String | No | The default database to use | -| property-version | 1 | Integer | No | Catalog properties version, this option is for future backward compatibility | -| catalog-type | N/A | String | No | Metastore type of the catalog, validate values are hadoop, hive, rest, custom | - -The authentication information of AMS catalog can upload configuration files on AMS website, -or specify the authentication information and configuration file paths when creating catalogs with Flink DDL - -| Key | Default Value | Type | Required | Description | -|----------------------------------------|---------------|---------|----------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| properties.auth.load-from-ams | True | BOOLEAN | No | Whether to load security verification configuration from AMS.
True: load from AMS;
false: do not use AMS configuration.
Note: regardless of whether this parameter is configured, as long as the user has configured the auth.*** related parameters below, this configuration will be used for access. | -| properties.auth.type | (none) | String | No | Table security verification type, valid values: simple, kerberos, or not configured. Default not configured, no permission check is required. simple: use the hadoop username, used in conjunction with the parameter 'properties.auth.simple.hadoop_username'; kerberos: configure kerberos permission verification, used in conjunction with the parameters 'properties.auth.kerberos.principal', 'properties.auth.kerberos.keytab', 'properties.auth.kerberos.krb' | -| properties.auth.simple.hadoop_username | (none) | String | No | Access using this hadoop username, required when 'properties.auth.type'='simple'. | -| properties.auth.kerberos.principal | (none) | String | No | Configuration of kerberos principal, required when 'properties.auth.type'='kerberos'. | -| properties.auth.kerberos.krb.path | (none) | String | No | The absolute path to the krb5.conf configuration file for kerberos (the local file path of the Flink SQL submission machine, if the SQL task is submitted with the Flink SQL Client, the path is the local path of the same node, e.g. /XXX/XXX/krb5.conf).' required if 'properties.auth.type' = 'kerberos'. | -| properties.auth.kerberos.keytab.path | (none) | String | No | The absolute path to the XXX.keytab configuration file for kerberos (the local file path of the Flink SQL submission machine, if the SQL task is submitted with the Flink SQL Client, the path is the local path of the same node, e.g. /XXX/XXX/XXX.keytab).' required if 'properties.auth.type' = 'kerberos'. | - - -### YAML configuration -Refer to the Flink SQL Client [official configuration](https://nightlies.apache.org/flink/flink-docs-release-1.12/dev/table/sqlClient.html#environment-files). -Modify the `conf/sql-client-defaults.yaml` file in the Flink directory. -```yaml -catalogs: -- name: - type: mixed_iceberg - metastore.url: ... - ... -``` - -## CREATE statement - -### CREATE DATABASE -By default, the default-database configuration (default value: default) when creating catalog is used. You can create a database using the following example: - -```sql -CREATE DATABASE [catalog_name.]mixed_db; - -USE mixed_db; -``` - -### CREATE TABLE - -```sql -CREATE TABLE `mixed_catalog`.`mixed_db`.`test_table` ( - id BIGINT, - name STRING, - op_time TIMESTAMP, - ts3 AS CAST(op_time as TIMESTAMP(3)), - watermark FOR ts3 AS ts3 - INTERVAL '5' SECOND, - proc AS PROCTIME(), - PRIMARY KEY (id) NOT ENFORCED -) WITH ( - 'key' = 'value' -); -``` - -Currently, most of the syntax supported by [Flink SQL create table](https://nightlies.apache.org/flink/flink-docs-release-1.12/dev/table/sql/create.html#create-table) is supported, including: - -- PARTITION BY (column1, column2, …): configure Flink partition fields, but Flink does not yet support hidden partitions. -- PRIMARY KEY (column1, column2, …): configure primary keys. -- WITH ('key'='value', …): configure Amoro Table properties. -- computed_column_definition: column_name AS computed_column_expression. Currently, compute column must be listed after all physical columns. -- watermark_definition: WATERMARK FOR rowtime_column_name AS watermark_strategy_expression, rowtime_column_name must be of type TIMESTAMP(3). - -#### PARTITIONED BY -Create a partitioned table using PARTITIONED BY. 
-```sql -CREATE TABLE `mixed_catalog`.`new`.`test_table` ( - id BIGINT, - name STRING, - op_time TIMESTAMP -) PARTITIONED BY(op_time) WITH ( - 'key' = 'value' -); -``` -Amoro tables support hidden partitions, but Flink does not support function-based partitions. Therefore, currently only partitions with the same value can be created through Flink SQL. - -Alternatively, tables can be created without creating a Flink catalog: -```sql -CREATE TABLE `test_table` ( - id BIGINT, - name STRING, - op_time TIMESTAMP, - proc as PROCTIME(), - PRIMARY KEY (id) NOT ENFORCED -) WITH ( - 'connector' = 'mixed-format', - 'metastore.url' = '', - 'mixed_format.catalog' = '', - 'mixed_format.database' = '', - 'mixed_format.table' = '' -); -``` -where `` is the URL of the Amoro Metastore, and `mixed_format.catalog`, `mixed_format.database` and `mixed_format.table` are the catalog name, database name and table name of this table under the AMS, respectively. - -### CREATE TABLE LIKE -Create a table with the same table structure, partitions, and table properties as an existing table. This can be achieved using CREATE TABLE LIKE. - -```sql -CREATE TABLE `mixed_catalog`.`mixed_db`.`test_table` ( - id BIGINT, - name STRING, - op_time TIMESTAMP -); - -CREATE TABLE `mixed_catalog`.`mixed_db`.`test_table_like` - LIKE `mixed_catalog`.`mixed_db`.`test_table`; -``` -Further details can be found in [Flink create table like](https://nightlies.apache.org/flink/flink-docs-release-1.12/dev/table/sql/create.html#like) - -## DROP statement - -### DROP DATABASE - -```sql -DROP DATABASE catalog_name.mixed_db -``` - -### DROP TABLE -```sql -DROP TABLE `mixed_catalog`.`mixed_db`.`test_table`; -``` - -## SHOW statement - -### SHOW DATABASES -View all database names under the current catalog: -```sql -SHOW DATABASES; -``` - -### SHOW TABLES -View all table names in the current database: -```sql -SHOW TABLES; -``` - -### SHOW CREATE TABLE -View table details: -```sql -SHOW CREATE TABLE; -``` - -## DESC statement -View table description: -```sql -DESC TABLE; -``` - -## ALTER statement -Not supported at the moment - -## Supported types - -### Mixed-Hive data types - -| Flink Data Type | Hive Data Type | -|-----------------|----------------| -| STRING | CHAR(p) | -| STRING | VARCHAR(p) | -| STRING | STRING | -| BOOLEAN | BOOLEAN | -| INT | TINYINT | -| INT | SMALLINT | -| INT | INT | -| BIGINT | BIGINT | -| FLOAT | FLOAT | -| DOUBLE | DOUBLE | -| DECIMAL(p, s) | DECIMAL(p, s) | -| DATE | DATE | -| TIMESTAMP(6) | TIMESTAMP | -| VARBINARY | BINARY | -| ARRAY | ARRAY | -| MAP | MAP | -| ROW | STRUCT | - - -### mixed_iceberg data types -| Flink Data Type | Mixed-Iceberg Data Type | -|-----------------------------------|-------------------------| -| CHAR(p) | STRING | -| VARCHAR(p) | STRING | -| STRING | STRING | -| BOOLEAN | BOOLEAN | -| TINYINT | INT | -| SMALLINT | INT | -| INT | INT | -| BIGINT | LONG | -| FLOAT | FLOAT | -| DOUBLE | DOUBLE | -| DECIMAL(p, s) | DECIMAL(p, s) | -| DATE | DATE | -| TIMESTAMP(6) | TIMESTAMP | -| TIMESTAMP(6) WITH LOCAL TIME ZONE | TIMESTAMPTZ | -| BINARY(p) | FIXED(p) | -| BINARY(16) | UUID | -| VARBINARY | BINARY | -| ARRAY | ARRAY | -| MAP | MAP | -| ROW | STRUCT | -| MULTISET | MAP | \ No newline at end of file diff --git a/amoro-docs/content/engines/flink/flink-dml.md b/amoro-docs/content/engines/flink/flink-dml.md deleted file mode 100644 index bdc9f7f..0000000 --- a/amoro-docs/content/engines/flink/flink-dml.md +++ /dev/null @@ -1,264 +0,0 @@ ---- -title: "Flink DML" -url: flink-dml -aliases: - - 
"flink/dml" -menu: - main: - parent: Flink - weight: 300 ---- - -# Flink DML - -## Querying with SQL -Amoro tables support reading data in stream or batch mode through Flink SQL. You can switch modes using the following methods: -```sql --- Run Flink tasks in streaming mode in the current session -SET execution.runtime-mode = streaming; - --- Run Flink tasks in batch mode in the current session -SET execution.runtime-mode = batch; -``` - -The following Hint Options are supported: - -| Key | Default Value | Type | Required | Description | -|--------------------|---------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| source.parallelism | (none) | Integer | No | Defines a custom parallelism for the source. By default, if this option is not defined, the planner will derive the parallelism for each statement individually by also considering the global configuration. | - -### Batch mode -Use batch mode to read full and incremental data from FileStore. - -> **TIPS** -> -> LogStore does not support bounded reading. - -```sql --- Run Flink tasks in batch mode in the current session -SET execution.runtime-mode = batch; - --- Enable dynamic table parameter configuration to make hint options configured in Flink SQL effective -SET table.dynamic-table-options.enabled=true; -``` - -### Batch mode (non-primary key table) - -Non-primary key tables support reading full data in batch mode, specifying snapshot data with snapshot-id or timestamp, and specifying the incremental data of the snapshot interval. - -```sql --- Read full data -SELECT * FROM unkeyed /*+ OPTIONS('streaming'='false')*/; - --- Read specified snapshot data -SELECT * FROM unkeyed /*+ OPTIONS('snapshot-id'='4411985347497777546')*/; -``` -The supported parameters for bounded reads of non-primary-key tables in BaseStore include: - -| Key | Default Value | Type | Required | Description | -|-----------------------------|---------------|--------|----------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| snapshot-id | (none) | Long | No | Reading the full data of a specific snapshot only works when streaming is set to false. | -| as-of-timestamp | (none) | Long | No | Reading the full data of the latest snapshot taken before the specified timestamp only works when streaming is set to false. 
| -| start-snapshot-id | (none) | Long | No | When streaming is set to false, you need to specify the end-snapshot-id to read the incremental data within two intervals (snapshot1, snapshot2]. When streaming is set to true, you can read the incremental data after the specified snapshot. If not specified, it will read the incremental data after the current snapshot (excluding the current one). | -| end-snapshot-id | (none) | Long | No | When streaming is set to false, you need to specify the start-snapshot-id to read the incremental data within two intervals (snapshot1, snapshot2]. | -| other table parameters | (none) | String | No | All parameters of an Amoro table can be dynamically modified through SQL Hint, but only for the current task. For a list of specific parameters, please refer to [Table Configurations](../configurations/). For permissions-related configurations on the catalog, they can also be configured in Hint using parameters such as [properties.auth.XXX in catalog DDL](../flink-ddl/#flink-sql) | - -### Batch mode (primary key table) -```sql --- Merge on Read the current mixed-format table and return append-only data. -SELECT * FROM keyed /*+ OPTIONS('streaming'='false', 'scan.startup.mode'='earliest')*/; -``` - -### Streaming mode -Amoro supports reading incremental data from FileStore or LogStore in streaming mode. - -### Streaming mode (LogStore) - -```sql --- Run Flink tasks in streaming mode in the current session -SET execution.runtime-mode = streaming; - --- Enable dynamic table parameter configuration to make hint options configured in Flink SQL effective -SET table.dynamic-table-options.enabled=true; - -SELECT * FROM test_table /*+ OPTIONS('mixed-format.read.mode'='log') */; -``` -The following Hint Options are supported: - -| Key | Default Value | Type | Required | Description | -|------------------------------------|---------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| mixed-format.read.mode | file | String | No | To specify the type of data to read from an Amoro table, either File or Log, use the mixed-format.read.mode parameter. If the value is set to log, the Log configuration must be enabled. | -| scan.startup.mode | latest | String | No | The valid values are 'earliest', 'latest', 'timestamp', 'group-offsets' and 'specific-offsets'.
'earliest' reads the data from the earliest offset possible.
'latest' reads the data from the latest offset.
'timestamp' reads from a specified time position, which requires configuring the 'scan.startup.timestamp-millis' parameter.
'group-offsets' reads the data from committed offsets in ZK / Kafka brokers of a specific consumer group.
'specific-offsets' read the data from user-supplied specific offsets for each partition, which requires configuring the 'scan.startup.specific-offsets' parameter. | -| scan.startup.timestamp-millis | (none) | Long | No | Valid when 'scan.startup.mode' = 'timestamp', reads data from the specified Kafka time with a millisecond timestamp starting at 00:00:00.000 GMT on 1 Jan 1970 | -| scan.startup.specific-offsets | (none) | String | No | specify offsets for each partition in case of 'specific-offsets' startup mode, e.g. 'partition:0,offset:42;partition:1,offset:300'. | -| properties.group.id | (none) | String | If the LogStore for an Amoro table is Kafka, it is mandatory to provide its details while querying the table. Otherwise, it can be left empty. | The group id used to read the Kafka Topic | -| properties.pulsar.admin.adminUrl | (none) | String | Required if LogStore is pulsar, otherwise not required | Pulsar admin's HTTP URL, e.g. http://my-broker.example.com:8080 | -| properties.* | (none) | String | No | Parameters for Logstore:
For LogStore with Kafka ('log-store.type'='kafka', the default value), all other parameters supported by the Kafka Consumer can be set by prefixing properties. to the parameter name, for example, 'properties.batch.size'='16384'. The complete parameter information can be found in the [Kafka official documentation](https://kafka.apache.org/documentation/#consumerconfigs);
For LogStore set to Pulsar ('log-store.type'='pulsar'), all relevant configurations supported by Pulsar can be set by prefixing properties. to the parameter name, for example: 'properties.pulsar.client.requestTimeoutMs'='60000'. For complete parameter information, refer to the [Flink-Pulsar-Connector documentation](https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/connectors/datastream/pulsar) | -| log.consumer.changelog.modes | all-kinds | String | No | The type of RowKind that will be generated when reading log data, supports: all-kinds, append-only.
all-kinds: will read CDC data, including +I/-D/-U/+U;
append-only: will only generate Insert data, recommended to use this configuration when reading without primary key. | - -> **Notes** -> -> - When log-store.type = pulsar, the parallelism of the Flink task cannot be less than the number of partitions in the Pulsar topic, otherwise some partition data cannot be read. -> - When the number of topic partitions in log-store is less than the parallelism of the Flink task, some Flink subtasks will be idle. At this time, if the task has a watermark, the parameter table.exec.source.idle-timeout must be configured, otherwise the watermark will not advance. See [official documentation](https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/dev/table/config/#table-exec-source-idle-timeout) for details. - - -### Streaming mode (FileStore non-primary key table) - -```sql --- Run Flink tasks in streaming mode in the current session -SET execution.runtime-mode = streaming; - --- Enable dynamic table parameter configuration to make hint options configured in Flink SQL effective -SET table.dynamic-table-options.enabled = true; - --- Read incremental data after the current snapshot. -SELECT * FROM unkeyed /*+ OPTIONS('monitor-interval'='1s')*/ ; -``` -Hint Options - -| Key | Default Value | Type | Required | Description | -|----------------------------------|---------------|----------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| streaming | true | Boolean | No | Reads bounded data or unbounded data in a streaming mode, false: reads bounded data, true: reads unbounded data | -| mixed-format.read.mode | file | String | No | To specify the type of data to read from an Amoro table, either File or Log, use the mixed-format.read.mode parameter. If the value is set to log, the Log configuration must be enabled. | -| monitor-interval | 10s | Duration | No | The mixed-format.read.mode = file parameter needs to be set for this to take effect. The time interval for monitoring newly added data files | -| start-snapshot-id | (none) | Long | No | To read incremental data starting from a specified snapshot (excluding the data in the start-snapshot-id snapshot), specify the snapshot ID using the start-snapshot-id parameter. If not specified, the reader will start reading from the snapshot after the current one (excluding the data in the current snapshot). | -| other table parameters | (none) | String | No | All parameters of an Amoro table can be dynamically modified through SQL Hints, but they only take effect for this specific task. For the specific parameter list, please refer to the [Table Configuration](../configurations/). For permissions-related configurations on the catalog, they can also be configured in Hint using parameters such as [properties.auth.XXX in catalog DDL](../flink-ddl/#flink-sql) | - -### Streaming Mode (FileStore primary key table) - - -After using CDC (Change Data Capture) to ingest data into the lake, you can use the Flink engine to read both stock data and incremental data in the same task without restarting the task, and ensure consistent data reading. Amoro Source will save the file offset information in the Flink state. 
- -In this way, the task can continue to read data from the last read offset position, ensuring data consistency and being able to process newly arrived incremental data. -```sql --- Run Flink tasks in streaming mode in the current session -SET execution.runtime-mode = streaming; - --- Enable dynamic table parameter configuration to make hint options configured in Flink SQL effective -SET table.dynamic-table-options.enabled = true; - --- Incremental unified reading of BaseStore and ChangeStore -SELECT * FROM keyed /*+ OPTIONS('streaming'='true', 'scan.startup.mode'='earliest')*/; -``` - -Hint Options - -| Key | Default Value | Type | Required | Description | -|------------------------|---------------|--------|----------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| streaming | true | String | No | Reads bounded data or unbounded data in a streaming mode, false: reads bounded data, true: reads unbounded data | -| mixed-format.read.mode | file | String | No | Specifies the data to read from an Amoro table, either file or log. If the value is "log", Log configuration must be enabled | -| monitor-interval | 10s | String | No | This parameter only takes effect when mixed-format.read.mode = file. It sets the time interval for monitoring newly added data files | -| scan.startup.mode | latest | String | No | The valid values are 'earliest', 'latest'. 'earliest' reads the full table data and will continue to read incremental data when streaming=true. 'latest' reads only the data after the current snapshot, not including the data in the current snapshot. | -| other table parameters | (none) | String | No | All parameters of an Amoro table can be dynamically modified through SQL Hints, but they only take effect for this specific task. For the specific parameter list, please refer to the [Table Configuration](../configurations/). For permissions-related configurations on the catalog, they can also be configured in Hint using parameters such as [properties.auth.XXX in catalog DDL](../flink-ddl/#flink-sql) | - -## Writing With SQL -Amoro tables support writing data to LogStore or FileStore using Flink SQL. -### INSERT OVERWRITE -Currently, INSERT OVERWRITE is only supported for non-primary key tables. It replaces the data in the table, and the overwrite operation is atomic. Partitions are dynamically generated from the query statement, and the data in these partitions will be fully replaced. - -INSERT OVERWRITE only allows running in Flink Batch mode. - -```sql -INSERT OVERWRITE unkeyed VALUES (1, 'a', '2022-07-01'); -``` - -```sql --- It is also possible to overwrite data for a specific partition: - -INSERT OVERWRITE `mixed_catalog`.`mixed_db`.`unkeyed` PARTITION(data='2022-07-01') SELECT 5, 'b'; -``` -For non-partitioned tables, INSERT OVERWRITE will overwrite the entire data in the table. -### INSERT INTO -For Amoro tables, it is possible to specify whether to write data to FileStore or LogStore. - -For Amoro primary key tables, writing to FileStore will also write CDC data to the ChangeStore. 
-```sql -INSERT INTO `mixed_catalog`.`mixed_db`.`test_table` - /*+ OPTIONS('mixed-format.emit.mode'='log,file') */ -SELECT id, name from `source`; -``` - -Hint Options - -| Key | Default Value | Type | Required | Description | -|--------------------------------------------------|---------------|----------|----------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| mixed-format.emit.mode | auto | String | No | Data writing modes currently supported are: file, log, and auto. For example: 'file' means data is only written to the Filestore. 'log' means data is only written to the Logstore. 'file,log' means data is written to both the Filestore and the Logstore. 'auto' means data is written only to the Filestore if the Logstore for the Amoro table is disabled. If the Logstore for the Amoro table is enabled, it means data is written to both the Filestore and the Logstore. It is recommended to use 'auto'. | -| mixed-format.emit.auto-write-to-logstore.watermark-gap | (none) | Duration | No | This feature is only enabled when 'mixed-format.emit.mode'='auto'. If the watermark of the Amoro writers is greater than the current system timestamp minus a specific value, the writers will also write data to the Logstore. The default setting is to enable the Logstore writer immediately after the job starts. The value for this feature must be greater than 0. | -| log.version | v1 | String | No | The log data format currently has only one version, so it can be left empty | -| sink.parallelism | (none) | String | No | The parallelism for writing to the Filestore and Logstore is determined separately. The parallelism for submitting the file operator is always 1. | -| write.distribution-mode | hash | String | No | The distribution modes for writing to the Amoro table include: none and hash. | -| write.distribution.hash-mode | auto | String | No | The hash strategy for writing to an Amoro table only takes effect when write.distribution-mode=hash. The available options are: primary-key, partition-key, primary-partition-key, and auto. primary-key: Shuffle by primary key partition-key: Shuffle by partition key primary-partition-key: Shuffle by primary key and partition key auto: If the table has both a primary key and partitions, use primary-partition-key; if the table has a primary key but no partitions, use primary-key; if the table has partitions but no primary key, use partition-key. Otherwise, use none. | -| properties.pulsar.admin.adminUrl | (none) | String | If the LogStore is Pulsar and it is required for querying, it must be filled in, otherwise it can be left empty. 
| The HTTP URL for Pulsar Admin is in the format: http://my-broker.example.com:8080. | -| properties.* | (none) | String | No | Parameters for Logstore: For Logstore with Kafka ('log-store.type'='kafka' default value), all other parameters supported by the Kafka Consumer can be set by prefixing properties. to the parameter name, for example, 'properties.batch.size'='16384'. The complete parameter information can be found in the [Kafka official documentation](https://kafka.apache.org/documentation/#consumerconfigs); For LogStore set to Pulsar ('log-store.type'='pulsar'), all relevant configurations supported by Pulsar can be set by prefixing properties. to the parameter name, for example: 'properties.pulsar.client.requestTimeoutMs'='60000'. For complete parameter information, refer to the [Flink-Pulsar-Connector documentation](https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/connectors/datastream/pulsar) | -| other table parameters | (none) | String | No | All parameters of an Amoro table can be dynamically modified through SQL Hints, but they only take effect for this specific task. For the specific parameter list, please refer to the [Table Configuration](../configurations/). For permissions-related configurations on the catalog, they can also be configured in Hint using parameters such as [properties.auth.XXX in catalog DDL](../flink-ddl/#flink-sql) | - -## Lookup join with SQL - -A Lookup Join is used to enrich a table with data that is queried from Amoro Table. The join requires one table to have a processing time attribute and the other table to be backed by a lookup source connector. - -The following example shows the syntax to specify a lookup join. - -```sql --- amoro flink connector and can be used for lookup joins -CREATE TEMPORARY TABLE Customers ( - id INT, - name STRING, - country STRING, - zip STRING -) WITH ( - 'connector' = 'mixed-format', - 'metastore.url' = '', - 'mixed-format.catalog' = '', - 'mixed-format.database' = '', - 'mixed-format.table' = '', - 'lookup.cache.max-rows' = '' -); - --- Create a temporary left table, like from kafka -CREATE TEMPORARY TABLE orders ( - order_id INT, - total INT, - customer_id INT, - proc_time AS PROCTIME() -) WITH ( - 'connector' = 'kafka', - 'topic' = '...', - 'properties.bootstrap.servers' = '...', - 'format' = 'json' - ... -); - --- enrich each order with customer information -SELECT o.order_id, o.total, c.country, c.zip -FROM Orders AS o -JOIN Customers FOR SYSTEM_TIME AS OF o.proc_time AS c -ON o.customer_id = c.id; -``` - -Lookup Options - -| Key | Default Value | Type | Required | Description | -|----------------------------------------------------|---------------|----------|----------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| lookup.cache.max-rows | 10000 | Long | No | The maximum number of rows in the lookup cache, beyond which the oldest row will expire. | -| lookup.reloading.interval | 10s | Duration | No | Configuration option for specifying the interval in seconds to reload lookup data in RocksDB. 
| -| lookup.cache.ttl-after-write | 0s | Duration | No | The TTL after which the row will expire in the lookup cache. | -| rocksdb.auto-compactions | false | Boolean | No | Enable automatic compactions during the initialization process. After the initialization completed, will enable the auto_compaction. | -| rocksdb.writing-threads | 5 | Int | No | Writing data into rocksDB thread number. | -| rocksdb.block-cache.capacity | 1048576 | Long | No | Use the LRUCache strategy for blocks, the size of the BlockCache can be configured based on your memory requirements and available system resources. | -| rocksdb.block-cache.numShardBits | -1 | Int | No | Use the LRUCache strategy for blocks. The cache is sharded to 2^numShardBits shards, by hash of the key. Default is -1, means it is automatically determined: every shard will be at least 512KB and number of shard bits will not exceed 6. | -| other table parameters | (none) | String | No | All parameters of an Amoro table can be dynamically modified through SQL Hints, but they only take effect for this specific task. For the specific parameter list, please refer to the [Table Configuration](../configurations/). For permissions-related configurations on the catalog, they can also be configured in Hint using parameters such as [properties.auth.XXX in catalog DDL](../flink-ddl/#flink-sql) | diff --git a/amoro-docs/content/engines/flink/flink-ds.md b/amoro-docs/content/engines/flink/flink-ds.md deleted file mode 100644 index de3af02..0000000 --- a/amoro-docs/content/engines/flink/flink-ds.md +++ /dev/null @@ -1,313 +0,0 @@ ---- -title: "Flink DataStream" -url: flink-datastream -aliases: - - "flink/datastream" -menu: - main: - parent: Flink - weight: 400 ---- - -# Flink DataStream - -## Add maven dependency -To add a dependency on Mixed-format flink connector in Maven, add the following to your pom.xml: -```xml - - ... - - org.apache.amoro - - amoro-format-mixed-flink-runtime-${flink.minor-version} - - ${amoro-format-mixed-flink.version} - - ... - -``` - -## Reading with DataStream -Amoro supports reading data in Batch or Streaming mode through Java API. - -### Batch mode -Using Batch mode to read the full and incremental data in the FileStore. - -- Non-primary key tables support reading full data in batch mode, snapshot data with a specified snapshot-id or timestamp, and incremental data with a specified snapshot interval. -- The primary key table temporarily only supports reading the current full amount and later CDC data. 
- -```java -import org.apache.amoro.flink.InternalCatalogBuilder; -import org.apache.amoro.flink.table.FlinkSource; -import org.apache.amoro.flink.table.MixedFormatTableLoader; -import org.apache.amoro.table.TableIdentifier; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.data.RowData; - -import java.util.HashMap; -import java.util.Map; - -public class Main { - public static void main(String[] args) throws Exception { - StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment(); - InternalCatalogBuilder catalogBuilder = - InternalCatalogBuilder - .builder() - .metastoreUrl("thrift://:/"); - - TableIdentifier tableId = TableIdentifier.of("catalog_name", "database_name", "test_table"); - MixedFormatTableLoader tableLoader = MixedFormatTableLoader.of(tableId, catalogBuilder); - - Map properties = new HashMap<>(); - // Default is true - properties.put("streaming", "false"); - - DataStream batch = - FlinkSource.forRowData() - .env(env) - .tableLoader(tableLoader) - .properties(properties) - .build(); - - // print all data read - batch.print(); - - // Submit and execute the task - env.execute("Test Mixed-format table batch read"); - } -} -``` - -The map properties contain below keys, **currently only valid for non-primary key tables**: - -| Key|Default|Type|Required|Description| -|--- |--- |--- |--- |--- | -|case-sensitive|false|Boolean|No|Case-sensitive| -|snapshot-id |(none)|Long|No|Read the full amount of data of the specified snapshot, only effective when streaming is false or not configured| -|as-of-timestamp|(none)|String|No|Read the last time less than the timestamp The full amount of snapshot data is valid only when streaming is false or not configured | -|start-snapshot-id|(none)|String|No| When streaming is false, end-snapshot-id needs to be used to read the two intervals Incremental data (snapshot1, snapshot2]. 
When streaming is true, read the incremental data after the snapshot, if not specified, read the incremental data after the current snapshot (not including the current one) | -|end-snapshot-id|(none )|String|No|Need to cooperate with start-snapshot-id to read incremental data in two intervals (snapshot1, snapshot2]| - -### Streaming mode -Amoro supports reading incremental data in FileStore or LogStore through Java API in Streaming mode - -### Streaming mode (LogStore) -```java -import org.apache.amoro.flink.InternalCatalogBuilder; -import org.apache.amoro.flink.read.source.log.kafka.LogKafkaSource; -import org.apache.amoro.flink.table.MixedFormatTableLoader; -import org.apache.amoro.flink.util.MixedFormatUtils; -import org.apache.amoro.shade.org.apache.iceberg.Schema; -import org.apache.amoro.table.MixedTable; -import org.apache.amoro.table.TableIdentifier; -import org.apache.flink.api.common.eventtime.WatermarkStrategy; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.data.RowData; - - -public class Main { - public static void main(String[] args) throws Exception { - StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment(); - InternalCatalogBuilder catalogBuilder = - InternalCatalogBuilder - .builder() - .metastoreUrl("thrift://:/"); - - TableIdentifier tableId = TableIdentifier.of("catalog_name", "database_name", "test_table"); - MixedFormatTableLoader tableLoader = MixedFormatTableLoader.of(tableId, catalogBuilder); - - MixedTable table = MixedFormatUtils.loadMixedTable(tableLoader); - // Read table All fields. If you only read some fields, you can construct the schema yourself, for example: - // Schema userSchema = new Schema(new ArrayList() {{ - // add(Types.NestedField.optional(0, "f_boolean", Types.BooleanType.get())); - // add(Types.NestedField.optional(1, "f_int", Types.IntegerType.get())); - // }}); - Schema schema = table.schema(); - - // -----------Hidden Kafka-------------- - LogKafkaSource source = LogKafkaSource.builder(schema, table.properties()).build(); - - DataStream stream = env.fromSource(source, WatermarkStrategy.noWatermarks(), "Log Source"); - - // Print all the read data - stream.print(); - - // Submit and execute the task - env.execute("Test Mixed-format table streaming read"); - } -} -``` - -### Streaming mode (FileStore) -```java -import org.apache.amoro.flink.InternalCatalogBuilder; -import org.apache.amoro.flink.table.FlinkSource; -import org.apache.amoro.flink.table.MixedFormatTableLoader; -import org.apache.amoro.table.TableIdentifier; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.data.RowData; - -import java.util.HashMap; -import java.util.Map; - - -public class Main { - public static void main(String[] args) throws Exception { - StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment(); - InternalCatalogBuilder catalogBuilder = - InternalCatalogBuilder - .builder() - .metastoreUrl("thrift://:/"); - - TableIdentifier tableId = TableIdentifier.of("catalog_name", "database_name", "test_table"); - MixedFormatTableLoader tableLoader = MixedFormatTableLoader.of(tableId, catalogBuilder); - - Map properties = new HashMap<>(); - // Default value is true - properties.put("streaming", "true"); - - DataStream stream = - FlinkSource.forRowData() - .env(env) - 
.tableLoader(tableLoader) - .properties(properties) - .build(); - - // Print all read data - stream.print(); - - // Submit and execute the task - env.execute("Test Mixed-format table streaming Read"); - } -} -``` -DataStream API supports reading primary key tables and non-primary key tables. The configuration items supported by properties can refer to Querying With SQL [chapter Hint Option](../flink-dml/) - -## Writing with DataStream -Amoro table supports writing data to LogStore or FileStore through Java API - -### Overwrite data -Amoro table currently Only supports the existing data in the dynamic Overwrite table of the non-primary key table - -```java -import org.apache.amoro.flink.InternalCatalogBuilder; -import org.apache.amoro.flink.table.MixedFormatTableLoader; -import org.apache.amoro.flink.write.FlinkSink; -import org.apache.amoro.table.TableIdentifier; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.data.RowData; - - - -public class Main { - public static void main(String[] args) throws Exception { - // Build your data stream - DataStream input = null; - StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment(); - InternalCatalogBuilder catalogBuilder = - InternalCatalogBuilder - .builder() - .metastoreUrl("thrift://:/"); - - TableIdentifier tableId = TableIdentifier.of("catalog_name", "database_name", "test_table"); - MixedFormatTableLoader tableLoader = MixedFormatTableLoader.of(tableId, catalogBuilder); - - TableSchema flinkSchema = TableSchema.builder() - .field("id", DataTypes.INT()) - .field("name", DataTypes.STRING()) - .field("op_time", DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE()) - .build(); - - FlinkSink - .forRowData(input) - .tableLoader(tableLoader) - .overwrite(true) - .flinkSchema(flinkSchema) - .build(); - - // Submit and execute the task - env.execute("Test Mixed-format table overwrite"); - } -} -``` - -### Appending data -For the Amoro table, it supports specifying to write data to FileStore or LogStore through Java API. 
- -```java -import org.apache.amoro.flink.InternalCatalogBuilder; -import org.apache.amoro.flink.table.MixedFormatTableLoader; -import org.apache.amoro.flink.util.MixedFormatUtils; -import org.apache.amoro.flink.write.FlinkSink; -import org.apache.amoro.table.MixedTable; -import org.apache.amoro.table.TableIdentifier; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.data.RowData; - - - -public class Main { - public static void main(String[] args) throws Exception { - // Build your data stream - DataStream input = null; - StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment(); - InternalCatalogBuilder catalogBuilder = - InternalCatalogBuilder - .builder() - .metastoreUrl("thrift://:/"); - - TableIdentifier tableId = TableIdentifier.of("catalog_name", "database_name", "test_table"); - MixedFormatTableLoader tableLoader = MixedFormatTableLoader.of(tableId, catalogBuilder); - - TableSchema flinkSchema = TableSchema.builder() - .field("id", DataTypes.INT()) - .field("name", DataTypes.STRING()) - .field("op_time", DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE()) - .build(); - - MixedTable table = MixedFormatUtils.loadMixedTable(tableLoader); - - table.properties().put("mixed-format.emit.mode", "log,file"); - - FlinkSink - .forRowData(input) - .table(table) - .tableLoader(tableLoader) - .flinkSchema(flinkSchema) - .build(); - - env.execute("Test Mixed-format table append"); - } -} -``` -The DataStream API supports writing to primary key tables and non-primary key tables. The configuration items supported by properties can refer to Writing With SQL [chapter Hint Options](../flink-dml/) - -> **TIPS** -> -> mixed-format.emit.mode contains log, you need to configure log-store.enabled = true [Enable Log Configuration](../flink-dml/) -> -> mixed-format.emit.mode When file is included, the primary key table will only be written to ChangeStore, and the non-primary key table will be directly written to BaseStore. \ No newline at end of file diff --git a/amoro-docs/content/engines/flink/flink-get-started.md b/amoro-docs/content/engines/flink/flink-get-started.md deleted file mode 100644 index 0a28c40..0000000 --- a/amoro-docs/content/engines/flink/flink-get-started.md +++ /dev/null @@ -1,133 +0,0 @@ ---- -title: "Flink Getting Started" -url: flink-getting-started -aliases: - - "flink/getting-started" -menu: - main: - parent: Flink - weight: 100 ---- - -# Flink Getting Started - -## Iceberg format - -The Iceberg Format can be accessed using the Connector provided by Iceberg. -Refer to the documentation at [Iceberg Flink user manual](https://iceberg.apache.org/docs/latest/flink-connector/) -for more information. - -## Paimon format - -The Paimon Format can be accessed using the Connector provided by Paimon. -Refer to the documentation at [Paimon Flink user manual](https://paimon.apache.org/docs/master/engines/flink/) -for more information. - -## Mixed format -The Apache Flink engine can process Amoro table data in batch and streaming mode. The Flink on Amoro connector provides the ability to read and write to the Amoro data lake while ensuring data consistency. To meet the high real-time data requirements of businesses, the Amoro data lake's underlying storage structure is designed with LogStore, which stores the latest changelog or append-only real-time data. 
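As a quick illustration of the two storage layers described above, the following minimal sketch (assuming a mixed-format table named `test_table` with LogStore already enabled; the hint options are documented in detail on the Flink DML page) switches between log-based and file-based reads:

```sql
-- Enable dynamic table options so that the hint options take effect
SET table.dynamic-table-options.enabled = true;

-- Sub-second latency: read the real-time changelog from the LogStore
SELECT * FROM test_table /*+ OPTIONS('mixed-format.read.mode'='log') */;

-- Minute-level latency: continuously read newly committed files from the FileStore
SELECT * FROM test_table /*+ OPTIONS('mixed-format.read.mode'='file', 'monitor-interval'='10s') */;
```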
- -Amoro integrates the DataStream API and Table API of [Apache Flink](https://flink.apache.org/) to facilitate the use of Flink to read data from Amoro tables or write data to Amoro tables. - -Flink Connector includes: - -- `Flink SQL Select` reads Amoro table data through Apache Flink SQL. -- `Flink SQL Insert` writes data to Amoro tables through Apache Flink SQL. -- `Flink SQL DDL` creates/modifies/deletes libraries and tables through Apache Flink DDL statements. -- `FlinkSource` reads Amoro table data through Apache Flink DS API. -- `FlinkSink` writes data to Amoro tables through Apache Flink DS API. -- `Flink Lookup Join` performs real-time read of Amoro table data for association calculation through Apache Flink Temporal Join grammar. - -The Amoro project can be self-compiled to obtain the runtime jar. - -`mvn clean package -pl ':amoro-mixed-flink-runtime-1.15' -am -DskipTests` - -The Flink Runtime Jar is located in the `amoro-format-mixed/amoro-format-mixed-flink/v1.15/amoro-format-mixed-flink-runtime-1.15/target` directory. - -## Environment preparation -Download Flink and related dependencies, and download Flink 1.15/1.16/1.17 as needed. Taking Flink 1.15 as an example: -```shell -# Replace version value with the latest Amoro version if needed -AMORO_VERSION=0.8.0-incubating -FLINK_VERSION=1.15.3 -FLINK_MAJOR_VERSION=1.15 -FLINK_HADOOP_SHADE_VERSION=2.7.5 -APACHE_FLINK_URL=archive.apache.org/dist/flink -MAVEN_URL=https://repo1.maven.org/maven2 -FLINK_CONNECTOR_URL=${MAVEN_URL}/org/apache/flink -AMORO_CONNECTOR_URL=${MAVEN_URL}/org/apache/amoro - -# Download FLink binary package -wget ${APACHE_FLINK_URL}/flink-${FLINK_VERSION}/flink-${FLINK_VERSION}-bin-scala_2.12.tgz -# Unzip Flink binary package -tar -zxvf flink-${FLINK_VERSION}-bin-scala_2.12.tgz - -cd flink-${FLINK_VERSION} -# Download Flink Hadoop dependency -wget ${FLINK_CONNECTOR_URL}/flink-shaded-hadoop-2-uber/${FLINK_HADOOP_SHADE_VERSION}-10.0/flink-shaded-hadoop-2-uber-${FLINK_HADOOP_SHADE_VERSION}-10.0.jar -# Download Flink Amoro Connector -wget ${AMORO_CONNECTOR_URL}/amoro-mixed-format-flink-runtime-${FLINK_MAJOR_VERSION}/${AMORO_VERSION}/amoro-mixed-format-flink-runtime-${FLINK_MAJOR_VERSION}-${AMORO_VERSION}.jar - -# Copy the necessary JAR files to the lib directory -mv flink-shaded-hadoop-2-uber-${FLINK_HADOOP_SHADE_VERSION}-10.0.jar lib -mv amoro-mixed-format-flink-runtime-${FLINK_MAJOR_VERSION}-${AMORO_VERSION}.jar lib -``` - -Modify Flink related configuration files: - -```shell -cd flink-1.15.3 -vim conf/flink-conf.yaml -``` -Modify the following settings: - -```yaml -# Increase the number of slots to run two streaming tasks simultaneously -taskmanager.numberOfTaskSlots: 4 -# Enable Checkpoint. Only when Checkpoint is enabled, the data written to the file is visible -execution.checkpointing.interval: 10s -``` - -Move the dependencies to the lib directory of Flink: - -```shell -# Used to create a socket connector for inputting CDC data via sockets. Not necessary for non-quickstart examples. -cp examples/table/ChangelogSocketExample.jar lib - -cp ../amoro-mixed-flink-runtime-${FLINK_MAJOR_VERSION}-${AMORO_VERSION}.jar lib -cp ../flink-shaded-hadoop-2-uber-${FLINK_HADOOP_SHADE_VERSION}-10.0.jar lib -``` - -### Mixed-Hive format -Starting from Amoro version 0.3.1, Mixed-Hive format is supported, and data in Amoro Mixed-Hive format tables can be read/written through Flink. When operating on Mixed-Hive format tables through Flink, the following points should be noted: - -1. 
Flink Runtime Jar does not include the content of the Jar packages that Hive depends on. You need to manually put the [Hive-dependent Jar package](https://repo1.maven.org/maven2/org/apache/hive/hive-exec/2.1.1/hive-exec-2.1.1.jar) in the flink/lib directory; -2. When creating partitioned tables, the partition field needs to be placed in the last column; when there are multiple partition fields, they need to be placed at the end; - -## Frequently Asked Questions - -**1. Data written to Amoro table is not visible** - -You need to enable Flink checkpoint and modify the [Flink checkpoint configuration](https://nightlies.apache.org/flink/flink-docs-release-1.12/deployment/config.html#execution-checkpointing-interval) in Flink conf. The data will only be committed during checkpoint. - -**2. When using Flink SQL-Client to read Amoro tables with write.upsert feature enabled, there are still duplicate primary key data** - -The query results obtained through Flink SQL-Client cannot provide MOR semantics based on primary keys. If you need to obtain merged results through Flink engine queries, you can write the content of Amoro tables to a MySQL table through JDBC connector for viewing. - -**3. When writing to Amoro tables with write.upsert feature enabled through SQL-Client under Flink 1.15, there are still duplicate primary key data** - -You need to execute the command `set table.exec.sink.upsert-materialize = none` in SQL-Client to turn off the upsert materialize operator generated upsert view. This operator will affect the AmoroWriter's generation of delete data when the write.upsert feature is enabled, causing duplicate primary key data to not be merged. \ No newline at end of file diff --git a/amoro-docs/content/engines/flink/using-logstore.md b/amoro-docs/content/engines/flink/using-logstore.md deleted file mode 100644 index 44db2f1..0000000 --- a/amoro-docs/content/engines/flink/using-logstore.md +++ /dev/null @@ -1,105 +0,0 @@ ---- -title: "Using Logstore" -url: flink-using-logstore -aliases: - - "flink/using-logstore" -menu: - main: - parent: Flink - weight: 500 ---- - -# Using Logstore -Due to the limitations of traditional offline data warehouse architectures in supporting real-time business needs, real-time data warehousing has experienced rapid evolution in recent years. In the architecture of real-time data warehousing, Apache Kafka is often used as the storage system for real-time data. However, this also brings about the issue of data disconnection between offline data warehouses. - -Developers often need to pay attention to data stored in HDFS as well as data in Kafka, which increases the complexity of business development. Therefore, Amoro proposes the addition of an optional parameter, "LogStore enabled" (`log-store.enabled`), to the table configuration. This allows for retrieving data with sub-second and minute-level latency by operating on a single table while ensuring the eventual consistency of data from both sources. - -## Real-Time data in LogStore -Amoro tables provide two types of storage: FileStore and LogStore. FileStore stores massive full data, while LogStore stores real-time incremental data. - -Real-time data can provide second-level data visibility and ensure data consistency without enabling LogStore transactions. - -Its underlying storage system can be connected to external message queuing middleware, currently supporting only Kafka and Pulsar. - -Users can enable LogStore by configuring the following parameters when creating an Amoro table. 
For specific configurations, please refer to [LogStore related configurations](../configurations/#logstore-configurations). - -## Overview - -| Flink | Kafka | -|------------|----------| -| Flink 1.15 | ✔ | -| Flink 1.16 | ✔ | -| Flink 1.17 | ✔ | - -Kafka as LogStore Version Description: - -| Flink Version | Kafka Versions | -|---------------| ----------------- | -| 1.15.x | 0.10.2.\*
0.11.\*
1.\*
2.\*
3.\* | -| 1.16.x | 0.10.2.\*
0.11.\*
1.\*
2.\*
3.\* | -| 1.17.x | 0.10.2.\*
0.11.\*
1.\*
2.\*
3.\* | - - - -### Prerequisites for using LogStore - -When creating an Amoro table, LogStore needs to be enabled. - -- You can create a table after selecting a specific Catalog on the Amoro [Dashboard](http://localhost:1630) - Terminal page - -```sql -CREATE TABLE db.log_table ( - id int, - name string, - ts timestamp, - primary key (id) -) using mixed_iceberg -tblproperties ( -"log-store.enabled" = "true", -"log-store.topic"="topic_log_test", -"log-store.address"="localhost:9092" -); -``` - -- You can also use Flink SQL to create tables in Flink-SQL-Client - -```sql --- First use the use catalog command to switch to the mixed-format catalog. -CREATE TABLE db.log_table ( - id int, - name string, - ts timestamp, - primary key (id) not enforced -) WITH ( - 'log-store.enabled' = 'true', - 'log-store.topic'='topic_log_test', - 'log-store.address'='localhost:9092'); -``` - -### Double write LogStore and FileStore - -![Introduce](../../images/flink/double-write.png) - -Amoro Connector writes data to LogStore and ChangeStore at the same time through double-write operations, without opening Kafka transactions to ensure data consistency between the two, because opening transactions will bring a few minutes of delay to downstream tasks (the specific delay time depends on upstream tasks checkpoint interval). - -```sql -INSERT INTO db.log_table /*+ OPTIONS('mixed-format.emit.mode'='log') */ -SELECT id, name, ts from sourceTable; -``` - -> Currently, only the Apache Flink engine implements the dual-write LogStore and FileStore. diff --git a/amoro-docs/content/engines/spark/spark-conf.md b/amoro-docs/content/engines/spark/spark-conf.md deleted file mode 100644 index 6afdb65..0000000 --- a/amoro-docs/content/engines/spark/spark-conf.md +++ /dev/null @@ -1,114 +0,0 @@ ---- -title: "Spark Configuration" -url: spark-configuration -aliases: - - "spark/configuration" -menu: - main: - parent: Spark - weight: 200 ---- - -# Spark Configuration - -## Catalogs configuration - -### Using Mixed-Format in a standalone catalog - -Starting from version 3.x, Spark supports configuring an independent Catalog. -If you want to use a Mixed-Format table in a standalone Catalog, you can create a mixed_catalog and load catalog -metadata from AMS with following properties: - -```properties -spark.sql.catalog.mixed_catalog=org.apache.amoro.spark.MixedFormatSparkCatalog -spark.sql.catalog.mixed_catalog.url=thrift://${AMS_HOST}:${AMS_PORT}/${AMS_CATALOG_NAME_HIVE} -``` - -Or create a mixed_catalog with local configurations with following properties: -```properties -spark.sql.catalog.mixed_catalog=org.apache.amoro.spark.MixedFormatSparkCatalog -# Configure mixed catalog type as you needed -spark.sql.catalog.mixed_catalog.type=hadoop -spark.sql.catalog.mixed_catalog.warehouse=/warehouse/hadoop_mixed_catalog -``` - -Then, execute the following SQL in the Spark SQL Client to switch to the corresponding catalog. - -```sql -use mixed_catalog; -``` - -Of course, you can also access Mixed-Format tables by directly using the triplet -`mixed_catalog.{db_name}.{table_name}`. - -You can also set Spark's default catalog to your configured catalog using the following properties. -In this way, you don't need to use the `use {catalog}` command to switch the default catalog. 
- -```properties -spark.sql.defaultCatalog=mixed_catalog -spark.sql.catalog.mixed_catalog=org.apache.amoro.spark.MixedFormatSparkCatalog -spark.sql.catalog.mixed_catalog.url=thrift://${AMS_HOST}:${AMS_PORT}/${AMS_CATALOG_NAME_HIVE} -``` - -In a standalone AmoroSparkCatalog scenario, only Mixed-Format tables can be created and accessed in the corresponding -catalog - -### Using Mixed-Format in session catalog - -If you want to access both existing Hive tables or Spark datasource tables and Mixed-Format tables in Spark, -you can use the AmoroSparkSessionCatalog as the implementation of the Spark default session catalog. -The configuration method is as follows. - -```properties -spark.sql.catalog.spark_catalog=org.apache.amoro.spark.MixedFormatSparkSessionCatalog -spark.sql.catalog.spark_catalog.url=thrift://${AMS_HOST}:${AMS_PORT}/${AMS_CATALOG_NAME_HIVE} -``` - -When using the `MixedFormatSparkSessionCatalog` as the implementation of the `spark_catalog`, it behaves as follows - -- Load Table: When resolving a `db_name.table_name` identifier, it will load the table metadata by Spark's built-in - session catalog implementation, and then checking the MixedFormat flag defined in table properties. If the table has - the MixedFormat flag, it will be loaded by `MixedFormatSparkCatalog` again. - -- Create Table: The behavior of `CREATE TABLE` is determined by the `using {provider}` clause in the DDL statement. If - the clause contains `using mixed_iceberg` or `using mixed_hive`, a Mixed-Format table will be created. Otherwise, the default Spark implementation - will be used to create the table. - -When using the `MixedFormatSparkSessionCatalog`, there are several points to keep in mind: - -- `MixedFormatSparkSessionCatalog` can only be configured under the `spark_catalog` -- The `spark.sql.catalogImplementation` must be configured as `HIVE` -- Catalogs registered on AMS must use a Metastore of the `Hive` type. - -## The high availability configuration - -If AMS is configured with high availability, you can configure the `spark.sql.catalog.{catalog_name}.url` property in -the following way to achieve higher availability. - -```properties -spark.sql.catalog.mixed_catalog=org.apache.amoro.spark.MixedFormatSparkCatalog -spark.sql.catalog.mixed_catalog.url=zookeeper://{zookeeper-endpoint-list}/{cluster-name}/{catalog-name} -``` - -Among above: - -- zookeeper-endpoint-list: a list of host:port pairs separated by commas. A valid value could - be `192.168.1.1:2181,192.168.1.2:2181,192.168.1.3:2181` -- cluster-name: is the value of `ams.cluster.name` configured in the configuration file `config.yml` of AMS, which is - used to identify the user space on ZooKeeper. -- catalog-name: the name of the Catalog on AMS. diff --git a/amoro-docs/content/engines/spark/spark-ddl.md b/amoro-docs/content/engines/spark/spark-ddl.md deleted file mode 100644 index 487c935..0000000 --- a/amoro-docs/content/engines/spark/spark-ddl.md +++ /dev/null @@ -1,303 +0,0 @@ ---- -title: "Spark DDL" -url: spark-ddl -aliases: - - "spark/ddl" -menu: - main: - parent: Spark - weight: 300 ---- - -# Spark DDL - -## CREATE TABLE - -To create an MixedFormat table under an Amoro Catalog, you can use `using mixed_iceberg` or `using mixed_hive` to specify the provider in the -`CREATE TABLE` statement. If the Catalog type is Hive, the created table will be a Hive-compatible table. 
- -```sql -CREATE TABLE mixed_catalog.db.sample ( - id bigint COMMENT "unique id", - data string -) USING mixed_iceberg -``` - -### PRIMARY KEY - -You can use `PRIMARY KEY` in the `CREATE TABLE` statement to specify the primary key column. -MixedFormat ensures the uniqueness of the primary key column through MOR (Merge on Read) and Self-Optimizing. - -```sql -CREATE TABLE mixed_catalog.db.sample ( - id bigint COMMENT "unique id", - data string , - PRIMARY KEY (id) -) USING mixed_iceberg -``` - -### PARTITIONED BY - -Using `PARTITIONED BY` in the `CREATE TABLE` statement to create a table with partition spec. - -```sql -CREATE TABLE mixed_catalog.db.sample ( - id bigint, - data string, - category string) -USING mixed_iceberg -PARTITIONED BY (category) -``` - -In the `PARTITIONED BY` clause, you can define partition expressions, and Mixed-Iceberg format supports all partition -expressions in Iceberg. - -```sql -CREATE TABLE mixed_catalog.db.sample ( - id bigint, - data string, - category string, - ts timestamp, - PRIMARY KEY (id) ) -USING mixed_iceberg -PARTITIONED BY (bucket(16, id), days(ts), category) -``` - -Supported transformations are: - -* years(ts): partition by year -* months(ts): partition by month -* days(ts) or date(ts): equivalent to dating partitioning -* hours(ts) or date_hour(ts): equivalent to dating and hour partitioning -* bucket(N, col): partition by hashed value mod N buckets -* truncate(L, col): partition by value truncated to L - - Strings are truncated to the given length - - Integers and longs truncate to bins: truncate(10, i) produces partitions 0, 10, 20, 30, … - -> Mixed-Hive format doesn't support transform. - -## CREATE TABLE ... AS SELECT - -``` -CREATE TABLE mixed_catalog.db.sample -USING mixed_iceberg -AS SELECT ... -``` - -> The `CREATE TABLE ... AS SELECT` syntax is used to create a table and write the query results to the table. Primary -> keys, partitions, and properties are not inherited from the source table and need to be configured separately. - -> You can enable uniqueness check for the primary key in the source table by setting set -> `spark.sql.mixed-format.check-source-data-uniqueness.enabled = true` in Spark SQL. If there are duplicate primary keys, an -> error will be raised during the write operation. - - -You can use the following syntax to create a table with primary keys, partitions, and properties: - -``` -CREATE TABLE mixed_catalog.db.sample -PRIMARY KEY(id) USING mixed_iceberg -PARTITIONED BY (pt) -TBLPROPERTIES (''prop1''=''val1'', ''prop2''=''val2'') -AS SELECT ... -``` - -{{< hint info >}} -In the current version, `CREATE TABLE ... AS SELECT` does not provide atomicity guarantees. -{{< /hint >}} - -## CREATE TABLE ... LIKE - -The `CREATE TABLE ... LIKE` syntax copies the structure of a table, including primary keys and partitions, to a new -table, but it does not copy the data. - -``` -USE mixed_catalog; -CREATE TABLE db.sample -LIKE db.sample2 -USING mixed_iceberg -TBLPROPERTIES ('owner'='xxxx'); -``` - -> Since `PRIMARY KEY` is not a standard Spark syntax, if the source table is a MixedFormat table with primary keys, the -> new table can copy the schema information with the primary keys. Otherwise, only schema could be copied. - -{{< hint info >}} -`Create Table Like` only supports the binary form of `db.table` and in the same catalog -{{< /hint >}} - -## REPLACE TABLE ... AS SELECT - -> The `REPLACE TABLE ... AS SELECT` syntax only supports tables without primary keys in the current version. 
- -``` -REPLACE TABLE mixed_catalog.db.sample -USING mixed_iceberg -AS SELECT ... -``` - -> In the current version, `REPLACE TABLE ... AS SELECT` does not provide atomicity guarantees. - -## DROP TABLE - -```sql -DROP TABLE mixed_catalog.db.sample; -``` - -## TRUNCATE TABLE - -The `TRUNCATE TABLE` statement could delete all data in the table. - -```sql -TRUNCATE TABLE mixed_catalog.db.sample; -``` - -## ALTER TABLE - -The ALTER TABLE syntax supported by Mixed-Format includes: - -* ALTER TABLE ... SET TBLPROPERTIES -* ALTER TABLE ... ADD COLUMN -* ALTER TABLE ... RENAME COLUMN -* ALTER TABLE ... ALTER COLUMN -* ALTER TABLE ... DROP COLUMN -* ALTER TABLE ... DROP PARTITION - -### ALTER TABLE ... SET TBLPROPERTIES - -```sql -ALTER TABLE mixed_catalog.db.sample SET TBLPROPERTIES ( - 'read.split.target-size'='268435456' -); -``` - -Using `UNSET` to remove properties: - -```sql -ALTER TABLE mixed_catalog.db.sample UNSET TBLPROPERTIES ('read.split.target-size'); -``` - -### ALTER TABLE ... ADD COLUMN - -```sql -ALTER TABLE mixed_catalog.db.sample -ADD COLUMNS ( - new_column string comment 'new_column docs' - ); -``` - -You can add multiple columns at once, separated by commas. - -```sql --- create a struct column -ALTER TABLE mixed_catalog.db.sample -ADD COLUMN point struct; - --- add a field to the struct -ALTER TABLE mixed_catalog.db.sample -ADD COLUMN point.z double; -``` - -```sql --- create a nested array column of struct -ALTER TABLE mixed_catalog.db.sample -ADD COLUMN points array>; - --- add a field to the struct within an array. Using keyword 'element' to access the array's element column. -ALTER TABLE mixed_catalog.db.sample -ADD COLUMN points.element.z double; -``` - -```sql --- create a map column of struct key and struct value -ALTER TABLE mixed_catalog.db.sample -ADD COLUMN points map, struct>; - --- add a field to the value struct in a map. Using keyword 'value' to access the map's value column. -ALTER TABLE mixed_catalog.db.sample -ADD COLUMN points.value.b int; -``` - -You can add columns at any position by using the `FIRST` or `AFTER` clause. - -```sql -ALTER TABLE mixed_catalog.db.sample -ADD COLUMN new_column bigint AFTER other_column; -``` - -```sql -ALTER TABLEmixed_catalog.db.sample -ADD COLUMN nested.new_column bigint FIRST; -``` - -### ALTER TABLE ... RENAME COLUMN - -```sql -ALTER TABLE mixed_catalog.db.sample RENAME COLUMN data TO payload; -``` - -### ALTER TABLE ... ALTER COLUMN - -`"`ALTER COLUMN` can be used to widen types, make fields nullable, set comments, and reorder fields. - -```sql -ALTER TABLE mixed_catalog.db.sample ALTER COLUMN measurement TYPE double; -``` - -To add or remove columns from a structure, use `ADD COLUMN` or `DROP COLUMN` with nested column names. - -Column comments can also be updated using `ALTER COLUMN`. - -```sql -ALTER TABLE mixed_catalog.db.sample ALTER COLUMN measurement TYPE double COMMENT 'unit is bytes per second'; -ALTER TABLE mixed_catalog.db.sample ALTER COLUMN measurement COMMENT 'unit is kilobytes per second'; -``` - -You can use the `FIRST` and `AFTER` clauses to reorder top-level or nested columns within a structure. - -```sql -ALTER TABLE mixed_catalog.db.sample ALTER COLUMN col FIRST; -``` - -```sql -ALTER TABLE mixed_catalog.db.sample ALTER COLUMN nested.col AFTER other_col; -``` - -### ALTER TABLE ... DROP COLUMN - -```sql -ALTER TABLE mixed_catalog.db.sample DROP COLUMN id; -ALTER TABLE mixed_catalog.db.sample DROP COLUMN point.z; -``` - -### ALTER TABLE ... 
DROP PARTITION - -```sql -ALTER TABLE mixed_catalog.db.sample DROP IF EXISTS PARTITION (dt=2022); -``` - -## DESC TABLE - -`DESCRIBE TABLE` returns basic metadata information about a table, including the primary key information for tables that -have a primary key - -```sql - { DESC | DESCRIBE } TABLE mixed_catalog.db.sample; -``` diff --git a/amoro-docs/content/engines/spark/spark-get-started.md b/amoro-docs/content/engines/spark/spark-get-started.md deleted file mode 100644 index 7bc7dd3..0000000 --- a/amoro-docs/content/engines/spark/spark-get-started.md +++ /dev/null @@ -1,153 +0,0 @@ ---- -title: "Spark Getting Started" -url: spark-getting-started -aliases: - - "spark/getting-started" -menu: - main: - parent: Spark - weight: 100 ---- - -# Spark Getting Started -# Iceberg Format - -The Iceberg Format can be accessed using the Connector provided by Iceberg. -Refer to the documentation at [Iceberg Spark Connector](https://iceberg.apache.org/docs/latest/getting-started/) -for more information. - -# Paimon Format - -The Paimon Format can be accessed using the Connector provided by Paimon. -Refer to the documentation at [Paimon Spark Connector](https://paimon.apache.org/docs/master/engines/spark3/) -for more information. - -# Mixed Format - - -To use Amoro in a Spark shell, use the --packages option: - -```bash -spark-shell --packages org.apache.amoro:amoro-mixed-spark-3.3-runtime:0.7.0 -``` - -> If you want to include the connector in your Spark installation, add the `amoro-mixed-spark-3.3-runtime` Jar to -> Spark's `jars` folder. - -## Adding catalogs - -``` -${SPARK_HOME}/bin/spark-sql \ - --conf spark.sql.extensions=org.apache.amoro.spark.MixedFormatSparkExtensions \ - --conf spark.sql.catalog.local_catalog=org.apache.amoro.spark.MixedFormatSparkCatalog \ - --conf spark.sql.catalog.local_catalog.url=thrift://${AMS_HOST}:${AMS_PORT}/${AMS_CATALOG_NAME} -``` - -> Amoro manages the Catalog through AMS, and Spark catalog needs to be mapped to Amoro Catalog via URL, -> in the following format: -> `thrift://${AMS_HOST}:${AMS_PORT}/${AMS_CATALOG_NAME}`, -> The mixed-format-spark-connector will automatically download the Hadoop site configuration file through -> the thrift protocol for accessing the HDFS cluster - -> -> The AMS_PORT is the port number of the AMS service's thrift API interface, with a default value of 1260 -> The AMS_CATALOG_NAME is the name of the Catalog you want to access on AMS. - -Regarding detailed configurations for Spark, please refer to [Spark Configurations](../spark-configuration/) - - -## Creating a table - -In Spark SQL command line, you can execute a create table command using the `CREATE TABLE` statement. - -Before executing a create table operation, please make sure to create the `database` first. - -``` --- switch to mixed catalog defined in spark conf -use local_catalog; - --- create databsae first -create database if not exists test_db; -``` - -Then switch to the newly created database and perform the create table operation. 
- -``` -use test_db; - --- create a table with 3 columns -create table test1 (id int, data string, ts timestamp) using mixed_iceberg; - --- create a table with hidden partition -create table test2 (id int, data string, ts timestamp) using mixed_iceberg partitioned by (days(ts)); - --- create a table with hidden partition and primary key -create table test3 (id int, data string, ts timestamp, primary key(id)) using mixed_iceberg partitioned by (days(ts)); -``` - -For more information on Spark DDL related to tables, please refer to [Spark DDL](../spark-ddl/) - -## Writing to the table - -If you are using Spark SQL, you can use the `INSERT OVERWRITE` or `INSERT` SQL statement to write data to an Amoro table. - -``` --- insert values into unkeyed table -insert into test2 values -( 1, "aaa", timestamp('2022-1-1 00:00:00')), -( 2, "bbb", timestamp('2022-1-2 00:00:00')), -( 3, "bbb", timestamp('2022-1-3 00:00:00')); - --- dynamic overwrite table -insert overwrite test3 values -( 1, "aaa", timestamp('2022-1-1 00:00:00')), -( 2, "bbb", timestamp('2022-1-2 00:00:00')), -( 3, "bbb", timestamp('2022-1-3 00:00:00')); -``` - - -> If you are using Static Overwrite, you cannot define transforms on partition fields. - -Alternatively, you can use the DataFrame API to write data to an Amoro table within a JAR job. - -``` -val df = spark.read().load("/path-to-table") -df.writeTo('test_db.table1').overwritePartitions() -``` - -For more information on writing to tables, please refer to [Spark Writes](../spark-writes/) - -## Reading from the table - -To query the table using `SELECT` SQL statements - -``` -select count(1) as count, data -from test2 -group by data; -``` - -For table with primary keys defined, you can query on `ChangeStore` by `.change` - -``` -select count(1) as count, data -from test_db.test3.change group by data; -``` - - -For more information on reading from tables, please refer to [Spark Queries](../spark-queries/) diff --git a/amoro-docs/content/engines/spark/spark-queries.md b/amoro-docs/content/engines/spark/spark-queries.md deleted file mode 100644 index b2111da..0000000 --- a/amoro-docs/content/engines/spark/spark-queries.md +++ /dev/null @@ -1,76 +0,0 @@ ---- -title: "Spark Queries" -url: spark-queries -aliases: - - "spark/queries" -menu: - main: - parent: Spark - weight: 400 ---- - -# Spark Queries -## Querying with SQL - -### Querying Mixed-Format table by merge on read - -Using `Select` statement to query on Mixed-Format tables. - -```sql -SELECT * FROM mixed_catalog.db.sample -``` - -The Mixed-Format connector will merge the data from `BaseStore` and `ChangeStore`. - -### Query on change store - -For a Mixed-Format table with primary keys. you can query on `ChangeStore` by `.change`. - -```sql -SELECT * FROM mixed_catalog.db.sample.change - -+---+----+----+---------------+------------+--------------+ -| id|name|data|_transaction_id|_file_offset|_change_action| -+---+----+----+---------------+------------+--------------+ -| 1|dddd|abcd| 3| 1| INSERT| -| 1|dddd|abcd| 3| 2| DELETE| -+---+----+----+---------------+------------+--------------+ -``` - -The addition columns are: - -- _transaction_id: The transaction ID allocated by AMS during data write is assigned per SQL execution in batch mode and - per checkpoint in streaming mode. -- _file_offset:The order of data written with the same `_transaction_id`. -- _change_action:The type of change record, `INSERT` or `DELETE`. 
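These metadata columns can be referenced like ordinary columns. As an illustrative sketch (reusing the `mixed_catalog.db.sample` table above; the output depends on the actual change data), the change records can be summarized per action type:

```sql
-- Count change records per action type; _change_action is either INSERT or DELETE
SELECT _change_action, count(*) AS action_count
FROM mixed_catalog.db.sample.change
GROUP BY _change_action
```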
- -## Querying with DataFrames - -You can read the Mixed-Format table by Spark DataFrames: - -```scala -val df = spark.read.table("mixed_catalog.db.sample") -df.count -``` - -And visit the `ChangeStore` by `.change`. - -```scala -val df = spark.read.table("mixed_catalog.db.sample.change") -df.count -``` diff --git a/amoro-docs/content/engines/spark/spark-writes.md b/amoro-docs/content/engines/spark/spark-writes.md deleted file mode 100644 index 5b914bb..0000000 --- a/amoro-docs/content/engines/spark/spark-writes.md +++ /dev/null @@ -1,209 +0,0 @@ ---- -title: "Spark Writes" -url: spark-writes -aliases: - - "spark/writes" -menu: - main: - parent: Spark - weight: 500 ---- - -# Spark Writes -## Writing with SQL - -### INSERT OVERWRITE - -`INSERT OVERWRITE` can replace the partition in a table with the results of a query. - -The default overwrite mode of Spark is `Static`, you can change the overwrite mode by - -``` -SET spark.sql.sources.partitionOverwriteMode=dynamic -``` - -To demonstrate the behavior of dynamic and static overwrites, a test table is defined using the following DDL: - -```sql -CREATE TABLE mixed_catalog.db.sample ( - id int, - data string, - ts timestamp, - primary key (id)) -USING mixed_iceberg -PARTITIONED BY (days(ts)) -``` - -When Spark's overwrite mode is dynamic, the partitions of the rows generated by the SELECT query will be replaced. - -```sql -INSERT OVERWRITE mixed_catalog.db.sample values -(1, 'aaa', timestamp(' 2022-1-1 09:00:00 ')), -(2, 'bbb', timestamp(' 2022-1-2 09:00:00 ')), -(3, 'ccc', timestamp(' 2022-1-3 09:00:00 ')) -``` - -When Spark's overwrite mode is static, the PARTITION clause will be translated into the result set of the SELECT from -the table. If the PARTITION clause is omitted, all partitions will be replaced. - -```sql -INSERT OVERWRITE mixed_catalog.db.sample -partition( dt = '2021-1-1') values -(1, 'aaa'), (2, 'bbb'), (3, 'ccc') -``` - -> In Static mode, it is not supported to define transforms on partitioning columns. - -> You can enable uniqueness check of the primary key on the source table by setting -> `spark.sql.mixed-format.check-source-data-uniqueness.enabled = true` in SPARK SQL. If there are duplicate primary keys, -> an error will be thrown during the write operation. - -### INSERT INTO - -To append new data to a table, use `INSERT INTO`. - -```sql -INSERT INTO mixed_catalog.db.sample VALUES (1, 'a'), (2, 'b') - -INSERT INTO prod.db.table SELECT ... -``` - -#### Upsert to table with primary keys. - -To add new data to a table with a primary key, you can control whether to enable the `UPSERT` function by setting -the `write.upsert.enabled parameter`. - -When `UPSERT` is enabled, if a row with the same primary key already exists, an `UPDATE` operation will be performed, -and if it does not exist, an INSERT operation will be performed. - -When `UPSERT` is disabled, only `INSERT` operation will be performed, -even if there are rows with the same primary key in the table. - -```sql -CREATE TABLE mixed_catalog.db.keyedTable ( - id int, - data string, - primary key (id)) -USING mixed_iceberg -TBLPROPERTIES ('write.upsert.enabled' = 'true') -``` - -```sql -INSERT INTO mixed_catalog.db.keyedTable VALUES (1, 'a'), (2, 'b') - -INSERT INTO prod.db.keyedTable SELECT ... -``` - -> You can enable uniqueness check of the primary key on the source table by setting -> `spark.sql.mixed-format.check-source-data-uniqueness.enabled = true` in SPARK SQL. If there are duplicate primary keys, -> an error will be thrown during the write operation. 
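As a minimal sketch of the upsert behavior described above (the values are illustrative), writing a row with an existing primary key updates it instead of producing a duplicate:

```sql
-- keyedTable has primary key (id) and 'write.upsert.enabled' = 'true'
INSERT INTO mixed_catalog.db.keyedTable VALUES (1, 'a'), (2, 'b');

-- The same primary key is written again, so the existing row is updated
INSERT INTO mixed_catalog.db.keyedTable VALUES (1, 'a-updated');

-- Merge-on-read returns a single row per primary key: (1, 'a-updated'), (2, 'b')
SELECT * FROM mixed_catalog.db.keyedTable;
```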
-> -### DELETE FROM - -The `DELETE FROM` statements delete rows from table. - -```sql -DELETE FROM mixed_catalog.db.sample -WHERE ts >= '2020-05-01 00:00:00' and ts < '2020-06-01 00:00:00' - -DELETE FROM mixed_catalog.db.sample -WHERE session_time < (SELECT min(session_time) FROM prod.db.good_events) - -DELETE FROM mixed_catalog.db.sample AS t1 -WHERE EXISTS (SELECT oid FROM prod.db.returned_orders WHERE t1.oid = oid) -``` - -### UPDATE - -The `UPDATE` statement modifies rows in the table. - -```sql -UPDATE mixed_catalog.db.sample -SET c1 = 'update_c1', c2 = 'update_c2' -WHERE ts >= '2020-05-01 00:00:00' and ts < '2020-06-01 00:00:00' - -UPDATE mixed_catalog.db.sample -SET session_time = 0, ignored = true -WHERE session_time < (SELECT min(session_time) FROM prod.db.good_events) - -UPDATE mixed_catalog.db.sample AS t1 -SET order_status = 'returned' -WHERE EXISTS (SELECT oid FROM prod.db.returned_orders WHERE t1.oid = oid) -``` - -### MERGE INTO - -```sql -MERGE INTO prod.db.target t -- a target table -USING (SELECT ...) s -- the source updates -ON t.id = s.id -- condition to find updates for target rows -WHEN ... -- updates -``` - -The `MERGE INTO` statement supports multi action `WHEN MATCHED ... THEN ...` to execute `UPDATE`, `DELETE`, `INSERT`. - -```sql - -MERGE INTO prod.db.target t -USING prod.db.source s -ON t.id = s.id -WHEN MATCHED AND s.op = 'delete' THEN DELETE -WHEN MATCHED AND t.count IS NULL AND s.op = 'increment' THEN UPDATE SET t.count = 0 -WHEN MATCHED AND s.op = 'increment' THEN UPDATE SET t.count = t.count + 1 -WHEN NOT MATCHED THEN INSERT * - -``` - -## Writing with DataFrames - -### Appending data - -Using `append()` to add data to a MixedFormat table. - -```sql -val data: DataFrame = ... -data.writeTo("mixed_catalog.db.sample").append() -``` - -### Overwriting data - -Using `overwritePartitions()` to overwriting data. - -```sql -val data: DataFrame = ... -data.writeTo("mixed_catalog.db.sample").overwritePartitions() -``` - -### Creating tables - -The `create()` will create a table and write data to the table, just like `CREATE TABLE AS SELECT` - -```sql -val data: DataFrame = ... -data.writeTo("mixed_catalog.db.sample").create() -``` - -The primary keys and partition keys could be specified by `partitionBy()` and `option("primary.keys", "'xxx'")`. - -```sql -val data: DataFrame = ... -data.write().format("mixed_iceberg") - .partitionBy("data") - .option("primary.keys", "'xxx'") - .save("mixed_catalog.db.sample") -``` diff --git a/amoro-docs/content/engines/trino.md b/amoro-docs/content/engines/trino.md deleted file mode 100644 index c5e9a89..0000000 --- a/amoro-docs/content/engines/trino.md +++ /dev/null @@ -1,124 +0,0 @@ ---- -title: "Trino" -url: trino -aliases: - - "trino" -menu: - main: - weight: 800 ---- - -# Trino - -## Iceberg format -Iceberg format can be accessed using the Iceberg Connector provided by Trino. -please refer to the documentation at [Iceberg Trino user manual](https://trino.io/docs/current/connector/iceberg.html#) for more information. - -## Paimon format -Paimon format can be accessed using the Paimon Connector provided by Trino. -please refer to the documentation at [Paimon Trino user manual](https://paimon.apache.org/docs/master/engines/trino/) for more information. - -## Mixed format -### Install - -- Create the {trino_home}/plugin/amoro directory in the Trino installation package, - and extract the contents of the amoro-trino package amoro-mixed-trino-xx.tar.gz to the {trino_home}/plugin/amoro directory. 
-- Configure the Catalog configuration file for Amoro in the {trino_home}/etc/catalog directory, for example: -```tex -connector.name=mixed-format -amoro.url=thrift://{ip}:{port}/{catalogName} -``` -- Configure the JVM configuration file for Trino in the {trino_home}/etc directory named `jvm.config` : -```tex ---add-exports=java.security.jgss/sun.security.krb5=ALL-UNNAMED -``` - -### Support SQL statement - -#### Query Table - -By adopting the Merge-On-Read approach to read Mixed Format, the latest data of the table can be read, for example: - -```sql -SELECT * FROM "{TABLE_NAME}" -``` - - - -#### Query BaseStore of Table - -Directly querying the BaseStore in a table with a primary key is supported. The BaseStore stores the stock data of the table, which is usually generated by batch job or optimization. -The queried data is static, and the query efficiency is very high, but the timeliness is not good. The syntax is as follows: - -```sql -SELECT * FROM "{TABLE_NAME}#BASE" -``` - - - -#### Query ChangeStore of Table - -Directly querying the ChangeStore in a table with a primary key is supported. The ChangeStore stores the stream and change data of the table, which is usually written in real time by stream job. -The change records of the table can be queried through the ChangeStore, and the expiry time of the data in the ChangeStore determines how long ago the change records can be queried. - -```sql -SELECT * FROM "{TABLE_NAME}#CHANGE" -``` - -Three additional columns will be included in the query result, which are: - -- _transaction_id: The transaction ID allocated by AMS when the data is written. In batch mode, it is allocated for each SQL execution, and in stream mode, it is allocated for each checkpoint. -- _file_offset:Indicates the order in which the data was written in the same batch of _transaction_id. -- _change_action:Indicates the type of data, which can be either INSERT or DELETE. - -#### Trino and Amoro Type Mapping: - -| Amoro type | Trino type | -| :------------- | :---------------------------- | -| `BOOLEAN` | `BOOLEAN` | -| `INT` | `INTEGER` | -| `LONG` | `BIGINT` | -| `FLOAT` | `REAL` | -| `DOUBLE` | `DOUBLE` | -| `DECIMAL(p,s)` | `DECIMAL(p,s)` | -| `DATE` | `DATE` | -| `TIME` | `TIME(6)` | -| `TIMESTAMP` | `TIMESTAMP(6)` | -| `TIMESTAMPTZ` | `TIMESTAMP(6) WITH TIME ZONE` | -| `STRING` | `VARCHAR` | -| `UUID` | `UUID` | -| `BINARY` | `VARBINARY` | -| `STRUCT(...)` | `ROW(...)` | -| `LIST(e)` | `ARRAY(e)` | -| `MAP(k,v)` | `MAP(k,v)` | - -### Trino uses proxy user to access Hadoop cluster. -By default, when Trino queries Amoro, it uses the Hadoop user configured in the [catalog creation](../managing-catalogs/#create-catalog) to access the Hadoop cluster. -To use Trino's user to access the Hadoop cluster, you need enable Hadoop impersonation by adding the mixed-format.hdfs.impersonation.enabled=true parameter in the Amoro catalog configuration file located in the {trino_home}/etc/catalog directory, as follows. - -```tex -connector.name=mixed-format -amoro.url=thrift://{ip}:{port}/{catalogName} -mixed-format.hdfs.impersonation.enabled=true -``` -`mixed-format.hdfs.impersonation.enabled` default false - -{{< hint info >}} -To use Hadoop impersonation, you need to enable the proxy feature for the Hadoop user configured in the [catalog creation](../managing-catalogs/#create-catalog) in the Hadoop cluster beforehand, -and make sure that it can proxy the Trino querying user. 
Please refer to [Hadoop Proxy User](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/Superusers.html#Configurations) for more information. -{{}} \ No newline at end of file diff --git a/amoro-docs/content/formats/iceberg.md b/amoro-docs/content/formats/iceberg.md deleted file mode 100644 index 922bb9b..0000000 --- a/amoro-docs/content/formats/iceberg.md +++ /dev/null @@ -1,44 +0,0 @@ ---- -title: "Iceberg" -url: iceberg-format -aliases: - - "formats/iceberg" -menu: - main: - parent: Formats - weight: 200 ---- - -# Iceberg Format - -Iceberg format refers to [Apache Iceberg](https://iceberg.apache.org) table, which is an open table format for large analytical datasets designed to provide scalable, efficient, and secure data storage and query solutions. -It supports data operations on multiple storage backends and provides features such as ACID transactions, multi-version control, and schema evolution, making data management and querying more flexible and convenient. - -With the release of [Iceberg v2](https://iceberg.apache.org/spec/), Iceberg addresses the shortcomings of row-level updates through the MOR (Merge On Read) mechanism, which better supports streaming updates. -However, as data and delete files are written, the read performance and availability of the table will decrease, and if not maintained in time, the table will quickly become unusable. - -Iceberg Format - -Starting from Amoro v0.4, Iceberg format including v1 and v2 is supported. Users only need to register Iceberg's catalog in Amoro to host the table for Amoro maintenance. For detailed operation steps, please refer to [Managing Catalogs](../managing-catalogs/). -Amoro maintains the performance and economic availability of Iceberg tables with minimal read/write costs through means such as small file merging, eq-delete file conversion to pos-delete files, duplicate data elimination, and file cleaning, and Amoro has no intrusive impact on the functionality of Iceberg. - -Iceberg format has full upward and downward compatibility features, and in general, users do not have to worry about the compatibility of the Iceberg version used by the engine client with the Iceberg version on which Amoro depends. - -Amoro supports all catalog types supported by Iceberg, including but not limited to: Hadoop, Hive, Glue, JDBC, Nessie, Snowflake, and so on. - -Amoro supports all storage types supported by Iceberg, including but not limited to: Hadoop, S3, AliyunOSS, GCS, ECS, and so on. diff --git a/amoro-docs/content/formats/mixed-hive.md b/amoro-docs/content/formats/mixed-hive.md deleted file mode 100644 index 6b7e6ee..0000000 --- a/amoro-docs/content/formats/mixed-hive.md +++ /dev/null @@ -1,49 +0,0 @@ ---- -title: "Mixed-Hive" -url: mixed-hive-format -aliases: - - "formats/mixed-hive" -menu: - main: - parent: Formats - weight: 400 ---- - -# Mixed-Hive Format - -Mixed-Hive format is a format that has better compatibility with Hive than Mixed-Iceberg format. -Mixed-Hive format uses a Hive table as the BaseStore and an Iceberg table as the ChangeStore. 
Mixed-Hive format supports: -- schema, partition, and types consistent with Hive format -- Using the Hive connector to read and write Mixed-Hive format tables as Hive tables -- Upgrading a Hive table in-place to a Mixed-Hive format table without data rewriting or migration, with a response time in seconds -- All the functional features of Mixed-Iceberg format - -The structure of Mixed-Hive format is shown below: - -![Mixed-Hive format](../images/formats/mixed_hive_format.png) - -In the BaseStore, files under the Hive location are also indexed by the Iceberg manifest, avoiding data redundancy between the two formats. -Mixed-Hive format combines the snapshot, ACID, and MVCC features of Iceberg, and provides a great degree of compatibility with Hive, offering flexible selection and extension options for data platforms, processes, and products built around Hive format in the past. - -{{< hint info >}} -The freshness of data under the Hive location is guaranteed by Full optimizing. -Therefore, the timeliness of native Hive reads is significantly different from that of Mixed-Iceberg tables. -It is recommended to use Merge-on-read to read data with freshness in the order of minutes in Mixed-Hive format. -{{< /hint >}} - - diff --git a/amoro-docs/content/formats/mixed-iceberg.md b/amoro-docs/content/formats/mixed-iceberg.md deleted file mode 100644 index 30eb633..0000000 --- a/amoro-docs/content/formats/mixed-iceberg.md +++ /dev/null @@ -1,93 +0,0 @@ ---- -title: "Mixed-Iceberg" -url: mixed-iceberg-format -aliases: - - "formats/mixed-iceberg" -menu: - main: - parent: Formats - weight: 300 ---- - -# Mixed-Iceberg Format - -Compared with Iceberg format, Mixed-Iceberg format provides more features: -- Stronger primary key constraints that also apply to Spark -- OLAP performance that is production-ready for real-time data warehouses through the auto-bucket mechanism -- LogStore configuration that can reduce data pipeline latency from minutes to milliseconds/seconds -- Transaction conflict resolution mechanism that enables concurrent writes with the same primary key - -The design intention of Mixed-Iceberg format is to provide a storage layer for stream-batch integration and offline-real-time unified data warehouses for big data platforms based on data lakes. -Under this goal-driven approach, Amoro designs Mixed-Iceberg format as a three-tier structure, with each level named after a different TableStore: - -![Mixed-Iceberg format](../images/formats/mixed_format.png) - -- BaseStore — stores the stock data of the table, usually generated by batch computing or optimizing processes, and is more friendly to ReadStore for reading. -- ChangeStore — stores the flow and change data of the table, usually written in real-time by streaming computing, and can also be used for downstream CDC consumption, and is more friendly to WriteStore for writing. -- LogStore — serves as a cache layer for ChangeStore to accelerate stream processing. Amoro manages the consistency between LogStore and ChangeStore. - -The design philosophy of TableStore in Mixed-Iceberg format is similar to that of clustered indexes in databases. Each TableStore can use different table formats. Mixed-Iceberg format provides high freshness OLAP through merge-on-read between BaseStore and ChangeStore. -To provide high-performance merge-on-read, BaseStore and ChangeStore use completely consistent partition and layout, and both support auto-bucket. 
- -The Auto-bucket feature helps the self-optimizing process control the file size of BaseStore within the target-size, and dynamically scale the data volume through bucket splitting and merging while maintaining the base file size as much as possible. -Auto-bucket divides the data under a partition into sets of non-intersecting primary keys in a hash-based manner, greatly reducing the amount of data that needs to be scanned during optimizing and merge-on-read, and improving performance. For more details, please refer to [benchmark](../../../benchmark-report/) - -The auto-bucket feature of the Mixed-Iceberg format references the paper: [Scalable, Distributed Data Structures for Internet Service Construction](https://people.eecs.berkeley.edu/~culler/papers/dds.pdf) - -There are some limitations in using the Mixed-Iceberg format: - -- Compatibility limited — In scenarios where Hive and Iceberg are compatible, there may be a violation of primary key uniqueness or the failure of conflict resolution. -- Primary key constraint — When the primary key does not include partition keys and there are no updates to the stream data, normalized operators or other methods need to be used to restore the previous data to ensure primary key uniqueness. -- Engines integrated — Currently supports reading and writing with Flink and Spark, and querying data with Trino. - -The BaseStore and ChangeStore of the Mixed-Iceberg format both use the Iceberg format and are consistent with Iceberg in schema, types, and partition usage. -While possessing the features of the Mixed-Iceberg format, the BaseStore and ChangeStore can be read and written using the native Iceberg connector, thus having all the functional features of the Iceberg format. -Taking Spark as an example, this paper describes how to operate on the Mixed-Iceberg format table created by Quick demo using the Iceberg connector. We can use the following command to open a Spark SQL client: - -```shell -spark-sql --packages org.apache.Iceberg:Iceberg-spark-runtime-3.2_2.12:0.14.0\ - --conf spark.sql.extensions=org.apache.Iceberg.spark.extensions.IcebergSparkSessionExtensions \ - --conf spark.sql.catalog.local=org.apache.Iceberg.spark.SparkCatalog \ - --conf spark.sql.catalog.local.type=hadoop \ - --conf spark.sql.catalog.local.warehouse=/tmp/Amoro/warehouse -``` - -After that, we can use the following command to read from or write to the Iceberg tables managed by Amoro: - -```shell --- Switch to Iceberg catalog -use local; - --- Show all Iceberg tables -show tables; - --- Query BaseStore -select * from local.test_db.test_table.base; - --- Query ChangeStore -select * from local.test_db.test_table.change; - --- Insert BaseStore -insert into local.test_db.test_table.base value(10, 'tony', timestamp('2022-07-03 12:10:30')); -``` - -More Iceberg-compatible usage can be found in the [Iceberg docs](https://Iceberg.apache.org/docs/latest/). - -{{< hint info >}} -The Minor optimizing feature of Amoro generally ensures that the data freshness of the Iceberg BaseStore is maintained at the minute level. -{{< /hint >}} \ No newline at end of file diff --git a/amoro-docs/content/formats/overview.md b/amoro-docs/content/formats/overview.md deleted file mode 100644 index 939a49b..0000000 --- a/amoro-docs/content/formats/overview.md +++ /dev/null @@ -1,47 +0,0 @@ ---- -title: "Overview" -url: formats-overview -aliases: - - "formats/overview" -menu: - main: - parent: Formats - weight: 100 ---- - -# Formats Overview - -Table format (aka. 
format) was first proposed by Iceberg, which can be described as follows: - -- It defines the relationship between tables and files, and any engine can query and retrieve data files according to the table format. -- New formats such as Iceberg/Delta/Hudi further define the relationship between tables and snapshots, and the relationship between snapshots and files. - All write operations on the table will generate new snapshots, and all read operations on the table are based on snapshots. - Snapshots bring MVCC, ACID, and Transaction capabilities to data lakes. - -In addition, new table formats such as [Iceberg](https://Iceberg.apache.org/) also provide many advanced features such as schema evolve, hidden partition, and data skip. -[Hudi](https://hudi.apache.org/) and [Delta](https://delta.io/) may have some differences in specific functions, but we see that the standard of table formats is gradually established with the functional convergence of these three open-source projects in the past two years. - -For users, the design goal of Amoro is to provide an out-of-the-box data lake system. Internally, Amoro's design philosophy is to use different table formats as storage engines for data lakes. -This design pattern is more common in open-source systems such as MySQL and ClickHouse. - -Currently, Amoro mainly provides the following four table formats: - -- **Iceberg format:** Users can directly entrust their Iceberg tables to Amoro for maintenance, so that users can not only use all the functions of Iceberg tables, but also enjoy the performance and stability improvements brought by Amoro. -- **Mixed-Iceberg format:** Amoro provides a set of more optimized formats for streaming update scenarios on top of the Iceberg format. If users have high performance requirements for streaming updates or have demands for CDC incremental data reading functions, they can choose to use the Mixed-Iceberg format. -- **Mixed-Hive format:** Many users do not want to affect the business originally built on Hive while using data lakes. Therefore, Amoro provides the Mixed-Hive format, which can upgrade Hive tables to Mixed-Hive format only through metadata migration, and the original Hive tables can still be used normally. This ensures business stability and benefits from the advantages of data lake computing. -- **Paimon format:** Amoro supports displaying metadata information in the Paimon format, including Schema, Options, Files, Snapshots, DDLs, and Compaction information. diff --git a/amoro-docs/content/formats/paimon.md b/amoro-docs/content/formats/paimon.md deleted file mode 100644 index 6c24a9e..0000000 --- a/amoro-docs/content/formats/paimon.md +++ /dev/null @@ -1,42 +0,0 @@ ---- -title: "Paimon" -url: paimon-format -aliases: - - "formats/paimon" -menu: - main: - parent: Formats - weight: 200 ---- - -# Paimon Format - -Paimon format refers to [Apache Paimon](https://paimon.apache.org/) table. -Paimon is a streaming data lake platform with high-speed data ingestion, changelog tracking and efficient real-time analytics. - -By registering Paimon's catalog with Amoro, users can view information such as Schema, Options, Files, Snapshots, DDLs, Compaction information, and more for Paimon tables. -Furthermore, they can operate on Paimon tables using Spark SQL in the Terminal. The current supported catalog types and file system types for Paimon are all supported. - -For registering catalog operation steps, please refer to [Managing Catalogs](../managing-catalogs/). 
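For example, a hypothetical sketch of querying a registered Paimon table with Spark SQL in the Terminal (`paimon_catalog`, `test_db`, and `test_table` are placeholder names):

```sql
-- List tables in a database of the registered Paimon catalog
SHOW TABLES IN paimon_catalog.test_db;

-- Query a Paimon table
SELECT * FROM paimon_catalog.test_db.test_table LIMIT 10;
```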
- -{{< hint info >}} -If you want to use S3 or OSS, please download the -[S3](https://repo.maven.apache.org/maven2/org/apache/paimon/paimon-s3/0.5.0-incubating/paimon-s3-0.5.0-incubating.jar), -[OSS](https://repo.maven.apache.org/maven2/org/apache/paimon/paimon-oss/0.5.0-incubating/paimon-oss-0.5.0-incubating.jar) -package and put it in the 'lib' directory of the Amoro installation package. -{{< /hint >}} diff --git a/amoro-docs/content/images/admin/add-optimizer-group.png b/amoro-docs/content/images/admin/add-optimizer-group.png deleted file mode 100644 index acfdb92..0000000 Binary files a/amoro-docs/content/images/admin/add-optimizer-group.png and /dev/null differ diff --git a/amoro-docs/content/images/admin/config-optimizer-group.png b/amoro-docs/content/images/admin/config-optimizer-group.png deleted file mode 100644 index 0c96184..0000000 Binary files a/amoro-docs/content/images/admin/config-optimizer-group.png and /dev/null differ diff --git a/amoro-docs/content/images/admin/create-catalog.png b/amoro-docs/content/images/admin/create-catalog.png deleted file mode 100644 index 3ebe216..0000000 Binary files a/amoro-docs/content/images/admin/create-catalog.png and /dev/null differ diff --git a/amoro-docs/content/images/admin/create-rest_catalog.png b/amoro-docs/content/images/admin/create-rest_catalog.png deleted file mode 100644 index b8b41bb..0000000 Binary files a/amoro-docs/content/images/admin/create-rest_catalog.png and /dev/null differ diff --git a/amoro-docs/content/images/admin/delete_catalog.png b/amoro-docs/content/images/admin/delete_catalog.png deleted file mode 100644 index 2d94a56..0000000 Binary files a/amoro-docs/content/images/admin/delete_catalog.png and /dev/null differ diff --git a/amoro-docs/content/images/admin/hive-table-detail.png b/amoro-docs/content/images/admin/hive-table-detail.png deleted file mode 100644 index 77a2ebd..0000000 Binary files a/amoro-docs/content/images/admin/hive-table-detail.png and /dev/null differ diff --git a/amoro-docs/content/images/admin/hive-table-upgrade.png b/amoro-docs/content/images/admin/hive-table-upgrade.png deleted file mode 100644 index 7705ecd..0000000 Binary files a/amoro-docs/content/images/admin/hive-table-upgrade.png and /dev/null differ diff --git a/amoro-docs/content/images/admin/optimizer_create.png b/amoro-docs/content/images/admin/optimizer_create.png deleted file mode 100644 index d9b8350..0000000 Binary files a/amoro-docs/content/images/admin/optimizer_create.png and /dev/null differ diff --git a/amoro-docs/content/images/admin/optimizer_metrics.png b/amoro-docs/content/images/admin/optimizer_metrics.png deleted file mode 100644 index 6bea032..0000000 Binary files a/amoro-docs/content/images/admin/optimizer_metrics.png and /dev/null differ diff --git a/amoro-docs/content/images/admin/optimizer_release.png b/amoro-docs/content/images/admin/optimizer_release.png deleted file mode 100644 index f9a0969..0000000 Binary files a/amoro-docs/content/images/admin/optimizer_release.png and /dev/null differ diff --git a/amoro-docs/content/images/admin/optimizer_scale.png b/amoro-docs/content/images/admin/optimizer_scale.png deleted file mode 100644 index 89f983d..0000000 Binary files a/amoro-docs/content/images/admin/optimizer_scale.png and /dev/null differ diff --git a/amoro-docs/content/images/admin/table_metrics.png b/amoro-docs/content/images/admin/table_metrics.png deleted file mode 100644 index 7ad46c7..0000000 Binary files a/amoro-docs/content/images/admin/table_metrics.png and /dev/null differ diff --git 
a/amoro-docs/content/images/admin/table_optimizer_history.png b/amoro-docs/content/images/admin/table_optimizer_history.png deleted file mode 100644 index dcb4e97..0000000 Binary files a/amoro-docs/content/images/admin/table_optimizer_history.png and /dev/null differ diff --git a/amoro-docs/content/images/admin/terminal_introduce.png b/amoro-docs/content/images/admin/terminal_introduce.png deleted file mode 100644 index d6549a5..0000000 Binary files a/amoro-docs/content/images/admin/terminal_introduce.png and /dev/null differ diff --git a/amoro-docs/content/images/admin/watermark_table_detail.png b/amoro-docs/content/images/admin/watermark_table_detail.png deleted file mode 100644 index eda1e92..0000000 Binary files a/amoro-docs/content/images/admin/watermark_table_detail.png and /dev/null differ diff --git a/amoro-docs/content/images/concepts/freshness_cost_performance.png b/amoro-docs/content/images/concepts/freshness_cost_performance.png deleted file mode 100644 index f13efa6..0000000 Binary files a/amoro-docs/content/images/concepts/freshness_cost_performance.png and /dev/null differ diff --git a/amoro-docs/content/images/concepts/iceberg_format.png b/amoro-docs/content/images/concepts/iceberg_format.png deleted file mode 100644 index b1a1be1..0000000 Binary files a/amoro-docs/content/images/concepts/iceberg_format.png and /dev/null differ diff --git a/amoro-docs/content/images/concepts/major_optimizing.png b/amoro-docs/content/images/concepts/major_optimizing.png deleted file mode 100644 index 346810f..0000000 Binary files a/amoro-docs/content/images/concepts/major_optimizing.png and /dev/null differ diff --git a/amoro-docs/content/images/concepts/minor_optimizing.png b/amoro-docs/content/images/concepts/minor_optimizing.png deleted file mode 100644 index 107ade1..0000000 Binary files a/amoro-docs/content/images/concepts/minor_optimizing.png and /dev/null differ diff --git a/amoro-docs/content/images/concepts/mixed_format.png b/amoro-docs/content/images/concepts/mixed_format.png deleted file mode 100644 index c223893..0000000 Binary files a/amoro-docs/content/images/concepts/mixed_format.png and /dev/null differ diff --git a/amoro-docs/content/images/concepts/mixed_hive_format.png b/amoro-docs/content/images/concepts/mixed_hive_format.png deleted file mode 100644 index 05a717f..0000000 Binary files a/amoro-docs/content/images/concepts/mixed_hive_format.png and /dev/null differ diff --git a/amoro-docs/content/images/concepts/quota-occupation.png b/amoro-docs/content/images/concepts/quota-occupation.png deleted file mode 100644 index 1888170..0000000 Binary files a/amoro-docs/content/images/concepts/quota-occupation.png and /dev/null differ diff --git a/amoro-docs/content/images/concepts/self-optimizing_arch.png b/amoro-docs/content/images/concepts/self-optimizing_arch.png deleted file mode 100644 index 796c4db..0000000 Binary files a/amoro-docs/content/images/concepts/self-optimizing_arch.png and /dev/null differ diff --git a/amoro-docs/content/images/favicon.ico b/amoro-docs/content/images/favicon.ico deleted file mode 100644 index 38f9f47..0000000 Binary files a/amoro-docs/content/images/favicon.ico and /dev/null differ diff --git a/amoro-docs/content/images/flink/double-write.png b/amoro-docs/content/images/flink/double-write.png deleted file mode 100644 index 3e4ae3c..0000000 Binary files a/amoro-docs/content/images/flink/double-write.png and /dev/null differ diff --git a/amoro-docs/content/images/formats/iceberg_format.png b/amoro-docs/content/images/formats/iceberg_format.png 
deleted file mode 100644 index b1a1be1..0000000 Binary files a/amoro-docs/content/images/formats/iceberg_format.png and /dev/null differ diff --git a/amoro-docs/content/images/formats/mixed_format.png b/amoro-docs/content/images/formats/mixed_format.png deleted file mode 100644 index 10cbea6..0000000 Binary files a/amoro-docs/content/images/formats/mixed_format.png and /dev/null differ diff --git a/amoro-docs/content/images/formats/mixed_hive_format.png b/amoro-docs/content/images/formats/mixed_hive_format.png deleted file mode 100644 index 9490395..0000000 Binary files a/amoro-docs/content/images/formats/mixed_hive_format.png and /dev/null differ diff --git a/amoro-docs/content/images/introduce_amoro.png b/amoro-docs/content/images/introduce_amoro.png deleted file mode 100644 index 8209a11..0000000 Binary files a/amoro-docs/content/images/introduce_amoro.png and /dev/null differ diff --git a/amoro-docs/content/user-guides/cdc-ingestion.md b/amoro-docs/content/user-guides/cdc-ingestion.md deleted file mode 100644 index 280af83..0000000 --- a/amoro-docs/content/user-guides/cdc-ingestion.md +++ /dev/null @@ -1,60 +0,0 @@ ---- -title: "CDC Ingestion" -url: cdc-ingestion -aliases: - - "user-guides/cdc-ingestion" -menu: - main: - parent: User Guides - weight: 400 ---- - -# CDC Ingestion -CDC stands for Change Data Capture, which is a broad concept, as long as it can capture the change data, it can be called CDC. -[Flink CDC](https://github.com/apache/flink-cdc) is a Log message-based data capture tool, all the inventory -and incremental data can be captured. Taking MySQL as an example, it can easily capture Binlog data through -[Debezium](https://debezium.io/)、[Flink CDC](https://github.com/apache/flink-cdc) and process the calculations in real time to send them to the data lake. The data lake can then be -queried by other engines. - -This section will show how to ingest one table or multiple tables into the data lake for both [Iceberg](../iceberg-format/) format and [Mixed-Iceberg](../mixed-iceberg-format/) format. -## Apache Flink CDC - -[**Apache Flink CDC**](https://nightlies.apache.org/flink/flink-cdc-docs-stable/) is a distributed data integration -tool for real time data and batch data. Flink CDC brings the -simplicity and elegance of data integration via YAML to describe the data movement and transformation. - -Amoro provides the relevant code case reference how to complete cdc data to different lakehouse table format, see -[**flink-cdc-ingestion**](../flink-cdc-ingestion) doc - -At the same time, we provide [**Mixed-Iceberg**](../iceberg-format) format, which you can understand as -**STREAMING** For iceberg, which will enhance your real-time processing scene for you - -## Debezium - -Debezium is an open source distributed platform for change data capture. Start it up, point it at your databases, and your apps can start responding to all of the inserts, updates, and deletes that other apps commit to your databases. Debezium is durable and fast, so your apps can respond quickly and never miss an event, even when things go wrong. 
- -### Demo - -Coming Soon - -## Airbyte - -Airbyte is Data integration platform for ELT pipelines from APIs, databases & files to databases, warehouses & lakes - -### Demo -Coming Soon diff --git a/amoro-docs/content/user-guides/configurations.md b/amoro-docs/content/user-guides/configurations.md deleted file mode 100644 index 5fcde88..0000000 --- a/amoro-docs/content/user-guides/configurations.md +++ /dev/null @@ -1,154 +0,0 @@ ---- -title: "Configurations" -url: configurations -aliases: - - "user-guides/configurations" -menu: - main: - parent: User Guides - weight: 300 ---- - -# Table Configurations - -## Multi-level configuration management - -Amoro provides configurations that can be configured at the `Catalog`, `Table`, and `Engine` levels. The configuration -priority is given first to the `Engine`, followed by the `Table`, and finally by the `Catalog`. - -- Catalog: Generally, we recommend -users to set default values for tables through the [Catalog properties configuration](../managing-catalogs/#configure-properties), such as Self-optimizing related configurations. -- Table: We also recommend users to -specify customized configurations when [Create Table](../using-tables/#create-table), which can also be -modified through [Alter Table](../using-tables/#modify-table) operations. -- Engine: If tuning is required in the engines, then consider configuring it at the engine level, refer to -[Spark](../spark-configuration/) and [Flink](../flink-dml/). - -## Self-optimizing configurations - -Self-optimizing configurations are applicable to both Iceberg Format and Mixed streaming Format. - -| Key | Default | Description | -|-----------------------------------------------|------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| self-optimizing.enabled | true | Enables Self-optimizing | -| self-optimizing.group | default | Optimizer group for Self-optimizing | -| self-optimizing.quota | 0.1 | Quota for Self-optimizing, indicating the CPU resource the table can take up | -| self-optimizing.execute.num-retries | 5 | Number of retries after failure of Self-optimizing | -| self-optimizing.target-size | 134217728(128MB) | Target size for Self-optimizing | -| self-optimizing.max-file-count | 10000 | Maximum number of files processed by a Self-optimizing process | -| self-optimizing.max-task-size-bytes | 134217728(128MB) | Maximum file size bytes in a single task for splitting tasks | -| self-optimizing.fragment-ratio | 8 | The fragment file size threshold. We could divide self-optimizing.target-size by this ratio to get the actual fragment file size | -| self-optimizing.min-target-size-ratio | 0.75 | The undersized segment file size threshold. 
Segment files under this threshold will be considered for rewriting | -| self-optimizing.minor.trigger.file-count | 12 | The minimum number of files to trigger minor optimizing is determined by the sum of fragment file count and equality delete file count | -| self-optimizing.minor.trigger.interval | 3600000(1 hour) | The time interval in milliseconds to trigger minor optimizing | -| self-optimizing.major.trigger.duplicate-ratio | 0.1 | The ratio of duplicate data of segment files to trigger major optimizing | -| self-optimizing.full.trigger.interval | -1(closed) | The time interval in milliseconds to trigger full optimizing | -| self-optimizing.full.rewrite-all-files | true | Whether full optimizing rewrites all files or skips files that do not need to be optimized | -| self-optimizing.min-plan-interval | 60000 | The minimum time interval between two self-optimizing planning action | -| self-optimizing.filter | NULL | Filter conditions for self-optimizing, using SQL conditional expressions, without supporting any functions. For the timestamp column condition, the ISO date-time formatter must be used. For example: op_time > '2007-12-03T10:15:30'. | - -## Data-cleaning configurations - -Data-cleaning configurations are applicable to both Iceberg Format and Mixed streaming Format. - -| Key | Default | Description | -|---------------------------------------------|------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| table-expire.enabled | true | Enables periodically expire table | -| change.data.ttl.minutes | 10080(7 days) | Time to live in minutes for data of ChangeStore | -| snapshot.keep.duration | 720min(12 hours) | Table-Expiration keeps the latest snapshots within a specified duration | -| snapshot.keep.min-count | 1 | Minimum number of snapshots retained for table expiration | -| clean-orphan-file.enabled | false | Enables periodically clean orphan files | -| clean-orphan-file.min-existing-time-minutes | 2880(2 days) | Cleaning orphan files keeps the files modified within a specified time in minutes | -| clean-dangling-delete-files.enabled | true | Whether to enable cleaning of dangling delete files | -| data-expire.enabled | false | Whether to enable data expiration | -| data-expire.level | partition | Level of data expiration. Including partition and file | -| data-expire.field | NULL | Field used to determine data expiration, supporting timestamp/timestampz/long type and string type field in date format | -| data-expire.datetime-string-pattern | yyyy-MM-dd | Pattern used for matching string datetime | -| data-expire.datetime-number-format | TIMESTAMP_MS | Timestamp unit for long field. Including TIMESTAMP_MS and TIMESTAMP_S | -| data-expire.retention-time | NULL | Retention period for data expiration. For example, 1d means retaining data for 1 day. Other supported units include h (hour), min (minute), s (second), ms (millisecond), etc. | -| data-expire.base-on-rule | LAST_COMMIT_TIME | A rule to indicate how to start expire data. Including LAST_COMMIT_TIME and CURRENT_TIME. 
LAST_COMMIT_TIME uses the timestamp of latest commit snapshot which is not optimized as the start of the expiration, which ensures that the table has `retention-time` data | - -## Tags configurations - -Tags configurations are applicable to Iceberg Format only now, and will be supported in Mixed Format -soon. - -| Key | Default | Description | -|-------------------------------------------|------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------| -| tag.auto-create.enabled | false | Enables automatically creating tags | -| tag.auto-create.trigger.period | daily | Period of creating tags, support `daily`,`hourly` now | -| tag.auto-create.trigger.offset.minutes | 0 | The minutes by which the tag is created after midnight (00:00) | -| tag.auto-create.trigger.max-delay.minutes | 60 | The maximum delay time for creating a tag | -| tag.auto-create.tag-format | 'tag-'yyyyMMdd for daily and 'tag-'yyyyMMddHH for hourly periods | The format of the name for tag. Modifying this configuration will not take effect on old tags | -| tag.auto-create.max-age-ms | -1 | Time of automatically created Tag to retain, -1 means keep it forever. Modifying this configuration will not take effect on old tags | - -## Mixed Format configurations - -If using Iceberg Format,please refer to [Iceberg configurations](https://iceberg.apache.org/docs/latest/configuration/),the following configurations are only applicable to Mixed Format. - -### Reading configurations - -| Key | Default | Description | -| ---------------------------------- | ---------------- | ---------------------------------- | -| read.split.open-file-cost | 4194304(4MB) | The estimated cost to open a file | -| read.split.planning-lookback | 10 | Number of bins to consider when combining input splits | -| read.split.target-size | 134217728(128MB)| Target size when combining data input splits | -| read.split.delete-ratio | 0.05 | When the ratio of delete files is below this threshold, the read task will be split into more tasks to improve query speed | - -### Writing configurations - -| Key | Default | Description | -|-------------------------------|------------------|-----------------------------------------------------------------------------------------------------------------| -| base.write.format | parquet | File format for the table for BaseStore, applicable to KeyedTable | -| change.write.format | parquet | File format for the table for ChangeStore, applicable to KeyedTable | -| write.format.default | parquet | Default file format for the table, applicable to UnkeyedTable | -| base.file-index.hash-bucket | 4 | Initial number of buckets for BaseStore auto-bucket | -| change.file-index.hash-bucket | 4 | Initial number of buckets for ChangeStore auto-bucket | -| write.target-file-size-bytes | 134217728(128MB) | Target size when writing | -| write.upsert.enabled | false | Enable upsert mode, multiple insert data with the same primary key will be merged if enabled | -| write.distribution-mode | hash | Shuffle rules for writing. 
UnkeyedTable can choose between none and hash, while KeyedTable can only choose hash | -| write.distribution.hash-mode | auto | Auto-bucket mode, which supports primary-key, partition-key, primary-partition-key, and auto | -| base.refresh-interval | -1 (Closed) | The interval for refreshing the BaseStore | - -### LogStore configurations - -| Key | Default | Description | -|-----------------------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| log-store.enabled | false | Enables LogStore | -| log-store.type | kafka | Type of LogStore, which supports 'kafka' and 'pulsar' | -| log-store.address | NULL | Address of LogStore, required when LogStore enabled. For Kafka, this is the Kafka bootstrap servers. For Pulsar, this is the Pulsar Service URL, such as 'pulsar://localhost:6650' | -| log-store.topic | NULL | Topic of LogStore, required when LogStore enabled | -| properties.pulsar.admin.adminUrl | NULL | HTTP URL of Pulsar admin, such as 'http://my-broker.example.com:8080'. Only required when log-store.type=pulsar | -| properties.XXX | NULL | Other configurations of LogStore.

For Kafka, all the configurations supported by Kafka Consumer/Producer can be set by prefixing them with `properties.`,
such as `'properties.batch.size'='16384'`,
refer to [Kafka Consumer Configurations](https://kafka.apache.org/documentation/#consumerconfigs), [Kafka Producer Configurations](https://kafka.apache.org/documentation/#producerconfigs) for more details.

For Pulsar, all the configurations supported by Pulsar can be set by prefixing them with `properties.`,
such as `'properties.pulsar.client.requestTimeoutMs'='60000'`,
refer to [Flink-Pulsar-Connector](https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/connectors/datastream/pulsar) for more details | - -### Watermark configurations - -| Key | Default | Description | -| ----------------------------------------------| ------------------- | ---------------------------------- | -| table.event-time-field | _ingest_time | The event time field for calculating the watermark. The default `_ingest_time` indicates calculating with the time when the data was written | -| table.watermark-allowed-lateness-second | 0 | The allowed lateness time in seconds when calculating watermark | -| table.event-time-field.datetime-string-format | `yyyy-MM-dd HH:mm:ss` | The format of event time when it is in string format | -| table.event-time-field.datetime-number-format | TIMESTAMP_MS | The format of event time when it is in numeric format, which supports TIMESTAMP_MS (timestamp in milliseconds) and TIMESTAMP_S (timestamp in seconds)| - -### Mixed-Hive format configurations - -| Key | Default | Description | -|-----------------------------------|------------------|--------------------------------------------------------------------------------------------------------| -| base.hive.auto-sync-schema-change | true | Whether synchronize schema changes of Hive Table from HMS | -| base.hive.auto-sync-data-write | false | Whether synchronize data changes of Hive Table from HMS, this should be true when writing to Hive | -| base.hive.consistent-write.enabled | true | To avoid writing dirty data, the files written to the Hive directory will be hidden files and renamed to visible files upon commit. | diff --git a/amoro-docs/content/user-guides/metrics.md b/amoro-docs/content/user-guides/metrics.md deleted file mode 100644 index 22855f9..0000000 --- a/amoro-docs/content/user-guides/metrics.md +++ /dev/null @@ -1,121 +0,0 @@ ---- -title: "Metrics" -url: metrics -aliases: - - "user-guides/metrics" -menu: - main: - parent: User Guides - weight: 500 ---- - -# Metrics - -Amoro build a metrics system to measure the behaviours of table management processes, like how long has it been since a table last performed self-optimizing process, and how much resources does a optimizer group currently has? - -There are two types of metrics provided in the Amoro metric system: Gauge and Counter. - -* Gauge: Provides a value of any type at a point in time. -* Counter: Used to count values by incrementing and decrementing. - -Amoro has supported built-in metrics to measure status of table self-optimizing processes and optimizer resources, which can be [reported to external metric system like Prometheus etc](../deployment/#configure-metric-reporter). 

## Self-optimizing metrics

| Metric Name                                           | Type    | Tags                     | Description |
|-------------------------------------------------------|---------|--------------------------|-------------|
| table_optimizing_status_idle_duration_mills           | Gauge   | catalog, database, table | Duration in milliseconds that the table has been in the idle status |
| table_optimizing_status_pending_duration_mills        | Gauge   | catalog, database, table | Duration in milliseconds that the table has been in the pending status |
| table_optimizing_status_planning_duration_mills       | Gauge   | catalog, database, table | Duration in milliseconds that the table has been in the planning status |
| table_optimizing_status_executing_duration_mills      | Gauge   | catalog, database, table | Duration in milliseconds that the table has been in the executing status |
| table_optimizing_status_committing_duration_mills     | Gauge   | catalog, database, table | Duration in milliseconds that the table has been in the committing status |
| table_optimizing_process_total_count                  | Counter | catalog, database, table | Count of all optimizing processes since AMS started |
| table_optimizing_process_failed_count                 | Counter | catalog, database, table | Count of failed optimizing processes since AMS started |
| table_optimizing_minor_total_count                    | Counter | catalog, database, table | Count of minor optimizing processes since AMS started |
| table_optimizing_minor_failed_count                   | Counter | catalog, database, table | Count of failed minor optimizing processes since AMS started |
| table_optimizing_major_total_count                    | Counter | catalog, database, table | Count of major optimizing processes since AMS started |
| table_optimizing_major_failed_count                   | Counter | catalog, database, table | Count of failed major optimizing processes since AMS started |
| table_optimizing_full_total_count                     | Counter | catalog, database, table | Count of full optimizing processes since AMS started |
| table_optimizing_full_failed_count                    | Counter | catalog, database, table | Count of failed full optimizing processes since AMS started |
| table_optimizing_status_in_idle                       | Gauge   | catalog, database, table | Whether the table is currently in the idle status |
| table_optimizing_status_in_pending                    | Gauge   | catalog, database, table | Whether the table is currently in the pending status |
| table_optimizing_status_in_planning                   | Gauge   | catalog, database, table | Whether the table is currently in the planning status |
| table_optimizing_status_in_executing                  | Gauge   | catalog, database, table | Whether the table is currently in the executing status |
| table_optimizing_status_in_committing                 | Gauge   | catalog, database, table | Whether the table is currently in the committing status |
| table_optimizing_since_last_minor_optimization_mills  | Gauge   | catalog, database, table | Duration in milliseconds since the last successful minor optimization |
| table_optimizing_since_last_major_optimization_mills  | Gauge   | catalog, database, table | Duration in milliseconds since the last successful major optimization |
| table_optimizing_since_last_full_optimization_mills   | Gauge   | catalog, database, table | Duration in milliseconds since the last successful full optimization |
| table_optimizing_since_last_optimization_mills        | Gauge   | catalog, database, table | Duration in milliseconds since the last successful optimization |
| table_optimizing_lag_duration_mills                   | Gauge   | catalog, database, table | Duration in milliseconds between the last self-optimizing snapshot and the latest refreshed snapshot |

## Optimizer Group metrics

| Metric Name | Type | Tags | Description |
|------------------------------------------|--------|-------|------------------------------------------------------|
| optimizer_group_pending_tasks             | Gauge  | group | Number of pending tasks in the optimizer group        |
| optimizer_group_executing_tasks           | Gauge  | group | Number of executing tasks in the optimizer group      |
| optimizer_group_planing_tables            | Gauge  | group | Number of planning tables in the optimizer group      |
| optimizer_group_pending_tables            | Gauge  | group | Number of pending tables in the optimizer group       |
| optimizer_group_executing_tables          | Gauge  | group | Number of executing tables in the optimizer group     |
| optimizer_group_idle_tables               | Gauge  | group | Number of idle tables in the optimizer group          |
| optimizer_group_committing_tables         | Gauge  | group | Number of committing tables in the optimizer group    |
| optimizer_group_optimizer_instances       | Gauge  | group | Number of optimizer instances in the optimizer group  |
| optimizer_group_memory_bytes_allocated    | Gauge  | group | Memory bytes allocated in the optimizer group         |
| optimizer_group_threads                   | Gauge  | group | Number of total threads in the optimizer group        |

## Orphan Files Cleaning metrics

| Metric Name                                         | Type    | Tags                     | Description |
|-----------------------------------------------------|---------|--------------------------|-------------|
| table_orphan_content_file_cleaning_count            | Counter | catalog, database, table | Count of orphan content files cleaned in the table since AMS started |
| table_orphan_metadata_file_cleaning_count           | Counter | catalog, database, table | Count of orphan metadata files cleaned in the table since AMS started |
| table_expected_orphan_content_file_cleaning_count   | Counter | catalog, database, table | Expected count of orphan content files cleaned in the table since AMS started |
| table_expected_orphan_metadata_file_cleaning_count  | Counter | catalog, database, table | Expected count of orphan metadata files cleaned in the table since AMS started |

## AMS service metrics

| Metric Name                     | Type  | Tags              | Description |
|---------------------------------|-------|-------------------|-------------|
| ams_jvm_cpu_load                | Gauge |                   | The recent CPU usage of the AMS |
| ams_jvm_cpu_time                | Gauge |                   | The CPU time used by the AMS |
| ams_jvm_memory_heap_used        | Gauge |                   | The amount of heap memory currently used (in bytes) by the AMS |
| ams_jvm_memory_heap_committed   | Gauge |                   | The amount of heap memory committed for JVM use (in bytes) |
| ams_jvm_memory_heap_max         | Gauge |                   | The maximum heap memory (in bytes), set by the -Xmx JVM argument |
| ams_jvm_threads_count           | Gauge |                   | The total number of live threads used by the AMS |
| ams_jvm_garbage_collector_count | Gauge | garbage_collector | The collection count of the JVM garbage collector, such as G1 Young |
| ams_jvm_garbage_collector_time  | Gauge | garbage_collector | The time spent by the JVM garbage collector, such as G1 Young |

## Table summary metrics

| Metric Name                                  | Type  | Tags                     | Description |
|----------------------------------------------|-------|--------------------------|-------------|
| table_summary_total_files                    | Gauge | catalog, database, table | Total number of files in the table |
| table_summary_data_files                     | Gauge | catalog, database, table | Number of data files in the table |
| table_summary_equality_delete_files          | Gauge | catalog, database, table | Number of equality delete files in the table |
| table_summary_position_delete_files         | Gauge | catalog, database, table | Number of position delete files in the table |
| table_summary_dangling_delete_files         | Gauge | catalog, database, table | Number of dangling delete files in the table |
| table_summary_total_files_size              | Gauge | catalog, database, table | Total size of files in the table |
| table_summary_data_files_size               | Gauge | catalog, database, table | Size of data files in the table |
| table_summary_equality_delete_files_size    | Gauge | catalog, database, table | Size of equality delete files in the table |
| table_summary_position_delete_files_size    | Gauge | catalog, database, table | Size of position delete files in the table |
| table_summary_total_records                 | Gauge | catalog, database, table | Total records in the table |
| table_summary_data_files_records            | Gauge | catalog, database, table | Records of data files in the table |
| table_summary_equality_delete_files_records | Gauge | catalog, database, table | Records of equality delete files in the table |
| table_summary_position_delete_files_records | Gauge | catalog, database, table | Records of position delete files in the table |
| table_summary_snapshots                     | Gauge | catalog, database, table | Number of snapshots in the table |
| table_summary_health_score                  | Gauge | catalog, database, table | Health score of the table |
diff --git a/amoro-docs/content/user-guides/using-tables.md b/amoro-docs/content/user-guides/using-tables.md
deleted file mode 100644
index fd63667..0000000
--- a/amoro-docs/content/user-guides/using-tables.md
+++ /dev/null
@@ -1,268 +0,0 @@
---
title: "Using Tables"
url: using-tables
aliases:
  - "user-guides/using-tables"
menu:
  main:
    parent: User Guides
    weight: 100
---

# Using Tables

The SQL execution tool `Terminal` is provided in the AMS dashboard to help users quickly create, modify, and delete tables.
Tables can also be managed with SQL in [Spark](../spark-ddl/), [Flink](../flink-ddl/), and other engines.

## Create table

After logging into the AMS dashboard, go to `Terminal`, enter the table creation statement, and execute it to complete the table creation.
The following is an example of table creation:

```sql
create table test_db.test_log_store(
    id int,
    name string,
    op_time timestamp,
    primary key(id)
) using mixed_iceberg
partitioned by(days(op_time))
tblproperties(
    'log-store.enabled' = 'true',
    'log-store.type' = 'kafka',
    'log-store.address' = '127.0.0.1:9092',
    'log-store.topic' = 'local_catalog.test_db.test_log_store.log_store',
    'table.event-time-field' = 'op_time',
    'table.watermark-allowed-lateness-second' = '60');
```

Currently, `Terminal` uses the Spark engine for SQL execution. For more information on the syntax for creating tables, refer to [Spark DDL](../spark-ddl/#create-table). Different catalogs create different table formats; refer to [Create Catalog](../managing-catalogs/#create-catalog).

### Configure LogStore

As described in [Mixed-Iceberg format](../mixed-iceberg-format/), a Mixed-Iceberg table may consist of several components, and the BaseStore and ChangeStore are created automatically upon table creation.
LogStore, as an optional component, requires a separate configuration to be specified. The complete configuration for LogStore can be found in [LogStore Configurations](../configurations/#logstore-configurations).
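
The `properties.` prefix described in [LogStore Configurations](../configurations/#logstore-configurations) passes engine-side options straight through to the LogStore. Below is a minimal sketch, assuming these pass-through keys can be set on an existing table with `ALTER TABLE` like any other table property; the Kafka option values are placeholders:

```sql
-- Pass Kafka client options through to the LogStore via the 'properties.' prefix.
ALTER TABLE test_db.test_log_store set tblproperties (
    'properties.batch.size' = '16384',
    'properties.max.poll.records' = '500');
```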

In the create-table example above, the Kafka cluster 127.0.0.1:9092 and the topic local_catalog.test_db.test_log_store.log_store are used as the LogStore for the new table.
Before executing the create-table statement, you need to manually create the corresponding topic in the Kafka cluster, or enable automatic topic creation for the cluster.

### Configure watermark

Watermark is used to describe the write progress of a table. Specifically, it is a timestamp attribute on the table, indicating that all data with a timestamp smaller than the watermark has been written to the table.
It is generally used to observe the write progress of a table and can also serve as a trigger metric for downstream batch computing tasks.

In the example above, op_time is set as the event time field of the table, and the op_time of the written data is used to calculate the watermark of the table.
To handle out-of-order writes, the permitted lateness of data when calculating the watermark is set to one minute.
You can view the current watermark of the table in the table details on the AMS dashboard.

![mixed-format-table-watermark](../images/admin/watermark_table_detail.png)

You can also use the following SQL statement in the `Terminal` to query the watermark of a table:

```sql
SHOW TBLPROPERTIES test_db.test_log_store ('watermark.table');
```

You can expect to get the following results:

```text
+-----------------+---------------+
| key             | value         |
+-----------------+---------------+
| watermark.table | 1668579055000 |
+-----------------+---------------+
```

{{< hint info >}}
Watermark configuration is only supported in the Mixed-Hive format and the Mixed-Iceberg format, and is not supported in the Iceberg format for now.
{{< /hint >}}

## Modify table

After logging into the AMS dashboard, go to the `Terminal` and enter the modification statement to complete the table modification. The current `Terminal` uses the Spark engine to execute SQL. For more information on modifying tables, please refer to the syntax guide [Spark DDL](../spark-ddl#alter-statement).

## Upgrade a Hive table

Amoro supports the [Mixed-Hive format](../mixed-hive-format/), which combines the capabilities of the Hive format to implement a new table format directly on top of Hive.

After logging into the AMS dashboard, select a table under a Hive catalog from the `Tables` menu to perform the upgrade operation.

![Hive Table Detail](../images/admin/hive-table-detail.png)

Click the `Upgrade` button in the upper right corner of the table details (this button is not displayed for Hive tables that have already been upgraded).

![Hive Table Upgrade](../images/admin/hive-table-upgrade.png)

On the upgrade page, select the primary key for the table and add additional parameters, then click `OK` to complete the upgrade of the Hive table.

## Configure self-optimizing

Amoro provides a self-optimizing feature, which requires an active optimizer in the optimizer group configured for the table.

### Modify optimizer group

To have self-optimizing performed by an optimizer launched under a specific optimizer group, modify the `self-optimizing.group` property of the table to point it to that resource pool.
The setting method is as follows:

```sql
ALTER TABLE test_db.test_log_store set tblproperties (
    'self-optimizing.group' = 'group_name');
```

By default, `'self-optimizing.group' = 'default'`.

### Adjust optimizing resources

If there are multiple tables to be optimized under the same optimizer group, you can manually adjust the resource proportion of each table by adjusting the `quota`.

```sql
ALTER TABLE test_db.test_log_store set tblproperties (
    'self-optimizing.quota' = '0.1');
```

For more information, please refer to [Self-optimizing quota](../self-optimizing/#quota).

### Adjust optimizing parameters

You can manually set parameters such as the execution interval, task size, and execution timeout for different types of optimizing.
For example, to set the execution interval for minor optimizing, you can do the following:

```sql
ALTER TABLE test_db.test_log_store set tblproperties (
    'self-optimizing.minor.trigger.interval' = '3600000');
```

For more optimizing parameters, refer to [Self-optimizing configurations](../configurations/#self-optimizing-configurations).

### Enable or disable self-optimizing

Self-optimizing is enabled on a table by default. To disable it, execute the following command; set the property back to 'true' to re-enable it:

```sql
ALTER TABLE test_db.test_log_store set tblproperties (
    'self-optimizing.enabled' = 'false');
```

## Configure data expiration

Amoro can periodically clean data based on the table's expiration policy, which includes properties such as whether to enable expiration, the retention duration, the expiration level, and the field used to determine expiration.
It is also necessary for AMS to have the data expiration thread enabled; you can enable the 'data-expiration' property in the AMS configuration file.

### Enable or disable data expiration

By default, Amoro has data expiration disabled. If you want to enable data expiration, please execute the following command.

```sql
ALTER TABLE test_db.test_log_store set tblproperties (
    'data-expire.enabled' = 'true');
```

### Set retention period

The configuration for the data retention duration consists of a number and a unit. For example, '90d' represents retaining data for 90 days, and '12h' indicates 12 hours.

```sql
ALTER TABLE test_db.test_log_store set tblproperties (
    'data-expire.retention-time' = '90d');
```

### Select expiration field

Data expiration requires users to specify a field for determining expiration.
In addition to the timestampz/timestamp field types, string and long field types are also supported.
String fields require a date pattern for proper parsing, with the default format being 'yyyy-MM-dd'. Long fields can also be chosen as the expiration event time, but you need to specify the timestamp's unit, which can be `TIMESTAMP_MS` or `TIMESTAMP_S`.
Note that timestamp, timestampz, and long field types use UTC, while others use the local time zone.

```sql
ALTER TABLE test_db.test_log_store set tblproperties (
    'data-expire.field' = 'op_time');

-- select a string field
ALTER TABLE test_db.test_log_store set tblproperties (
    'data-expire.field' = 'op_time',
    'data-expire.datetime-string-pattern' = 'yyyy-MM-dd');

-- select a long field
ALTER TABLE test_db.test_log_store set tblproperties (
    'data-expire.field' = 'op_time',
    'data-expire.datetime-number-format' = 'TIMESTAMP_MS');
```

### Adjust expiration level

Data expiration supports two levels: `PARTITION` and `FILE`. The default level is `PARTITION`, which means that AMS deletes files only when all the files within a partition have expired.

```sql
ALTER TABLE test_db.test_log_store set tblproperties (
    'data-expire.level' = 'partition');
```

### Specify start time

Amoro expires data starting from `LAST_COMMIT_TIME` or `CURRENT_TIME`. `LAST_COMMIT_TIME` uses the timestamp of the table's most recent snapshot as the start time of expiration, which ensures that the table keeps `data-expire.retention-time` of data, while `CURRENT_TIME` uses the current time of the service.

```sql
ALTER TABLE test_db.test_log_store set tblproperties (
    'data-expire.base-on-rule' = 'CURRENT_TIME');
```

## Delete table

After logging into the AMS dashboard, enter the drop statement in the `Terminal` and execute it to delete the table.

Here is an example of how to delete a table:

```sql
DROP TABLE test_db.test_log_store;
```

The current `Terminal` uses the Spark engine to execute SQL. For more information about deleting tables, you can refer to [Spark DDL](../spark-ddl/#drop-table).

## Explore table details

The Amoro table details page provides multiple tabs to display the status of the table from various dimensions, mainly including:

| **Tab Name** | **Description** |
|--------------|-----------------|
| Details      | Displays the table's schema, primary key configuration, partition configuration, and properties, as well as the metric information of the files stored in the ChangeStore and BaseStore, including the number of files, the average file size, and the latest commit time of the files. |
| Files        | Displays all partitions and files of the table. |
| Snapshots    | Displays all snapshots of the table, which can be filtered by branch and tag. |
| Optimizing   | Displays all the self-optimizing processes of the table; each record shows the number and average size of files before and after optimizing, as well as the execution time of each process. |
| Operations   | Displays the table's historical DDL change records. |

![table-details](../images/admin/table_metrics.png)

![table-optimize-history](../images/admin/table_optimizer_history.png)

## Explore self-optimizing status

The Optimizing page displays the self-optimizing status of all tables.

![optimizing-metrics](../images/admin/optimizer_metrics.png)

- **Optimizing Status**: The current optimizing status of the table, including idle, pending, planning, minor, major, full, and committing.
  - idle: means that self-optimizing is not required on the table.
  - pending: means that self-optimizing is required on the table and is waiting for resources.
  - planning: means that the self-optimizing process is being planned.
  - minor: means that minor optimizing is being executed on the table.
  - major: means that major optimizing is being executed on the table.
  - full: means that full optimizing is being executed on the table.
  - committing: means that the self-optimizing process is being committed.
- **Duration**: The duration of the current status.
- **File Count**: The total number of files involved in the current self-optimizing process, including base, insert, eq-delete, and pos-delete file types.
- **File Size**: The total size of files involved in the current self-optimizing process.
- **Quota**: The share of self-optimizing execution time allocated to the table per unit time.
- **Quota Occupation**: The actual quota used by the table's self-optimizing over the last hour, relative to the configured quota, as illustrated in the sketch below. When optimizer resources are sufficient and the table requires more resources for self-optimizing, this value will be greater than 100%. When resources are scarce or the table requires fewer resources for self-optimizing, this value will be less than 100%.
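
To make the quota numbers concrete, here is a hedged sketch. The 12% figure is purely illustrative, and reading Quota Occupation as actual usage divided by the configured quota is an assumption based on the description above, not a statement of the exact implementation:

```sql
-- Illustrative only: with 'self-optimizing.quota' = '0.1' the table is granted roughly
-- 10% of self-optimizing execution time per unit time. If its optimizing tasks actually
-- consumed about 12% of that time over the last hour, the dashboard would show a
-- Quota Occupation of about 120%. Raising the quota grants the table a larger share
-- when the optimizer group has spare capacity.
ALTER TABLE test_db.test_log_store set tblproperties (
    'self-optimizing.quota' = '0.2');
```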