Merge pull request #3 from Maxxen/main

Add support for reading/writing `.xlsx` files
duckdb · Dec 6, 2024 · 2e7e415 · 2e7e415
2 parents 0e99dc7 + 871dcf9
commit 2e7e415
Show file tree

Hide file tree

Showing 89 changed files with 13,530 additions and 10,073 deletions.
diff --git a/.clang-format b/.clang-format
@@ -0,0 +1,32 @@
+---
+BasedOnStyle: LLVM
+TabWidth: 4
+IndentWidth: 4
+ColumnLimit: 120
+AllowShortFunctionsOnASingleLine: false
+---
+UseTab: ForIndentation
+DerivePointerAlignment: false
+PointerAlignment: Right
+AlignConsecutiveMacros: true
+AlignTrailingComments: true
+AllowAllArgumentsOnNextLine: true
+AllowAllConstructorInitializersOnNextLine: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AlignAfterOpenBracket: Align
+SpaceBeforeCpp11BracedList: true
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpacesInAngles: false
+SpacesInCStyleCastParentheses: false
+SpacesInConditionalStatement: false
+AllowShortLambdasOnASingleLine: Inline
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakTemplateDeclarations: Yes
+IncludeBlocks: Regroup
+Language: Cpp
+AccessModifierOffset: -4
+---
+Language: Java
+SpaceAfterCStyleCast: true
+---
diff --git a/.clang-tidy b/.clang-tidy
@@ -0,0 +1,52 @@
+Checks:          '-*,clang-diagnostic-*,bugprone-*,performance-*,google-explicit-constructor,google-build-using-namespace,google-runtime-int,misc-definitions-in-headers,modernize-use-nullptr,modernize-use-override,-bugprone-macro-parentheses,readability-braces-around-statements,-bugprone-branch-clone,readability-identifier-naming,hicpp-exception-baseclass,misc-throw-by-value-catch-by-reference,-bugprone-signed-char-misuse,-bugprone-misplaced-widening-cast,-bugprone-sizeof-expression,-bugprone-easily-swappable-parameters,google-global-names-in-headers,llvm-header-guard,misc-definitions-in-headers,modernize-use-emplace,modernize-use-bool-literals,-performance-inefficient-string-concatenation,-performance-no-int-to-ptr,readability-container-size-empty,cppcoreguidelines-pro-type-cstyle-cast,-llvm-header-guard,-performance-enum-size,cppcoreguidelines-pro-type-const-cast,cppcoreguidelines-avoid-non-const-global-variables,cppcoreguidelines-interfaces-global-init,cppcoreguidelines-slicing,cppcoreguidelines-rvalue-reference-param-not-moved,cppcoreguidelines-virtual-class-destructor,-readability-identifier-naming,-bugprone-exception-escape,-bugprone-unused-local-non-trivial-variable,-bugprone-empty-catch'
+WarningsAsErrors: '*'
+HeaderFilterRegex: 'src/include/duckdb/.*'
+FormatStyle:     none
+CheckOptions:
+  - key:             readability-identifier-naming.ClassCase
+    value:           CamelCase
+  - key:             readability-identifier-naming.EnumCase
+    value:           CamelCase
+  - key:             readability-identifier-naming.TypedefCase
+    value:           lower_case
+  - key:             readability-identifier-naming.TypedefSuffix
+    value:           _t
+  - key:             readability-identifier-naming.FunctionCase
+    value:           CamelCase
+  - key:             readability-identifier-naming.MemberCase
+    value:           lower_case
+  - key:             readability-identifier-naming.ParameterCase
+    value:           lower_case
+  - key:             readability-identifier-naming.ConstantCase
+    value:           aNy_CasE
+  - key:             readability-identifier-naming.ConstantParameterCase
+    value:           lower_case
+  - key:             readability-identifier-naming.NamespaceCase
+    value:           lower_case
+  - key:             readability-identifier-naming.MacroDefinitionCase
+    value:           UPPER_CASE
+  - key:             readability-identifier-naming.StaticConstantCase
+    value:           UPPER_CASE
+  - key:             readability-identifier-naming.ConstantMemberCase
+    value:           aNy_CasE
+  - key:             readability-identifier-naming.StaticVariableCase
+    value:           UPPER_CASE
+  - key:             readability-identifier-naming.ClassConstantCase
+    value:           UPPER_CASE
+  - key:             readability-identifier-naming.EnumConstantCase
+    value:           UPPER_CASE
+  - key:             readability-identifier-naming.ConstexprVariableCase
+    value:           aNy_CasE
+  - key:             readability-identifier-naming.StaticConstantCase
+    value:           UPPER_CASE
+  - key:             readability-identifier-naming.TemplateTemplateParameterCase
+    value:           UPPER_CASE
+  - key:             readability-identifier-naming.TypeTemplateParameterCase
+    value:           UPPER_CASE
+  - key:             readability-identifier-naming.VariableCase
+    value:           lower_case
+  - key:             modernize-use-emplace.SmartPointers
+    value:           '::duckdb::shared_ptr;::duckdb::unique_ptr;::std::auto_ptr;::duckdb::weak_ptr'
+  - key:             cppcoreguidelines-rvalue-reference-param-not-moved.IgnoreUnnamedParams
+    value:           true
+
diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml
@@ -14,18 +14,20 @@ concurrency:
 jobs:
   duckdb-stable-build:
     name: Build extension binaries
-    uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@v1.0.0
+    uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@v1.1.3
     with:
-      duckdb_version: v1.0.0
+      duckdb_version: v1.1.3
+      ci_tools_version: v1.1.3
       extension_name: excel
 
   duckdb-stable-deploy:
     name: Deploy extension binaries
     needs: duckdb-stable-build
-    uses: duckdb/extension-ci-tools/.github/workflows/_extension_deploy.yml@v1.0.0
+    uses: duckdb/extension-ci-tools/.github/workflows/_extension_deploy.yml@v1.1.3
     secrets: inherit
     with:
-      duckdb_version: v1.0.0
+      duckdb_version: v1.1.3
+      ci_tools_version: v1.1.3
       extension_name: excel
       deploy_latest: ${{ startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main' }}
       deploy_versioned: ${{ startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main' }}
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,20 +1,35 @@
 cmake_minimum_required(VERSION 2.8.12...3.29)
 set(TARGET_NAME excel)
 set(EXTENSION_NAME ${TARGET_NAME}_extension)
+set(LOADABLE_EXTENSION_NAME ${TARGET_NAME}_loadable_extension)
 
 project(ExcelExtension)
 
-include_directories(extension/excel/numformat/include)
-include_directories(extension/excel/include)
-add_subdirectory(extension/excel/numformat)
+# Dependencies from VCPKG
+find_package(EXPAT REQUIRED)
+find_package(ZLIB REQUIRED)
+find_package(minizip-ng CONFIG REQUIRED)
 
-build_static_extension(excel extension/excel/excel_extension.cpp ${NUMFORMAT_OBJECT_FILES})
+include_directories(src/excel/numformat/include)
+include_directories(src/excel/include)
+add_subdirectory(src/excel/numformat)
+
+set(EXTENSION_SOURCES src/excel/excel_extension.cpp src/excel/xlsx/zip_file.cpp
+                      src/excel/xlsx/read_xlsx.cpp src/excel/xlsx/copy_xlsx.cpp)
+
+build_static_extension(${TARGET_NAME} ${EXTENSION_SOURCES}
+                       ${NUMFORMAT_OBJECT_FILES})
 set(PARAMETERS "-warnings")
-build_loadable_extension(excel ${PARAMETERS} extension/excel/excel_extension.cpp
+build_loadable_extension(${TARGET_NAME} ${PARAMETERS} ${EXTENSION_SOURCES}
                          ${NUMFORMAT_OBJECT_FILES})
 
+target_link_libraries(${EXTENSION_NAME} EXPAT::EXPAT MINIZIP::minizip-ng
+                      ZLIB::ZLIB)
+target_link_libraries(${LOADABLE_EXTENSION_NAME} EXPAT::EXPAT
+                      MINIZIP::minizip-ng ZLIB::ZLIB)
+
 install(
-  TARGETS excel_extension
+  TARGETS ${EXTENSION_NAME}
   EXPORT "${DUCKDB_EXPORT_SET}"
   LIBRARY DESTINATION "${INSTALL_LIB_DIR}"
   ARCHIVE DESTINATION "${INSTALL_LIB_DIR}")
diff --git a/README.md b/README.md
@@ -17,3 +17,95 @@ SELECT text(1234567.897, '$#.##') AS result;
 # Documentation
 
 See the [Excel page](https://duckdb.org/docs/extensions/excel) in the DuckDB documentation.
+
+# XLSX Files
+
+## Reading XLSX Files
+
+`.xlsx` files can be read using the `read_xlsx` function. The following named parameters are supported.
+
+__Options__:
+
+| Option | Type | Default|  Description |
+| --- | --- | --- | --- |
+| `header` | `BOOLEAN` | _automatically inferred_  | Whether to treat the first row as containing the names of the resulting columns |
+| `sheet`| `VARCHAR` | _automatically inferred_ | The name of the sheet in the xlsx file to read. Default is the first sheet. |
+| `all_varchar` | `BOOLEAN` | `false` | Whether to read all cells as containing `VARCHAR`s. |
+| `ignore_errors` | `BOOLEAN` | `false` | Whether to ignore errors and silently replace cells that can't be cast to the corresponding inferred column type with `NULL`s. |
+| `range` | `VARCHAR` |  _automatically inferred_ | The range of cells to read. For example, `A1:B2` reads the cells from A1 to B2. If not specified, the resulting range will be inferred as the rectangular region of cells between the first row of consecutive non-empty cells and the first empty row spanning the same columns. |
+| `stop_at_empty` | `BOOLEAN` | `false/true` | Whether to stop reading the file when an empty row is encountered. If an explicit `range` option is provided, this is `false` by default, otherwise `true` | 
+| `empty_as_varchar` | `BOOLEAN` | `false` | Whether to treat empty cells as `VARCHAR` instead of `DOUBLE` when trying to automatically infer column types |
+
+__Example usage__:
+
+```sql
+SELECT * FROM read_xlsx('test.xlsx', header = true);
+----
+┌────────┬────────┐
+│   a    │   b    │
+│ double │ double │
+├────────┼────────┤
+│    1.0 │    2.0 │
+│    3.0 │    4.0 │
+└────────┴────────┘
+
+-- Alternatively, we can use an xlsx file as a "replacement scan" and select from it immediately
+-- but without being able to pass options.
+
+SELECT * FROM 'test.xlsx';
+----
+┌────────┬────────┐
+│   a    │   b    │
+│ double │ double │
+├────────┼────────┤
+│    1.0 │    2.0 │
+│    3.0 │    4.0 │
+└────────┴────────┘
+```
+
+## Writing XLSX Files
+
+Writing `.xlsx` files is supported using the `COPY` statement with `XLSX` given as the format. The following additional parameters are supported.
+
+__Options__:
+
+| Option | Type | Default   | Description                                                                          |
+| --- | --- |-----------|--------------------------------------------------------------------------------------|
+| `header` | `BOOLEAN` | `false`   | Whether to write the column names as the first row in the sheet                      |
+| `sheet`| `VARCHAR` | `Sheet1`  | The name of the sheet in the xlsx file to write.                                     |
+| `sheet_row_limit` | `INTEGER` | `1048576` | The maximum number of rows in a sheet. An error is thrown if this limit is exceeded. |
+
+__Example usage__:
+
+```sql
+CREATE TABLE test AS SELECT * FROM (VALUES (1, 2), (3, 4)) AS t(a, b);
+COPY test TO 'test.xlsx' (format 'xlsx', header 'true');
+```
+
+## Type Conversions and Inference
+
+Because XLSX files only really support storing strings and numbers, the equivalent of `VARCHAR` and `DOUBLE`, the following type conversions are applied when writing XLSX files.
+- Numeric types are cast to `DOUBLE` when writing to an XLSX file.
+- Temporal types (`TIMESTAMP`, `DATE`, `TIME`, etc.) are converted to Excel "serial" numbers, that is, the number of days since 1900-01-01 for dates and the fraction of a day for times. These are then styled with a "number format" so that they appear as dates or times in Excel.
+- `TIMESTAMP_TZ` and `TIME_TZ` are cast to UTC `TIMESTAMP` and `TIME` respectively, with the timezone information being lost.
+- `BOOLEAN`s are converted to `1` and `0`, with a "number format" applied to make them appear as `TRUE` and `FALSE` in Excel.
+- All other types are cast to `VARCHAR` and then written as text cells.
+
+When reading XLSX files, almost everything is read as either `DOUBLE` or `VARCHAR` depending on the Excel cell type. However, there are some caveats.
+- We try to infer `TIMESTAMP`, `TIME`, `DATE` and `BOOLEAN` types when possible based on the cell format.
+- We infer text cells containing `TRUE` and `FALSE` as `BOOLEAN`, but that is the only type-inference we do that is based on the actual content of the cell.
+- Empty cells are considered to be `DOUBLE` by default, unless the `empty_as_varchar` option is set to `true`, in which case they are typed as `VARCHAR`.
+
+If the `all_varchar` option is set to `true`, none of the above applies and all cells are read as `VARCHAR`.
+
+When no types are specified explicitly (e.g. when using the `read_xlsx` function instead of `COPY TO ... FROM '<file>.xlsx'`),
+the types of the resulting columns are inferred based on the first "data" row in the sheet, that is:
+- If no explicit range is given
+  - The first row after the header if a header is found or forced by the `header` option
+  - The first non-empty row in the sheet if no header is found or forced
+- If an explicit range is given
+  - The second row of the range if a header is found in the first row or forced by the `header` option
+  - The first row of the range if no header is found or forced 
+
+This can sometimes lead to issues if the first "data row" is not representative of the rest of the sheet (e.g. it contains empty cells), in which case the `ignore_errors` or `empty_as_varchar` options can be used to work around this.
+Alternatively, when the `COPY TO ... FROM '<file>.xlsx'` syntax is used, no type inference is done and the types of the resulting columns are determined by the types of the columns in the table being copied to. All cells will simply be converted by casting from `DOUBLE` or `VARCHAR` to the target column type.
diff --git a/duckdb b/duckdb
diff --git a/extension-ci-tools b/extension-ci-tools
diff --git a/extension/excel/excel_config.py b/extension/excel/excel_config.py
+39 −0		.github/workflows/TestCITools.yml
+5 −4		.github/workflows/_extension_deploy.yml
+430 −81		.github/workflows/_extension_distribution.yml
+6 −1		README.md
+63 −0		config/distribution_matrix.json
+5 −0		docker/README.md
+99 −0		docker/linux_amd64/Dockerfile
+80 −0		docker/linux_amd64_gcc4/Dockerfile
+98 −0		docker/linux_arm64/Dockerfile
+34 −7		makefiles/duckdb_extension.Makefile
+83 −0		scripts/append_extension_metadata.py
+75 −0		scripts/modify_distribution_matrix.py