Data Explorer: Fix failures with DuckDB CSV reader on files with columns having difficult-to-infer data types (#5764)

Addresses #5746. In very large CSV files, DuckDB can infer an integer type for a column from the sampled rows and then fail when attempting to convert the rest of the data file to integers. One such data file is found at
https://s3.amazonaws.com/data.patentsview.org/download/g_patent.tsv.zip.

This PR changes the CSV import to fall back on `sample_size=-1` (which uses the entire file for type inference, rather than a sample of rows) in these exceptional cases. This means the file takes longer to load, but that is better than failing completely.
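In other words, the importer now retries with sampling disabled instead of surfacing the inference error. A rough TypeScript sketch of that fallback, using a hypothetical `runQuery(sql)` helper in place of the extension's actual DuckDB wrapper and placeholder table and file names:

```typescript
// Hypothetical illustration of the fallback described above; `runQuery` stands in
// for the extension's DuckDB query helper and is not the actual API.
async function importCsvWithFallback(
	runQuery: (sql: string) => Promise<unknown>,
	filePath: string,
	tableName: string
): Promise<void> {
	const importQuery = (extraOptions: string) =>
		`CREATE OR REPLACE TABLE ${tableName} AS
		SELECT * FROM read_csv_auto('${filePath}'${extraOptions});`;
	try {
		// Fast path: DuckDB infers column types from a sample of rows.
		await runQuery(importQuery(''));
	} catch (error) {
		// Sampled inference can pick a type (e.g. BIGINT) that later rows violate.
		// Retry with sample_size=-1 so type inference scans the entire file.
		await runQuery(importQuery(', sample_size=-1'));
	}
}
```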

I made a couple of other incidental changes:

* Always use `CREATE TABLE` when importing CSV files, which gives better performance in exchange for higher memory use (we can wait for people to complain about memory problems before working more on this; using a temporary on-disk DuckDB database file instead of an in-memory one is one potential way around it). I made sure that live file updates weren't broken by these changes.
* Always use `CREATE VIEW` with Parquet files, since single-threaded DuckDB is plenty snappy without converting the Parquet file to its own internal data format. Both statement shapes are sketched below.
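For concreteness, here is roughly what the two import statements look like per file type (illustrative catalog and file names only; the real queries are assembled in `createTableFromFilePath` in the diff below):

```typescript
// Illustrative only: placeholder catalog and file names.
// CSV/TSV: materialize into a DuckDB table (faster scans, more memory).
const csvImport = `CREATE OR REPLACE TABLE positron_0 AS
	SELECT * FROM read_csv_auto('data.csv');`;

// Parquet: keep a view over the file (low memory; DuckDB scans Parquet lazily).
const parquetImport = `CREATE VIEW positron_1 AS
	SELECT * FROM parquet_scan('data.parquet');`;
```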

## QA Notes

Loading the 1 GB TSV file linked above into the Data Explorer takes tens of seconds because duckdb-wasm is single-threaded, so just wait! It will eventually load.
wesm authored Dec 18, 2024
1 parent 54473a7 commit 130349a
Showing 1 changed file with 49 additions and 37 deletions.
extensions/positron-duckdb/src/extension.ts: 86 changes (49 additions & 37 deletions)
@@ -1221,7 +1221,7 @@ export class DataExplorerRpcHandler {
 		fs.watchFile(filePath, { interval: 1000 }, async () => {
 			const newTableName = `positron_${this._tableIndex++}`;
 
-			await this.createTableFromFilePath(filePath, newTableName);
+			await this.createTableFromFilePath(filePath, newTableName, true);
 
 			const newSchema = (await this.db.runQuery(`DESCRIBE ${newTableName};`)).toArray();
 			await tableView.onFileUpdated(newTableName, newSchema);
@@ -1237,53 +1237,65 @@ export class DataExplorerRpcHandler {
 		return {};
 	}
 
-	async createTableFromFilePath(filePath: string, catalogName: string) {
-		// Depending on the size of the file, we use CREATE VIEW (less memory,
-		// but slower) vs CREATE TABLE (more memory but faster). We may tweak this threshold,
-		// and especially given that Parquet files may materialize much larger than that appear
-		// on disk.
-		const VIEW_THRESHOLD = 25_000_000;
-		const fileStats = fs.statSync(filePath);
-
-		const getCtasQuery = (filePath: string, catalogType: string = 'TABLE') => {
-			const fileExt = path.extname(filePath);
-			let scanOperation;
-			switch (fileExt) {
-				case '.parquet':
-				case '.parq':
-					scanOperation = `parquet_scan('${filePath}')`;
-					break;
-				// TODO: Will need to be able to pass CSV / TSV options from the
-				// UI at some point.
-				case '.csv':
-					scanOperation = `read_csv('${filePath}')`;
-					break;
-				case '.tsv':
-					scanOperation = `read_csv('${filePath}', delim='\t')`;
-					break;
-				default:
-					throw new Error(`Unsupported file extension: ${fileExt}`);
+	/**
+	 * Import data file into DuckDB by creating table or view
+	 * @param filePath The file path on disk.
+	 * @param catalogName The table name to use in the DuckDB catalog.
+	 * @param forceRefresh If true, force an uncached read from disk to circumvent DuckDB's
+	 * file handle caching.
+	 */
+	async createTableFromFilePath(filePath: string, catalogName: string,
+		forceRefresh: boolean = false) {
+
+		const getCsvImportQuery = (_filePath: string, options: Array<String>) => {
+			return `CREATE OR REPLACE TABLE ${catalogName} AS
+			SELECT * FROM read_csv_auto('${_filePath}'${options.length ? ', ' : ''}${options.join(' ,')});`;
+		};
+
+		const importDelimited = async (filePath: string, catalogType: string = 'TABLE',
+			extraParams: string = '') => {
+			// TODO: Will need to be able to pass CSV / TSV options from the
+			// UI at some point.
+			const options: Array<string> = [];
+			if (fileExt === '.csv') {
+				options.push('delim=\',\'');
+			} else if (fileExt === '.tsv') {
+				options.push('delim=\'\t\'');
+			} else {
+				throw new Error(`Unsupported file extension: ${fileExt}`);
+			}
+
+			let query = getCsvImportQuery(filePath, options);
+			try {
+				await this.db.runQuery(query);
+			} catch (error) {
+				// Retry with sample_size=-1 to disable sampling if type inference fails
+				options.push('sample_size=-1');
+				query = getCsvImportQuery(filePath, options);
+				await this.db.runQuery(query);
 			}
-			return `CREATE ${catalogType} ${catalogName} AS
-			SELECT * FROM ${scanOperation};`;
 		};
 
-		if (fileStats.size < VIEW_THRESHOLD) {
-			// For smaller files, read the entire contents and register it as a temp file to avoid
-			// caching in duckdb-wasm
+		const fileExt = path.extname(filePath);
+
+		if (fileExt === '.parquet' || fileExt === '.parq') {
+			// Always create a view for Parquet files
+			const query = `CREATE VIEW ${catalogName} AS
+			SELECT * FROM parquet_scan('${filePath}');`;
+			await this.db.runQuery(query);
+		} else if (forceRefresh) {
+			// For smaller text files, read the entire contents and register it as a temp file
+			// to avoid caching in duckdb-wasm
 			const fileContents = fs.readFileSync(filePath, { encoding: null });
 			const virtualPath = path.basename(filePath);
 			await this.db.db.registerFileBuffer(virtualPath, fileContents);
 
-			const ctasQuery = getCtasQuery(virtualPath);
 			try {
-				await this.db.runQuery(ctasQuery);
+				await importDelimited(virtualPath);
 			} finally {
 				await this.db.db.dropFile(virtualPath);
 			}
 		} else {
-			const ctasQuery = getCtasQuery(filePath);
-			await this.db.runQuery(ctasQuery);
+			await importDelimited(filePath);
 		}
 	}
