Add LOC checking + option calculateLines: #34

Nixinova · Sep 14, 2024 · 49e5047 · 49e5047
1 parent 2403222
commit 49e5047
Show file tree

Hide file tree

Showing 8 changed files with 118 additions and 28 deletions.
diff --git a/changelog.md b/changelog.md
@@ -2,7 +2,9 @@
 
 ## Next
 - Added application of GitHub-Linguist override rule `linguist-detectable`.
+- Added lines of code (LOC) calculation to the output.
 - Added option `checkDetected` to control the application of `linguist-detectable` overrides.
+- Added option `calculateLines` (defaults to true) to control whether LOC calculations are performed.
 
 ## 2.7.1
 *2024-05-08*

diff --git a/readme.md b/readme.md
@@ -50,6 +50,11 @@ Running LinguistJS on this folder will return the following JSON:
   "files": {
     "count": 5,
     "bytes": 6020,
+    "lines": {
+      "total": 100,
+      "content": 90,
+      "code": 80,
+    },
     "results": {
       "/src/index.ts": "TypeScript",
       "/src/cli.js": "JavaScript",
@@ -64,16 +69,26 @@ Running LinguistJS on this folder will return the following JSON:
   "languages": {
     "count": 3,
     "bytes": 6010,
+    "lines": {
+      "total": 90,
+      "content": 80,
+      "code": 70,
+    },
     "results": {
-        "JavaScript": { "type": "programming", "bytes": 1000, "color": "#f1e05a" },
-        "Markdown": { "type": "prose", "bytes": 3000, "color": "#083fa1" },
-        "Ruby": { "type": "programming", "bytes": 10, "color": "#701516" },
-        "TypeScript": { "type": "programming", "bytes": 2000, "color": "#2b7489" },
+        "JavaScript": { "type": "programming", "bytes": 1000, "lines": { "total": 49, "content": 49, "code": 44 }, "color": "#f1e05a" },
+        "Markdown": { "type": "prose", "bytes": 3000, "lines": { "total": 10, "content": 5, "code": 5 }, "color": "#083fa1" },
+        "Ruby": { "type": "programming", "bytes": 10, "lines": { "total": 1, "content": 1, "code": 1 }, "color": "#701516" },
+        "TypeScript": { "type": "programming", "bytes": 2000, "lines": { "total": 30, "content": 25, "code": 20 }, "color": "#2b7489" },
     },
   },
   "unknown": {
     "count": 1,
     "bytes": 10,
+    "lines": {
+      "total": 10,
+      "content": 10,
+      "code": 10,
+    },
     "filenames": {
       "no-lang": 10,
     },
@@ -130,6 +145,8 @@ const { files, languages, unknown } = await linguist(fileNames, { fileContent, .
       Alias for `checkAttributes:false, checkIgnored:false, checkDetected:false, checkHeuristics:false, checkShebang:false, checkModeline:false`.
     - `offline` (boolean):
       Whether to use pre-packaged metadata files instead of fetching them from GitHub at runtime (defaults to `false`).
+    - `calculateLines` (boolean):
+      Whether to calculate lines of code totals (defaults to `true`).
     - `keepVendored` (boolean):
       Whether to keep vendored files (dependencies, etc) (defaults to `false`).
       Does nothing when `fileContent` is set.
@@ -189,6 +206,8 @@ linguist --version
     Alias for `--checkAttributes=false --checkIgnored=false --checkHeuristics=false --checkShebang=false --checkModeline=false`.
   - `--offline`:
     Use pre-packaged metadata files instead of fetching them from GitHub at runtime.
+  - `--calculateLines`:
+    Calculate lines of code totals from files (enabled by default).
   - `--keepVendored`:
     Include vendored files (auto-generated files, dependencies folder, etc) in the output.
   - `--keepBinary`:

diff --git a/src/cli.ts b/src/cli.ts
@@ -24,6 +24,7 @@ program
 	.option('-F|--listFiles [bool]', 'Whether to list every matching file under the language results', false)
 	.option('-q|--quick [bool]', 'Skip complex language analysis (alias for -{A|I|H|S}=false)', false)
 	.option('-o|--offline [bool]', 'Use packaged data files instead of fetching latest from GitHub', false)
+	.option('-L|--calculateLines [bool]', 'Calculate lines of code totals', true)
 	.option('-V|--keepVendored [bool]', 'Prevent skipping over vendored/generated files', false)
 	.option('-B|--keepBinary [bool]', 'Prevent skipping over binary files', false)
 	.option('-r|--relativePaths [bool]', 'Convert absolute file paths to relative', false)
@@ -78,16 +79,17 @@ if (args.analyze) (async () => {
 			}
 		}
 		// List parsed results
-		for (const [lang, { bytes, color }] of sortedEntries) {
+		for (const [lang, { bytes, lines, color }] of sortedEntries) {
 			const percent = (bytes: number) => bytes / (totalBytes || 1) * 100;
 			const fmtd = {
 				index: (++count).toString().padStart(2, ' '),
 				lang: lang.padEnd(24, ' '),
 				percent: percent(bytes).toFixed(2).padStart(5, ' '),
 				bytes: bytes.toLocaleString().padStart(10, ' '),
+				loc: lines.code.toLocaleString().padStart(10, ' '),
 				icon: colouredMsg(hexToRgb(color ?? '#ededed'), '\u2588'),
 			};
-			console.log(`  ${fmtd.index}. ${fmtd.icon} ${fmtd.lang} ${fmtd.percent}% ${fmtd.bytes} B`);
+			console.log(`  ${fmtd.index}. ${fmtd.icon} ${fmtd.lang} ${fmtd.percent}% ${fmtd.bytes} B ${fmtd.loc} LOC`);
 
 			// If using `listFiles` option, list all files tagged as this language
 			if (args.listFiles) {

diff --git a/src/helpers/read-file.ts b/src/helpers/read-file.ts
@@ -4,7 +4,7 @@ import fs from 'fs';
  * Read part of a file on disc.
  * @throws 'EPERM' if the file is not readable.
  */
-export default async function readFile(filename: string, onlyFirstLine: boolean = false): Promise<string> {
+export default async function readFileChunk(filename: string, onlyFirstLine: boolean = false): Promise<string> {
 	const chunkSize = 100;
 	const stream = fs.createReadStream(filename, { highWaterMark: chunkSize });
 	let content = '';

diff --git a/src/index.ts b/src/index.ts
@@ -8,7 +8,7 @@ import { isBinaryFile } from 'isbinaryfile';
 
 import walk from './helpers/walk-tree';
 import loadFile, { parseGeneratedDataFile } from './helpers/load-data';
-import readFile from './helpers/read-file';
+import readFileChunk from './helpers/read-file';
 import parseAttributes, { FlagAttributes } from './helpers/parse-gitattributes';
 import pcre from './helpers/convert-pcre';
 import { normPath } from './helpers/norm-path';
@@ -24,6 +24,7 @@ async function analyse(rawPaths?: string | string[], opts: T.Options = {}): Prom
 
 	// Normalise input option arguments
 	opts = {
+		calculateLines: opts.calculateLines ?? true, // default to true if unset
 		checkIgnored: !opts.quick,
 		checkDetected: !opts.quick,
 		checkAttributes: !opts.quick,
@@ -46,9 +47,9 @@ async function analyse(rawPaths?: string | string[], opts: T.Options = {}): Prom
 	const extensions: Record<T.AbsFile, string> = {};
 	const globOverrides: Record<T.AbsFile, T.LanguageResult> = {};
 	const results: T.Results = {
-		files: { count: 0, bytes: 0, results: {}, alternatives: {} },
-		languages: { count: 0, bytes: 0, results: {} },
-		unknown: { count: 0, bytes: 0, extensions: {}, filenames: {} },
+		files: { count: 0, bytes: 0, lines: { total: 0, content: 0, code: 0 }, results: {}, alternatives: {} },
+		languages: { count: 0, bytes: 0, lines: { total: 0, content: 0, code: 0 }, results: {} },
+		unknown: { count: 0, bytes: 0, lines: { total: 0, content: 0, code: 0 }, extensions: {}, filenames: {} },
 	};
 
 	// Set a common root path so that vendor paths do not incorrectly match parent folders
@@ -107,7 +108,7 @@ async function analyse(rawPaths?: string | string[], opts: T.Options = {}): Prom
 		for (const attrFile of nestedAttrFiles) {
 			const relAttrFile = relPath(attrFile);
 			const relAttrFolder = paths.dirname(relAttrFile);
-			const contents = await readFile(attrFile);
+			const contents = await readFileChunk(attrFile);
 			const parsed = parseAttributes(contents, relAttrFolder);
 			for (const { glob, attrs } of parsed) {
 				manualAttributes[glob] = attrs;
@@ -231,7 +232,7 @@ async function analyse(rawPaths?: string | string[], opts: T.Options = {}): Prom
 			firstLine = manualFileContent[files.indexOf(file)]?.split('\n')[0] ?? null;
 		}
 		else if (fs.existsSync(file) && !fs.lstatSync(file).isDirectory()) {
-			firstLine = await readFile(file, true).catch(() => null);
+			firstLine = await readFileChunk(file, true).catch(() => null);
 		}
 		else continue;
 
@@ -347,7 +348,7 @@ async function analyse(rawPaths?: string | string[], opts: T.Options = {}): Prom
 				}
 
 				// Check file contents and apply heuristic patterns
-				const fileContent = opts.fileContent ? manualFileContent[files.indexOf(file)] : await readFile(file).catch(() => null);
+				const fileContent = opts.fileContent ? manualFileContent[files.indexOf(file)] : await readFileChunk(file).catch(() => null);
 
 				// Skip if file read errors
 				if (fileContent === null) continue;
@@ -373,7 +374,6 @@ async function analyse(rawPaths?: string | string[], opts: T.Options = {}): Prom
 	}
 
 	// Skip specified categories
-	// todo linguist-detectable
 	if (opts.categories?.length) {
 		const categories: T.Category[] = ['data', 'markup', 'programming', 'prose'];
 		const hiddenCategories = categories.filter(cat => !opts.categories!.includes(cat));
@@ -417,8 +417,21 @@ async function analyse(rawPaths?: string | string[], opts: T.Options = {}): Prom
 	// Load language bytes size
 	for (const [file, lang] of Object.entries(results.files.results)) {
 		if (lang && !langData[lang]) continue;
+		// Calculate file size
 		const fileSize = manualFileContent[files.indexOf(file)]?.length ?? fs.statSync(file).size;
-		results.files.bytes += fileSize;
+		// Calculate lines of code
+		const loc = { total: 0, content: 0, code: 0 };
+		if (opts.calculateLines) {
+			// TODO: catch error?
+			const fileContent = (manualFileContent[files.indexOf(file)] ?? fs.readFileSync(file).toString()) ?? '';
+			const allLines = fileContent.split(/\r?\n/gm);
+			loc.total = allLines.length;
+			loc.content = allLines.filter(line => line.trim().length > 0).length;
+			const codeLines = fileContent
+				.replace(/^\s*(\/\/|# |;|--).+/gm, '')
+				.replace(/\/\*.+\*\/|<!--.+-->/sg, '')
+			loc.code = codeLines.split(/\r?\n/gm).filter(line => line.trim().length > 0).length;
+		}
 		// If no language found, add extension in other section
 		if (!lang) {
 			const ext = paths.extname(file);
@@ -427,16 +440,36 @@ async function analyse(rawPaths?: string | string[], opts: T.Options = {}): Prom
 			results.unknown[unknownType][name] ??= 0;
 			results.unknown[unknownType][name] += fileSize;
 			results.unknown.bytes += fileSize;
+			results.unknown.lines.total += loc.total;
+			results.unknown.lines.content += loc.content;
+			results.unknown.lines.code += loc.code;
 			continue;
 		}
 		// Add language and bytes data to corresponding section
 		const { type } = langData[lang];
-		results.languages.results[lang] ??= { type, bytes: 0, color: langData[lang].color };
+		results.languages.results[lang] ??= { type, bytes: 0, lines: { total: 0, content: 0, code: 0 }, color: langData[lang].color };
 		if (opts.childLanguages) {
 			results.languages.results[lang].parent = langData[lang].group;
 		}
+		// apply file sizes
+		results.files.bytes += fileSize;
 		results.languages.results[lang].bytes += fileSize;
 		results.languages.bytes += fileSize;
+		// apply LOC calculations
+		results.files.lines.total += loc.total;
+		results.files.lines.content += loc.content;
+		results.files.lines.code += loc.code;
+		results.languages.results[lang].lines.total += loc.total;
+		results.languages.results[lang].lines.content += loc.content;
+		results.languages.results[lang].lines.code += loc.code;
+		results.languages.lines.total += loc.total;
+		results.languages.lines.content += loc.content;
+		results.languages.lines.code += loc.code;
+	}
+
+	// Set lines output to NaN when line calculation is disabled
+	if (opts.calculateLines === false) {
+		results.files.lines = { total: NaN, content: NaN, code: NaN }
 	}
 
 	// Set counts

diff --git a/src/types.ts b/src/types.ts
@@ -21,6 +21,7 @@ export interface Options {
 	childLanguages?: boolean
 	quick?: boolean
 	offline?: boolean
+	calculateLines?: boolean
 	checkIgnored?: boolean
 	checkDetected?: boolean
 	checkAttributes?: boolean
@@ -33,15 +34,30 @@ export interface Results {
 	files: {
 		count: Integer
 		bytes: Bytes
+		lines: {
+			total: Integer
+			content: Integer
+			code: Integer
+		}
 		/** Note: Results use slashes as delimiters even on Windows. */
 		results: Record<FilePath, LanguageResult>
 		alternatives: Record<FilePath, LanguageResult[]>
 	}
 	languages: {
 		count: Integer
 		bytes: Bytes
+		lines: {
+			total: Integer
+			content: Integer
+			code: Integer
+		}
 		results: Record<Language, {
 			bytes: Bytes
+			lines: {
+				total: Integer
+				content: Integer
+				code: Integer
+			}
 			type: Category
 			parent?: Language
 			color?: `#${string}`
@@ -50,6 +66,11 @@ export interface Results {
 	unknown: {
 		count: Integer
 		bytes: Bytes
+		lines: {
+			total: Integer
+			content: Integer
+			code: Integer
+		}
 		extensions: Record<string, Bytes>
 		filenames: Record<string, Bytes>
 	}

diff --git a/test/expected.json b/test/expected.json
@@ -1,14 +1,16 @@
 {
 	"files": {
-		"count": 11,
-		"bytes": 69,
+		"count": 12,
+		"bytes": 190,
+		"lines": { "total": 25, "content": 15, "code": 10 },
 		"results": {
 			"~/al.al": "Perl",
 			"~/alternatives.asc": "AGS Script",
 			"~/file.txt": "JavaScript",
 			"~/folder/file.txt": "JavaScript",
 			"~/folder/sub.txt": "Text",
 			"~/hashbang": "JavaScript",
+			"~/loc.c": "C",
 			"~/modeline.txt": "C++",
 			"~/package-lock.json": "JSON",
 			"~/detected.json": "JSON",
@@ -20,21 +22,23 @@
 		}
 	},
 	"languages": {
-		"count": 7,
-		"bytes": 56,
+		"count": 8,
+		"bytes": 190,
 		"results": {
-			"Perl": { "type": "programming", "bytes": 0, "color": "#0298c3" },
-			"AGS Script": { "type": "programming", "bytes": 14, "color": "#B9D9FF" },
-			"JavaScript": { "type": "programming", "bytes": 23, "color": "#f1e05a" },
-			"JSON": { "type": "data", "bytes": 6, "color": "#292929"},
-			"Text": { "type": "prose", "bytes": 0 },
-			"C++": { "type": "programming", "bytes": 15, "color": "#f34b7d" },
-			"TOML": { "type": "data", "bytes": 0, "color": "#9c4221" }
+			"Perl": { "type": "programming", "bytes": 0, "lines": { "total": 1, "content": 0, "code": 0 },"color": "#0298c3" },
+			"AGS Script": { "type": "programming", "bytes": 14, "lines": { "total": 2, "content": 1, "code": 1 },"color": "#B9D9FF" },
+			"JSON": { "type": "data", "bytes": 8, "lines": { "total": 4, "content": 2, "code": 2 },"color": "#292929"},
+			"JavaScript": { "type": "programming", "bytes": 23, "lines": { "total": 4, "content": 3, "code": 3 },"color": "#f1e05a" },
+			"Text": { "type": "prose", "bytes": 0, "lines": { "total": 1, "content": 0, "code": 0 } },
+			"C": { "type": "programming", "bytes": 130, "lines": { "total": 10, "content": 8, "code": 4 }, "color": "#555555"},
+			"C++": { "type": "programming", "bytes": 15, "lines": { "total": 2, "content": 1, "code": 0 }, "color": "#f34b7d" },
+			"TOML": { "type": "data", "bytes": 0, "lines": { "total": 1, "content": 0, "code": 0 }, "color": "#9c4221" }
 		}
 	},
 	"unknown": {
 		"count": 1,
 		"bytes": 9,
+		"lines": { "total": 2, "content": 1, "code": 1 },
 		"extensions": {},
 		"filenames": {
 			"unknown": 9

diff --git a/test/samples/loc.c b/test/samples/loc.c
@@ -0,0 +1,9 @@
+#include <stdio.h>
+
+// empty line above
+int main() {
+	return 1;
+}
+/*
+total lines: 10, content lines: 8, code lines: 4
+*/