Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 6 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,11 @@ rust-version = "1.71"
edition = "2021"

[features]
all = ["cbor", "yaml"]
all = ["cbor", "yaml", "csv"]
cbor = ["dep:hex", "dep:serde_cbor"]
default = []
yaml = ["dep:serde_yaml"]
csv = ["dep:csv"]

[profile.release]
lto = "thin"
Expand Down Expand Up @@ -77,6 +78,10 @@ version = "0.11.2"
optional = true
version = "0.9.34"

[dependencies.csv]
optional = true
version = "1.3.1"

[dev-dependencies]
proptest = "1.5.0"
strum = "0.26.3"
Expand Down
8 changes: 6 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ Tokei is a program that displays statistics about your code. Tokei will show the
- Tokei has huge range of languages, supporting over **150** languages, and
their various extensions.

- Tokei can output in multiple formats(**CBOR**, **JSON**, **YAML**)
- Tokei can output in multiple formats(**CBOR**, **JSON**, **YAML**, **CSV**)
allowing Tokei's output to be easily stored, and reused. These can also be
reused in tokei combining a previous run's statistics with another set.

Expand Down Expand Up @@ -219,12 +219,16 @@ tokei with the features flag.

YAML:
cargo install tokei --features yaml

CSV:
cargo install tokei --features csv
```

**Currently supported formats**
- JSON `--output json`
- YAML `--output yaml`
- CBOR `--output cbor`
- CSV `--output csv`

```shell
$ tokei ./foo --output json
Expand Down Expand Up @@ -268,7 +272,7 @@ OPTIONS:
-i, --input <file_input> Gives statistics from a previous tokei run. Can be given a file path, or "stdin" to
read from stdin.
-o, --output <output> Outputs Tokei in a specific format. Compile with additional features for more format
support. [possible values: cbor, json, yaml]
support. [possible values: cbor, json, yaml, csv]
-s, --sort <sort> Sort languages based on column [possible values: files, lines, blanks, code, comments]
-t, --type <types> Filters output by language type, separated by a comma. i.e. -t=Rust,Markdown

Expand Down
2 changes: 1 addition & 1 deletion src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,7 @@ impl Cli {
cli
}

pub fn file_input(&self) -> Option<&str> {
pub fn file_input(&self) -> Option<String> {
self.matches.get_one("file_input").cloned()
}

Expand Down
269 changes: 269 additions & 0 deletions src/input.rs
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,10 @@ supported_formats!(
(yaml, "yaml", Yaml [serde_yaml]) =>
serde_yaml::from_str,
serde_yaml::to_string,

(csv, "csv", Csv [csv]) =>
serialize_csv::from_str,
serialize_csv::to_string,
);

pub fn add_input(input: &str, languages: &mut Languages) -> bool {
Expand Down Expand Up @@ -204,6 +208,271 @@ fn convert_input(contents: &str) -> Option<LanguageMap> {
self::Format::parse(contents)
}

#[cfg(feature = "csv")]
mod serialize_csv {
//! CSV serialization
//!
//! Linearizes hierarchical blob structures into flat CSV format.
//!
//! Files contain Reports with CodeStats that have nested blobs:
//!
//! ```
//! README.md (Markdown)
//! └─ Rust (code block)
//! └─ Markdown (comment nested within Rust)
//! ```
//!
//! are flattened using `nested` column:
//!
//! | File | Language | Nested | Lines | Code | Comments | Blanks |
//! |-----------|----------|-----------------|-------|------|----------|--------|
//! | README.md | Markdown | "" | 100 | 80 | 15 | 5 |
//! | README.md | Markdown | "Rust" | 50 | 45 | 3 | 2 |
//! | README.md | Markdown | "Rust,Markdown" | 20 | 18 | 1 | 1 |
//!
//! using depth-first traversal of the CodeStats blob tree.

use std::collections::hash_map::Entry;
use std::collections::BTreeMap;
use std::collections::HashMap;
use std::error::Error;
use std::path::PathBuf;

use super::LanguageMap;
use super::Output;
use serde::Deserialize;
use serde::Deserializer;
use serde::Serialize;
use tokei::CodeStats;
use tokei::Language;
use tokei::LanguageType;
use tokei::Report;

/// CSV record for language statistics.
///
/// Represents either:
/// - Primary file stats (nested = empty)
/// - Nested blob stats (nested = path to blob)
#[derive(Debug, Serialize, Deserialize)]
#[serde(rename_all = "PascalCase")]
struct Record {
file: PathBuf,
/// File's primary language (constant for all blobs)
language: LanguageType,
/// Comma-separated nested path (e.g., "Rust,Markdown")
#[serde(
serialize_with = "Record::serialize_nested_langs",
deserialize_with = "Record::deserialize_nested_langs"
)]
nested: Vec<LanguageType>,
lines: usize,
code: usize,
comments: usize,
blanks: usize,
/// Accuracy flag
inaccurate: bool,
}

impl Record {
fn new(
file: PathBuf,
language: LanguageType,
nested: Vec<LanguageType>,
inaccurate: bool,
stats: &CodeStats,
) -> Self {
Self {
file,
language,
nested,
lines: stats.lines(),
code: stats.code,
comments: stats.comments,
blanks: stats.blanks,
inaccurate,
}
}

fn serialize_nested_langs<S>(
nested: &[LanguageType],
serializer: S,
) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
if nested.is_empty() {
return serializer.serialize_str("");
}
let s = nested
.iter()
.map(LanguageType::to_string)
.collect::<Vec<String>>()
.join(",");
serializer.serialize_str(&s)
}

fn deserialize_nested_langs<'de, D>(deserializer: D) -> Result<Vec<LanguageType>, D::Error>
where
D: Deserializer<'de>,
{
let s = String::deserialize(deserializer)?;
let s = s.trim();

if s.is_empty() {
return Ok(Vec::new());
}

s.split(',')
.map(|x| {
LanguageType::from_name(x.trim())
.ok_or_else(|| serde::de::Error::custom(format!("Unknown language {x}")))
})
.collect()
}

fn to_code_stats(&self) -> CodeStats {
let mut cs = CodeStats::new();

cs.blanks = self.blanks;
cs.code = self.code;
cs.comments = self.comments;

cs
}

fn to_report(&self) -> Report {
let mut report = Report::new(self.file.clone());
report.stats += self.to_code_stats();
report
}
}

/// Recursively serializes blob tree to CSV records.
///
/// Depth-first traversal maintaining current path in `nested` vector.
fn serialize_blobs(
csv: &mut csv::Writer<Vec<u8>>,
file: &PathBuf,
primary: LanguageType,
nested: &mut Vec<LanguageType>,
blobs: &BTreeMap<LanguageType, CodeStats>,
) -> Result<(), Box<dyn Error>> {
for (lang_type, stats) in blobs {
nested.push(*lang_type);

csv.serialize(Record::new(
file.clone(),
primary,
nested.clone(),
false,
stats,
))?;

serialize_blobs(csv, file, primary, nested, &stats.blobs)?;
nested.pop();
}

Ok(())
}

pub(super) fn to_string(output: &Output) -> Result<String, Box<dyn Error>> {
let mut csv = csv::Writer::from_writer(vec![]);

for (lang_type, lang) in &output.languages {
for report in &lang.reports {
csv.serialize(Record::new(
report.name.clone(),
*lang_type,
Vec::new(),
lang.inaccurate,
&report.stats,
))?;
let mut nested = Vec::new();
serialize_blobs(
&mut csv,
&report.name,
*lang_type,
&mut nested,
&report.stats.blobs,
)?;
}
}

Ok(String::from_utf8(csv.into_inner()?)?)
}

/// Parses CSV string into Output structure
///
/// Reconstructs hierarchical blob structure from linearized CSV.
///
/// Steps:
/// 1. Parse CSV records, group by file
/// 2. Use `nested` field as navigation path to rebuild blob tree
/// 3. Sort by original order, aggregate statistics
///
/// Example (simplified):
///
/// | File | Language | Nested | Lines |
/// |-----------|----------|-----------------|-------|
/// | README.md | Markdown | "" | 100 |
/// | README.md | Markdown | "Rust" | 50 |
/// | README.md | Markdown | "Rust,Markdown" | 20 |
///
/// becomes:
///
/// ```
/// README.md (Markdown) {
/// stats: 100 lines
/// blobs: {
/// Rust: {
/// stats: 50 lines
/// blobs: { Markdown: { 20 lines } }
/// }
/// }
/// }
/// ```
pub(super) fn from_str(s: &str) -> Result<Output, Box<dyn Error>> {
let mut csv = csv::Reader::from_reader(s.as_bytes());
let mut files: HashMap<PathBuf, (usize, LanguageType, Report)> = HashMap::new();

// Parse CSV records and group by file
for (idx, record) in csv.deserialize::<Record>().enumerate() {
let record = record?;
match files.entry(record.file.clone()) {
Entry::Occupied(mut entry) => {
let (_, _, report) = entry.get_mut();

// Navigate blob tree path, create missing nodes
let stats = record.nested.iter().fold(&mut report.stats, |stats, lang| {
stats.blobs.entry(*lang).or_default()
});
*stats += record.to_code_stats();
}
Entry::Vacant(entry) => {
entry.insert((idx, record.language, record.to_report()));
}
}
}

let mut languages = LanguageMap::new();
let mut totals = Language::new();

// Sort by original order and aggregate
let mut sorted_files: Vec<_> = files.into_values().collect();
sorted_files.sort_unstable_by_key(|(idx, _, _)| *idx);

for (_, lang_type, report) in sorted_files {
totals.add_report(report.clone());
languages.entry(lang_type).or_default().add_report(report);
}

languages.values_mut().for_each(Language::total);
totals.total();

Ok(Output { languages, totals })
}
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down
Loading
Loading