Skip to content

Commit

Permalink
Merge branch 'main' into new-yaml-format
Browse files Browse the repository at this point in the history
  • Loading branch information
pudo committed Jan 24, 2025
2 parents f30a1b3 + ff30220 commit 8d0790a
Show file tree
Hide file tree
Showing 9 changed files with 102 additions and 57 deletions.
4 changes: 3 additions & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,6 @@ tag_name = {new_version}
commit = True
tag = True

[bumpversion:file:setup.py]
[bumpversion:file:pyproject.toml]
search = version = "{current_version}"
replace = version = "{new_version}"
7 changes: 4 additions & 3 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
- name: Install dependencies
run: |
sudo apt-get install -y libicu-dev
python -m pip install --upgrade pip wheel pyicu setuptools
python -m pip install --upgrade pip wheel pyicu
pip install -e ".[dev]"
- name: Generate data file
run: |
Expand All @@ -32,10 +32,11 @@ jobs:
run: |
make test
- name: Build a distribution
if: matrix.python == '3.12'
run: |
python setup.py sdist bdist_wheel
python3 -m build --wheel
- name: Publish a Python distribution to PyPI
if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags')
if: matrix.python == '3.12' && github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags')
uses: pypa/gh-action-pypi-publish@release/v1
with:
skip-existing: true
Expand Down
2 changes: 0 additions & 2 deletions MANIFEST.in

This file was deleted.

2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ all: generate clean test
generate:
python fingerprints/types/check.py
python fingerprints/types/compile.py
black fingerprints/types/data.py
ruff format fingerprints/types/data.py

test:
pytest --cov=fingerprints --cov-report html --cov-report term
Expand Down
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,7 @@ Wikipedia also maintains an index of [types of business entity](https://en.wikip

* [Clustering in Depth](https://github.com/OpenRefine/OpenRefine/wiki/Clustering-In-Depth), part of the OpenRefine documentation discussing how to create collisions in data clustering.
* [probablepeople](https://github.com/datamade/probablepeople), parser for western names made by the brilliant folks at datamade.us.
* The study [Developing a Legal Form Classification and Extraction Approach For Company Entity Matching](https://www.tib-op.org/ojs/index.php/bis/article/view/44) by Kruse et al. (2021) investigates four approaches for identifying and classifying legal forms in company names.
* The list of legal forms published as one of the annexes to the [AnaCredit dataset](https://www.ecb.europa.eu/stats/ecb_statistics/anacredit/html/index.en.html) by the ECB.
* [Transformer-based Entity Legal Form Classification](https://arxiv.org/pdf/2310.12766) by Arimond et al. (2023).

49 changes: 49 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Packaging metadata for the `fingerprints` library, built with hatchling.
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "fingerprints"
# Kept in sync by bump2version (listed in the dev extras below).
version = "1.2.3"
description = "A library to generate entity fingerprints."
readme = "README.md"
license = { file = "LICENSE" }
authors = [{ name = "OpenSanctions", email = "[email protected]" }]
# NOTE(review): the classifiers list Python 3.11 and 3.12 only, while
# requires-python below allows 3.10 -- confirm which one is intended.
classifiers = [
"Intended Audience :: Developers",
"Operating System :: OS Independent",
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
]
requires-python = ">= 3.10"
# The only runtime dependency; everything else lives in the dev extras.
dependencies = ["normality >= 2.5.0, < 3.0.0"]

[project.urls]
Documentation = "https://github.com/opensanctions/fingerprints/"
Repository = "https://github.com/opensanctions/fingerprints.git"
Issues = "https://github.com/opensanctions/fingerprints/issues"

# Installed with `pip install -e ".[dev]"`.
[project.optional-dependencies]
dev = [
"bump2version",
"pyyaml >= 5.0.0, < 7.0.0",
"mypy",
"ruff",
"build",
"pytest",
"pytest-cov",
"types-PyYAML",
"coverage>=4.1",
]

# NOTE(review): this table declares no entry points -- presumably a template
# leftover; confirm whether it can be dropped.
[project.entry-points."babel.extractors"]

# Restrict the source distribution to the package itself plus licence/readme.
[tool.hatch.build.targets.sdist]
only-include = ["fingerprints", "LICENSE", "README.md"]

# NOTE(review): this is a distutils/setuptools option, but the build backend
# above is hatchling -- verify whether this table still has any effect.
[tool.distutils.bdist_wheel]
universal = true

[tool.coverage.run]
# Measure branch coverage in addition to line coverage.
branch = true
5 changes: 0 additions & 5 deletions setup.cfg

This file was deleted.

45 changes: 0 additions & 45 deletions setup.py

This file was deleted.

42 changes: 42 additions & 0 deletions tools/statement_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import sys
import csv
from collections import Counter

from fingerprints.cleanup import clean_name_light
from fingerprints.types import remove_types


def main(file_name):
    """Print the most frequent name tokens of ``Person`` rows in a CSV file.

    The file is read with ``csv.DictReader`` and must provide the columns
    ``prop_type``, ``schema`` and ``value``. Only rows whose ``prop_type`` is
    ``"name"`` and whose ``schema`` is ``"Person"`` are considered. Each
    selected value is passed through ``clean_name_light``, split on single
    spaces, and every token longer than two characters is counted.

    The row index is printed every 100,000 rows as a progress indicator, and
    the 1000 most common ``(token, count)`` tuples are printed at the end.

    Args:
        file_name: Path of the CSV file to analyse.

    Raises:
        KeyError: If a row lacks one of the expected columns.
        OSError: If the file cannot be opened.
    """
    counter = Counter()
    # newline="" is what the csv module requires so that newlines embedded in
    # quoted fields are parsed correctly. The explicit encoding keeps the
    # result independent of the platform locale.
    # NOTE(review): assumes the export is UTF-8 -- confirm against the source.
    with open(file_name, "r", encoding="utf-8", newline="") as fh:
        for idx, row in enumerate(csv.DictReader(fh)):
            if idx % 100_000 == 0:
                print(idx)  # coarse progress indicator
            if row["prop_type"] != "name":
                continue
            # Change this filter (e.g. to "Organization"/"Company") to
            # profile the names of a different kind of entity.
            if row["schema"] != "Person":
                continue
            clean = clean_name_light(row["value"])
            if clean is None:
                continue
            # Very short tokens (initials, particles) are ignored.
            counter.update(
                token for token in clean.split(" ") if len(token) > 2
            )

    for token, count in counter.most_common(1000):
        print((token, count))


if __name__ == "__main__":
    # Fail with a readable usage message instead of a bare IndexError when
    # the CSV path is missing; sys.exit(str) writes to stderr and exits 1.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} <statements.csv>")
    main(sys.argv[1])

0 comments on commit 8d0790a

Please sign in to comment.