diff --git a/docs/lahman.md b/docs/lahman.md
index 8f219f69..5f094088 100644
--- a/docs/lahman.md
+++ b/docs/lahman.md
@@ -1,10 +1,10 @@
 # Lahman Data Acquisition Functions
 
-Pull data from [Sean Lahman's database](http://www.seanlahman.com/baseball-archive/statistics/), also hosted by [Chadwick Bureau on GitHub](https://github.com/chadwickbureau/baseballdatabank) -- our new source -- using the following functions:
+Pulls data from [Sean Lahman's database](http://seanlahman.com/), now hosted on Dropbox, using the following functions:
 
 ```python
 from pybaseball.lahman import *
-download_lahman() #download the entire lahman database to your current working directory
+download_lahman()  # download the entire Lahman database into the pybaseball cache directory
 
 # a table of all player biographical info and ids
 people = people()
@@ -81,7 +81,7 @@ schools = schools()
 series_post = series_post()
 
 # data on teams by year: record, division, stadium, attendance, etc
-teams = teams()
+teams = teams_core()
 
 # current and historical franchises, whether they're still active, and their ids
 teams_franchises = teams_franchises()
diff --git a/pybaseball/__init__.py b/pybaseball/__init__.py
index cc223855..852c0220 100644
--- a/pybaseball/__init__.py
+++ b/pybaseball/__init__.py
@@ -79,7 +79,6 @@
 from .lahman import schools
 from .lahman import series_post
 from .lahman import teams_core
-from .lahman import teams_upstream
 from .lahman import teams_franchises
 from .lahman import teams_half
 from .lahman import download_lahman
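For context, the documented entry points behave as sketched below after this change. This is a minimal usage example based only on the updated docs/lahman.md above; `teams_core()` replaces the removed `teams()` wrapper.

```python
# Minimal usage sketch based on the updated docs/lahman.md above.
from pybaseball.lahman import download_lahman, people, teams_core

download_lahman()     # fetch and extract the Lahman archive into the pybaseball cache
bios = people()       # player biographical info and ids
teams = teams_core()  # teams by year: record, division, stadium, attendance, etc.
print(bios.shape, teams.shape)
```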
diff --git a/pybaseball/lahman.py b/pybaseball/lahman.py
index 437096eb..0d7c0ab9 100644
--- a/pybaseball/lahman.py
+++ b/pybaseball/lahman.py
@@ -1,136 +1,161 @@
+from datetime import timedelta
 from io import BytesIO
+from os import makedirs
 from os import path
-from typing import Optional
-from zipfile import ZipFile
 
+from bs4 import BeautifulSoup
 import pandas as pd
+from pathlib import Path
+from py7zr import SevenZipFile
 import requests
+from requests_cache import CachedSession
 
 from . import cache
 
-url = "https://github.com/chadwickbureau/baseballdatabank/archive/master.zip"
-base_string = "baseballdatabank-master"
-
-_handle = None
-
-def get_lahman_zip() -> Optional[ZipFile]:
-    # Retrieve the Lahman database zip file, returns None if file already exists in cwd.
-    # If we already have the zip file, keep re-using that.
-    # Making this a function since everything else will be re-using these lines
-    global _handle
-    if path.exists(path.join(cache.config.cache_directory, base_string)):
-        _handle = None
-    elif not _handle:
-        s = requests.get(url, stream=True)
-        _handle = ZipFile(BytesIO(s.content))
-    return _handle
-
-def download_lahman():
-    # download entire lahman db to present working directory
-    z = get_lahman_zip()
-    if z is not None:
-        z.extractall(cache.config.cache_directory)
-        z = get_lahman_zip()
-        # this way we'll now start using the extracted zip directory
-        # instead of the session ZipFile object
-
-def _get_file(tablename: str, quotechar: str = "'") -> pd.DataFrame:
-    z = get_lahman_zip()
-    f = f'{base_string}/{tablename}'
+# NB: the response is cached for 30 days unless force is True
+def _get_response(force: bool = False) -> requests.Response:
+    session = _get_session()
+    response = session.get("http://seanlahman.com", refresh=force)
+    return response
+
+# For example, "https://www.dropbox.com/scl/fi/hy0sxw6gaai7ghemrshi8/lahman_1871-2023_csv.7z?rlkey=edw1u63zzxg48gvpcmr3qpnhz&dl=1"
+def _get_download_url(force: bool = False) -> str:
+    response = _get_response(force)
+    soup = BeautifulSoup(response.content, "html.parser")
+
+    anchor = soup.find("a", string="Comma-delimited version")
+    url = anchor["href"].replace("dl=0", "dl=1")
+
+    return url
+
+def _get_cache_dir() -> str:
+    return f"{cache.config.cache_directory}/lahman"
+
+def _get_session() -> CachedSession:
+    return CachedSession(_get_cache_dir(), expire_after=timedelta(days=30))
+
+def _get_base_string() -> str:
+    # e.g. "lahman_1871-2023_csv"; Path is used only to strip the .7z suffix
+    url = _get_download_url()
+
+    return Path(url).stem
+
+def _get_file_path(filename: str = "") -> str:
+    base_string = _get_base_string()
+    return path.join(_get_cache_dir(), base_string, filename)
+
+def _get_table(filename: str,
+               quotechar: str = "'",
+               encoding=None,
+               on_bad_lines="error") -> pd.DataFrame:
+    filepath = _get_file_path(filename)
     data = pd.read_csv(
-        f"{path.join(cache.config.cache_directory, f)}" if z is None else z.open(f),
+        filepath,
         header=0,
-        sep=',',
-        quotechar=quotechar
+        sep=",",
+        quotechar=quotechar,
+        encoding=encoding,
+        on_bad_lines=on_bad_lines,
     )
     return data
 
+# Returns True if a download happened, False if the cached copy was used
+def download_lahman(force: bool = False) -> bool:
+    if force or not path.exists(_get_file_path()):
+        cache_dir = _get_cache_dir()
+        base_string = _get_base_string()
+        makedirs(f"{cache_dir}/{base_string}", exist_ok=True)
 
-# do this for every table in the lahman db so they can exist as separate functions
-def parks() -> pd.DataFrame:
-    return _get_file('core/Parks.csv')
+        url = _get_download_url(force)
+        stream = requests.get(url, stream=True)
+        with SevenZipFile(BytesIO(stream.content)) as archive:
+            archive.extractall(cache_dir)
+        return True
+    return False
 
+# do this for every table in the lahman db so they can exist as separate functions
 def all_star_full() -> pd.DataFrame:
-    return _get_file("core/AllstarFull.csv")
+    return _get_table("AllstarFull.csv")
 
 def appearances() -> pd.DataFrame:
-    return _get_file("core/Appearances.csv")
+    return _get_table("Appearances.csv")
 
 def awards_managers() -> pd.DataFrame:
-    return _get_file("contrib/AwardsManagers.csv")
+    return _get_table("AwardsManagers.csv")
 
 def awards_players() -> pd.DataFrame:
-    return _get_file("contrib/AwardsPlayers.csv")
+    return _get_table("AwardsPlayers.csv")
 
 def awards_share_managers() -> pd.DataFrame:
-    return _get_file("contrib/AwardsShareManagers.csv")
+    return _get_table("AwardsShareManagers.csv")
 
 def awards_share_players() -> pd.DataFrame:
-    return _get_file("contrib/AwardsSharePlayers.csv")
+    return _get_table("AwardsSharePlayers.csv")
 
 def batting() -> pd.DataFrame:
-    return _get_file("core/Batting.csv")
+    return _get_table("Batting.csv")
 
 def batting_post() -> pd.DataFrame:
-    return _get_file("core/BattingPost.csv")
+    return _get_table("BattingPost.csv")
 
 def college_playing() -> pd.DataFrame:
-    return _get_file("contrib/CollegePlaying.csv")
+    return _get_table("CollegePlaying.csv")
 
 def fielding() -> pd.DataFrame:
-    return _get_file("core/Fielding.csv")
+    return _get_table("Fielding.csv")
 
 def fielding_of() -> pd.DataFrame:
-    return _get_file("core/FieldingOF.csv")
+    return _get_table("FieldingOF.csv")
 
 def fielding_of_split() -> pd.DataFrame:
-    return _get_file("core/FieldingOFsplit.csv")
+    return _get_table("FieldingOFsplit.csv")
 
 def fielding_post() -> pd.DataFrame:
-    return _get_file("core/FieldingPost.csv")
+    return _get_table("FieldingPost.csv")
 
 def hall_of_fame() -> pd.DataFrame:
-    return _get_file("contrib/HallOfFame.csv")
+    return _get_table("HallOfFame.csv")
 
 def home_games() -> pd.DataFrame:
-    return _get_file("core/HomeGames.csv")
+    return _get_table("HomeGames.csv")
 
 def managers() -> pd.DataFrame:
-    return _get_file("core/Managers.csv")
+    return _get_table("Managers.csv")
 
 def managers_half() -> pd.DataFrame:
-    return _get_file("core/ManagersHalf.csv")
+    return _get_table("ManagersHalf.csv")
 
 def master() -> pd.DataFrame:
     # Alias for people -- the new name for master
     return people()
 
+def parks() -> pd.DataFrame:
+    return _get_table("Parks.csv", encoding="unicode_escape")
+
 def people() -> pd.DataFrame:
-    return _get_file("core/People.csv")
+    return _get_table("People.csv", encoding="unicode_escape")
 
 def pitching() -> pd.DataFrame:
-    return _get_file("core/Pitching.csv")
+    return _get_table("Pitching.csv")
 
 def pitching_post() -> pd.DataFrame:
-    return _get_file("core/PitchingPost.csv")
+    return _get_table("PitchingPost.csv")
 
 def salaries() -> pd.DataFrame:
-    return _get_file("contrib/Salaries.csv")
+    return _get_table("Salaries.csv")
 
 def schools() -> pd.DataFrame:
-    return _get_file("contrib/Schools.csv", quotechar='"') # different here bc of doublequotes used in some school names
+    # NB: one line is bad; "brklyncuny" should use double quotes, but doesn't
+    return _get_table("Schools.csv", quotechar='"', on_bad_lines="skip")
 
 def series_post() -> pd.DataFrame:
-    return _get_file("core/SeriesPost.csv")
+    return _get_table("SeriesPost.csv")
 
 def teams_core() -> pd.DataFrame:
-    return _get_file("core/Teams.csv")
-
-def teams_upstream() -> pd.DataFrame:
-    return _get_file("upstream/Teams.csv") # manually maintained file
+    return _get_table("Teams.csv")
 
 def teams_franchises() -> pd.DataFrame:
-    return _get_file("core/TeamsFranchises.csv")
+    return _get_table("TeamsFranchises.csv")
 
 def teams_half() -> pd.DataFrame:
-    return _get_file("core/TeamsHalf.csv")
+    return _get_table("TeamsHalf.csv")
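The rewritten module now has two cache layers: requests_cache keeps the scraped seanlahman.com page for 30 days, and the extracted CSVs persist under the pybaseball cache directory. A hedged sketch of the observable behavior, assuming a fresh cache and network access:

```python
# Hedged sketch of the two-layer caching behavior; assumes a fresh cache and network access.
from pybaseball.lahman import download_lahman, batting

first = download_lahman()             # True: page scraped, 7z downloaded and extracted
second = download_lahman()            # False: extracted CSVs already present, nothing fetched
forced = download_lahman(force=True)  # True: refreshes the page cache and re-downloads

print(batting()["yearID"].max())      # e.g. 2023 for the lahman_1871-2023 release
```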
_get_table("AwardsShareManagers.csv") def awards_share_players() -> pd.DataFrame: - return _get_file("contrib/AwardsSharePlayers.csv") + return _get_table("AwardsSharePlayers.csv") def batting() -> pd.DataFrame: - return _get_file("core/Batting.csv") + return _get_table("Batting.csv") def batting_post() -> pd.DataFrame: - return _get_file("core/BattingPost.csv") + return _get_table("BattingPost.csv") def college_playing() -> pd.DataFrame: - return _get_file("contrib/CollegePlaying.csv") + return _get_table("CollegePlaying.csv") def fielding() -> pd.DataFrame: - return _get_file("core/Fielding.csv") + return _get_table("Fielding.csv") def fielding_of() -> pd.DataFrame: - return _get_file("core/FieldingOF.csv") + return _get_table("FieldingOF.csv") def fielding_of_split() -> pd.DataFrame: - return _get_file("core/FieldingOFsplit.csv") + return _get_table("FieldingOFsplit.csv") def fielding_post() -> pd.DataFrame: - return _get_file("core/FieldingPost.csv") + return _get_table("FieldingPost.csv") def hall_of_fame() -> pd.DataFrame: - return _get_file("contrib/HallOfFame.csv") + return _get_table("HallOfFame.csv") def home_games() -> pd.DataFrame: - return _get_file("core/HomeGames.csv") + return _get_table("HomeGames.csv") def managers() -> pd.DataFrame: - return _get_file("core/Managers.csv") + return _get_table("Managers.csv") def managers_half() -> pd.DataFrame: - return _get_file("core/ManagersHalf.csv") + return _get_table("ManagersHalf.csv") def master() -> pd.DataFrame: # Alias for people -- the new name for master return people() +def parks() -> pd.DataFrame: + return _get_table("Parks.csv", encoding="unicode_escape") + def people() -> pd.DataFrame: - return _get_file("core/People.csv") + return _get_table("People.csv", encoding="unicode_escape") def pitching() -> pd.DataFrame: - return _get_file("core/Pitching.csv") + return _get_table("Pitching.csv") def pitching_post() -> pd.DataFrame: - return _get_file("core/PitchingPost.csv") + return _get_table("PitchingPost.csv") def salaries() -> pd.DataFrame: - return _get_file("contrib/Salaries.csv") + return _get_table("Salaries.csv") def schools() -> pd.DataFrame: - return _get_file("contrib/Schools.csv", quotechar='"') # different here bc of doublequotes used in some school names + # NB: one line is bad; "brklyncuny" should use double quotes, but doesn't + return _get_table("Schools.csv", quotechar='"', on_bad_lines="skip") def series_post() -> pd.DataFrame: - return _get_file("core/SeriesPost.csv") + return _get_table("SeriesPost.csv") def teams_core() -> pd.DataFrame: - return _get_file("core/Teams.csv") - -def teams_upstream() -> pd.DataFrame: - return _get_file("upstream/Teams.csv") # manually maintained file + return _get_table("Teams.csv") def teams_franchises() -> pd.DataFrame: - return _get_file("core/TeamsFranchises.csv") + return _get_table("TeamsFranchises.csv") def teams_half() -> pd.DataFrame: - return _get_file("core/TeamsHalf.csv") + return _get_table("TeamsHalf.csv") diff --git a/setup.py b/setup.py index 2d76831a..d1aacf1d 100644 --- a/setup.py +++ b/setup.py @@ -92,6 +92,8 @@ 'matplotlib>=2.0.0', 'tqdm>=4.50.0', 'attrs>=20.3.0', + 'py7zr>=0.22.0', + 'requests_cache>=1.2.1', ], # List additional groups of dependencies here (e.g. 
diff --git a/tests/pybaseball/conftest.py b/tests/pybaseball/conftest.py
index 6c9a845a..ea8d66d7 100644
--- a/tests/pybaseball/conftest.py
+++ b/tests/pybaseball/conftest.py
@@ -131,83 +131,70 @@ def get_contents(filename: str) -> str:
 
     return get_contents
 
-
 @pytest.fixture()
-def get_data_file_dataframe(data_dir: str) -> GetDataFrameCallable:
+def get_data_file_bytes(data_dir: str) -> Callable[[str], bytes]:
     """
-    Returns a function that will allow getting a dataframe from a csv file in the tests data directory easily
+    Returns a function that will allow getting the contents of a file in the tests data directory easily
     """
-    def get_dataframe(filename: str, parse_dates: _ParseDates = False) -> pd.DataFrame:
+    def get_bytes(filename: str) -> bytes:
         """
-        Get the DatFrame representation of the contents of a csv file in the tests data directory
+        Get the byte contents of a file in the tests data directory
 
         ARGUMENTS:
-        filename : str : the name of the file within the tests data directory to load into a DataFrame
+        filename : str : the name of the file within the tests data directory to get the contents of
         """
-        return pd.read_csv(os.path.join(data_dir, filename), index_col=0, parse_dates=parse_dates).reset_index(drop=True).convert_dtypes(convert_string=False)
-
-    return get_dataframe
+        with open(os.path.join(data_dir, filename), 'rb') as _file:
+            data = _file.read()
+        return data
+    return get_bytes
 
 @pytest.fixture()
-def response_get_monkeypatch(monkeypatch: MonkeyPatch) -> Callable:
+def get_data_file_dataframe(data_dir: str) -> GetDataFrameCallable:
     """
-    Returns a function that will monkeypatch the requests.get function call to return expected data
+    Returns a function that will allow getting a dataframe from a csv file in the tests data directory easily
     """
-    def setup(result: Union[str, bytes], expected_url: Optional[str] = None) -> None:
+    def get_dataframe(filename: str, parse_dates: _ParseDates = False) -> pd.DataFrame:
         """
-        Get the DatFrame representation of the contents of a csv file in the tests data directory
+        Get the DataFrame representation of the contents of a csv file in the tests data directory
 
         ARGUMENTS:
-        result : str : the payload to return in the contents of the request.get call
-        expected_url : str (optional) : an expected_url to test the get call against
-            to ensure the correct endpoint is hit
+        filename : str : the name of the file within the tests data directory to load into a DataFrame
         """
-        def _monkeypatch(url: str, params: Optional[Dict] = None, timeout: Optional[int] = None) -> object:
-            final_url = url
-
-            if params:
-                query_params = urllib.parse.urlencode(params, safe=',')
-                final_url = f"{final_url}?{query_params}"
-
-            if expected_url is not None:
-                # These prints are desired as these are long and get cut off in the test outpute.
-                # These will only render on failed tests, so only when you would want to see them anyway.
-                print("expected", expected_url)
-                print("received", final_url)
-                assert final_url.endswith(expected_url)
-
-            class DummyResponse:
-                def __init__(self, content: Union[str, bytes]):
-                    self.content = content
-                    self.text = content
-                    self.status_code = 200
-                    self.url = final_url
+        return pd.read_csv(os.path.join(data_dir, filename), index_col=0, parse_dates=parse_dates).reset_index(drop=True).convert_dtypes(convert_string=False)
 
-            return DummyResponse(result)
+    return get_dataframe
 
-        monkeypatch.setattr(requests, 'get', _monkeypatch)
-        return setup
+@pytest.fixture()
+def response_get_monkeypatch(monkeypatch: MonkeyPatch) -> Callable:
+    return _get_monkeypatch(monkeypatch, requests)
 
 @pytest.fixture()
 def bref_get_monkeypatch(monkeypatch: MonkeyPatch) -> Callable:
+    return _get_monkeypatch(monkeypatch, BRefSession())
+
+@pytest.fixture()
+def target_get_monkeypatch(monkeypatch: MonkeyPatch, target: str | object) -> Callable:
+    return _get_monkeypatch(monkeypatch, target)
+
+def _get_monkeypatch(monkeypatch: MonkeyPatch, target: str | object) -> Callable:
     """
-    Returns a function that will monkeypatch the BRefSession.get function call to return expected data
+    Returns a function that will monkeypatch the input target's get() function call to return the supplied result.
     """
     def setup(result: Union[str, bytes], expected_url: Optional[str] = None) -> None:
         """
-        Get the DatFrame representation of the contents of a csv file in the tests data directory
+        Get the result when calling the get() function
 
         ARGUMENTS:
-        result : str : the payload to return in the contents of the request.get call
+        result : str | bytes : the payload to return in the contents of the request.get call
         expected_url : str (optional) : an expected_url to test the get call against
             to ensure the correct endpoint is hit
         """
-        def _monkeypatch(url: str, params: Optional[Dict] = None, timeout: Optional[int] = None) -> object:
+        def _monkeypatch(url: str, params: Optional[Dict] = None, stream: bool = False, timeout: Optional[int] = None) -> object:
             final_url = url
 
             if params:
@@ -230,6 +217,6 @@ def __init__(self, content: Union[str, bytes]):
 
         return DummyResponse(result)
 
-    monkeypatch.setattr(BRefSession(), 'get', _monkeypatch)
+    monkeypatch.setattr(target, 'get', _monkeypatch)
 
     return setup
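With the generalized `_get_monkeypatch` helper, a test module only has to override the `target` fixture to choose which object's `get()` gets patched. A condensed sketch of the wiring that `tests/pybaseball/test_lahman.py` (below) uses:

```python
# Condensed sketch of how a test module wires target_get_monkeypatch
# (mirrors tests/pybaseball/test_lahman.py below).
import pytest

from pybaseball.lahman import _get_session

@pytest.fixture(name="target")
def _target():
    return _get_session()  # the CachedSession whose .get() will be patched

def test_scrape(target_get_monkeypatch, sample_html):
    target_get_monkeypatch(sample_html)  # session.get() now returns the canned page
```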
- print("expected", expected_url) - print("received", final_url) - assert final_url.endswith(expected_url) - - class DummyResponse: - def __init__(self, content: Union[str, bytes]): - self.content = content - self.text = content - self.status_code = 200 - self.url = final_url + return pd.read_csv(os.path.join(data_dir, filename), index_col=0, parse_dates=parse_dates).reset_index(drop=True).convert_dtypes(convert_string=False) - return DummyResponse(result) + return get_dataframe - monkeypatch.setattr(requests, 'get', _monkeypatch) - return setup +@pytest.fixture() +def response_get_monkeypatch(monkeypatch: MonkeyPatch) -> Callable: + return _get_monkeypatch(monkeypatch, requests) @pytest.fixture() def bref_get_monkeypatch(monkeypatch: MonkeyPatch) -> Callable: + return _get_monkeypatch(monkeypatch, BRefSession()) + +@pytest.fixture() +def target_get_monkeypatch(monkeypatch: MonkeyPatch, target: str | object) -> Callable: + return _get_monkeypatch(monkeypatch, target) + +def _get_monkeypatch(monkeypatch: MonkeyPatch, target: str | object) -> Callable: """ - Returns a function that will monkeypatch the BRefSession.get function call to return expected data + Returns a function that will monkeypatch the input target's get() function call to return supplied result. """ def setup(result: Union[str, bytes], expected_url: Optional[str] = None) -> None: """ - Get the DatFrame representation of the contents of a csv file in the tests data directory + Get the result when calling the get() function ARGUMENTS: - result : str : the payload to return in the contents of the request.get call + result : str | bytes : the payload to return in the contents of the request.get call expected_url : str (optional) : an expected_url to test the get call against to ensure the correct endpoint is hit """ - def _monkeypatch(url: str, params: Optional[Dict] = None, timeout: Optional[int] = None) -> object: + def _monkeypatch(url: str, params: Optional[Dict] = None, stream = False, timeout: Optional[int] = None) -> object: final_url = url if params: @@ -230,6 +217,6 @@ def __init__(self, content: Union[str, bytes]): return DummyResponse(result) - monkeypatch.setattr(BRefSession(), 'get', _monkeypatch) + monkeypatch.setattr(target, 'get', _monkeypatch) return setup diff --git a/tests/pybaseball/data/lahman.html b/tests/pybaseball/data/lahman.html new file mode 100644 index 00000000..adab564d --- /dev/null +++ b/tests/pybaseball/data/lahman.html @@ -0,0 +1,26 @@ + +SeanLahman.com
+
+Lahman Baseball Database
+Updated version of the Lahman Database is now available.  Follow me on Twitter to be notified of updates.
+Hats off to Bryan Walko for his efforts to produce this annual update.
+
+Download latest version (stats from 1871-2023 seasons)
+
+Files are compressed using 7-zip.  Click here for free download.
+
+The updated version of the database contains complete batting and pitching statistics from 1871 to 2023, plus fielding statistics, standings, team stats, managerial records, post-season data, and more. For more details on the latest release, please read the documentation.
+
+The database can be used on any platform, but please be aware that this is not a standalone application. It is a database that requires Microsoft Access or some other relational database software to be useful.
+
+Journalism
+From 2010-2023, I was a watchdog reporter for the Rochester Democrat and Chronicle, part of the USA Today Network. I wrote about public safety trends and the impact of technology in Rochester and for other Gannett papers across New York state, and participate in national projects with the USA Today investigations team.  Prior to that I was a sports reporter and columnist with the New York Sun.
+Other data projects
\ No newline at end of file
diff --git a/tests/pybaseball/data/lahman_1871-2023_csv.7z b/tests/pybaseball/data/lahman_1871-2023_csv.7z
new file mode 100644
index 00000000..64406750
Binary files /dev/null and b/tests/pybaseball/data/lahman_1871-2023_csv.7z differ
diff --git a/tests/pybaseball/test_lahman.py b/tests/pybaseball/test_lahman.py
new file mode 100644
index 00000000..dc6541d9
--- /dev/null
+++ b/tests/pybaseball/test_lahman.py
@@ -0,0 +1,113 @@
+import tempfile
+import time
+from typing import Callable
+
+import pytest
+from requests_cache import CachedSession
+
+from pybaseball.lahman import *
+from pybaseball.lahman import _get_base_string, _get_download_url, _get_response, _get_session
+
+
+@pytest.fixture(name="sample_html")
+def _sample_html(get_data_file_contents: Callable[[str], str]) -> str:
+    return get_data_file_contents('lahman.html')
+
+@pytest.fixture(name="sample_bytes")
+def _sample_bytes(get_data_file_bytes: Callable[[str], bytes]) -> bytes:
+    return get_data_file_bytes('lahman_1871-2023_csv.7z')
+
+@pytest.fixture(name="target")
+def _target() -> CachedSession:
+    return _get_session()
+
+@pytest.fixture(autouse=True)
+def run_around_tests():
+    # setup: point the pybaseball cache at a throwaway directory
+    tempdir = tempfile.TemporaryDirectory()
+    cache.config.cache_directory = tempdir.name
+    yield
+    # teardown: nothing to do; the TemporaryDirectory cleans itself up
+
+def test_get_lahman_info(target_get_monkeypatch: Callable, sample_html: str):
+    target_get_monkeypatch(sample_html)
+
+    url = _get_download_url()
+    base_string = _get_base_string()
+
+    assert url == "https://www.dropbox.com/scl/fi/hy0sxw6gaai7ghemrshi8/lahman_1871-2023_csv.7z?rlkey=edw1u63zzxg48gvpcmr3qpnhz&dl=1"
+    assert base_string == "lahman_1871-2023_csv"
+
+def test_download_lahman(target_get_monkeypatch: Callable, sample_html: str,
+                         response_get_monkeypatch: Callable, sample_bytes: bytes):
+    target_get_monkeypatch(sample_html)
+    response_get_monkeypatch(sample_bytes)
+
+    # test download
+    b1 = download_lahman()
+    r1 = _get_response()
+    assert b1
+
+    # test download - no force
+    b2 = download_lahman()
+    r2 = _get_response()
+    assert not b2
+    assert r2.created_at == r1.created_at
+    assert r2.expires == r1.expires
+
+    # test download - with force
+    time.sleep(1.1)
+    b3 = download_lahman(force=True)
+    r3 = _get_response(force=True)
+    assert b3
+    assert r3.created_at == r1.created_at
+    assert r3.expires > r1.expires
+
+def test_lahman_tables(target_get_monkeypatch: Callable, sample_html: str,
+                       response_get_monkeypatch: Callable, sample_bytes: bytes):
+    target_get_monkeypatch(sample_html)
+    response_get_monkeypatch(sample_bytes)
+
+    download_lahman()
+
+    # test tables
+    assert not all_star_full().empty
+    assert not appearances().empty
+    assert not awards_managers().empty
+    assert not awards_players().empty
+    assert not awards_share_managers().empty
+    assert not awards_share_players().empty
+    assert not batting().empty
+    assert not batting_post().empty
+    assert not college_playing().empty
+    assert not fielding().empty
+    assert not fielding_of().empty
+    assert not fielding_of_split().empty
+    assert not fielding_post().empty
+    assert not hall_of_fame().empty
+    assert not home_games().empty
+    assert not managers().empty
+    assert not managers_half().empty
+    assert not master().empty
+    assert not parks().empty
+    assert not people().empty
+    assert not pitching().empty
+    assert not pitching_post().empty
+    assert not salaries().empty
+    assert not schools().empty
+    assert not series_post().empty
+    assert not teams_core().empty
+    assert not teams_franchises().empty
+    assert not teams_half().empty
+
+def test_lahman_schools(target_get_monkeypatch: Callable, sample_html: str,
+                        response_get_monkeypatch: Callable, sample_bytes: bytes):
+    target_get_monkeypatch(sample_html)
+    response_get_monkeypatch(sample_bytes)
+
+    download_lahman()
+
+    table = schools()
+    row = table.loc[table['schoolID'] == "ksstmaC"].iloc[0]
+    name = row['name_full']
+    assert name == "St. Mary's College"
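A note on the `schools()` change above: `Schools.csv` ships one row ("brklyncuny") whose comma-containing school name is not double-quoted, so `on_bad_lines="skip"` drops that row instead of failing the whole read. A small illustration, using a hypothetical malformed row of the same shape:

```python
# Why schools() needs on_bad_lines="skip": a row with an unquoted comma
# produces more fields than the header declares. The row below is illustrative.
import io
import pandas as pd

csv = (
    'schoolID,name_full\n'
    '"ksstmaC","St. Mary\'s College"\n'
    'brklyncuny,Brooklyn College, CUNY\n'  # 3 fields vs. 2 columns -> bad line
)
df = pd.read_csv(io.StringIO(csv), quotechar='"', on_bad_lines="skip")
print(df)  # only the well-formed ksstmaC row survives
```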