From 9916b04a8cd643eba2a9843e5187baa3bd6c31c2 Mon Sep 17 00:00:00 2001 From: Valentin Porchet Date: Sat, 9 Nov 2024 10:44:47 +0100 Subject: [PATCH] feat: replaced Beautiful Soup by selectolax to enhance performance (#213) --- README.md | 2 +- app/helpers.py | 8 +- app/heroes/parsers/hero_parser.py | 112 ++++----- app/heroes/parsers/heroes_parser.py | 12 +- app/main.py | 2 +- app/parsers.py | 21 +- app/players/helpers.py | 10 +- app/players/parsers/base_player_parser.py | 2 +- app/players/parsers/player_career_parser.py | 237 +++++++----------- .../parsers/player_career_stats_parser.py | 2 +- .../parsers/player_stats_summary_parser.py | 2 +- app/roles/parsers/roles_parser.py | 22 +- pyproject.toml | 4 +- .../parsers/test_player_career_parser.py | 37 +-- tests/players/test_players_helpers.py | 4 +- uv.lock | 53 ++-- 16 files changed, 209 insertions(+), 321 deletions(-) diff --git a/README.md b/README.md index 0461f496..866179a4 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ [![License: MIT](https://img.shields.io/github/license/TeKrop/overfast-api)](https://github.com/TeKrop/overfast-api/blob/master/LICENSE) ![Mockup OverFast API](https://files.tekrop.fr/overfast_api_logo_full_1000.png) -> OverFast API provides comprehensive data on Overwatch 2 heroes, game modes, maps, and player statistics by scraping Blizzard pages. Developed with the efficiency of **FastAPI** and **Beautiful Soup**, it leverages **nginx** as a reverse proxy and **Redis** for caching. Its tailored caching mechanism significantly reduces calls to Blizzard pages, ensuring swift and precise data delivery to users. +> OverFast API provides comprehensive data on Overwatch 2 heroes, game modes, maps, and player statistics by scraping Blizzard pages. Developed with the efficiency of **FastAPI** and **Selectolax**, it leverages **nginx** as a reverse proxy and **Redis** for caching. Its tailored caching mechanism significantly reduces calls to Blizzard pages, ensuring swift and precise data delivery to users. ## Table of contents * [✨ Live instance](#-live-instance) diff --git a/app/helpers.py b/app/helpers.py index fa0357a4..b4aa1ab8 100644 --- a/app/helpers.py +++ b/app/helpers.py @@ -54,8 +54,12 @@ def overfast_internal_error(url: str, error: Exception) -> HTTPException: str(error), ) - # If Discord Webhook configuration is enabled, send a message to the - # given channel using Discord Webhook URL + # If we're using a profiler, it means we're debugging, raise the error + # directly in order to have proper backtrace in logs + if settings.profiler: + raise error + + # Else, send a message to the given channel using Discord Webhook URL send_discord_webhook_message( f"* **URL** : {url}\n" f"* **Error type** : {type(error).__name__}\n" diff --git a/app/heroes/parsers/hero_parser.py b/app/heroes/parsers/hero_parser.py index 7df24547..74b04bbf 100644 --- a/app/heroes/parsers/hero_parser.py +++ b/app/heroes/parsers/hero_parser.py @@ -3,8 +3,8 @@ import re from typing import ClassVar -from bs4 import Tag from fastapi import status +from selectolax.lexbor import LexborNode from app.config import settings from app.enums import Locale @@ -34,19 +34,16 @@ def get_blizzard_url(self, **kwargs) -> str: def parse_data(self) -> dict: # We must check if we have the expected section for hero. If not, # it means the hero hasn't been found and/or released yet. - if not self.root_tag.find("div", class_="abilities-container", recursive=False): + if not ( + abilities_section := self.root_tag.css_first("div.abilities-container") + ): raise ParserBlizzardError( status_code=status.HTTP_404_NOT_FOUND, message="Hero not found or not released yet", ) - overview_section = self.root_tag.find("blz-page-header", recursive=False) - abilities_section = self.root_tag.find( - "div", - class_="abilities-container", - recursive=False, - ) - lore_section = self.root_tag.find("blz-section", class_="lore", recursive=False) + overview_section = self.root_tag.css_first("blz-page-header") + lore_section = self.root_tag.css_first("blz-section.lore") return { **self.__get_summary(overview_section), @@ -54,20 +51,20 @@ def parse_data(self) -> dict: "story": self.__get_story(lore_section), } - def __get_summary(self, overview_section: Tag) -> dict: - header_section = overview_section.find("blz-header") - extra_list_items = overview_section.find("blz-list").find_all("blz-list-item") + def __get_summary(self, overview_section: LexborNode) -> dict: + header_section = overview_section.css_first("blz-header") + extra_list_items = overview_section.css_first("blz-list").css("blz-list-item") birthday, age = self.__get_birthday_and_age( - text=extra_list_items[2].get_text(), locale=self.locale + text=extra_list_items[2].css_first("p").text(), locale=self.locale ) return { - "name": header_section.find("h2").get_text(), - "description": ( - header_section.find("p", slot="description").get_text().strip() + "name": header_section.css_first("h2").text(), + "description": header_section.css_first("p").text().strip(), + "role": get_role_from_icon_url( + extra_list_items[0].css_first("image").attributes.get("href") ), - "role": get_role_from_icon_url(extra_list_items[0].find("image")["href"]), - "location": extra_list_items[1].get_text().strip(), + "location": extra_list_items[1].text().strip(), "birthday": birthday, "age": age, } @@ -109,85 +106,75 @@ def __get_birthday_and_age( return birthday, age @staticmethod - def __get_abilities(abilities_section: Tag) -> list[dict]: - abilities_list_div = abilities_section.find( - "blz-carousel-section", - recursive=False, - ).find("blz-carousel", recursive=False) + def __get_abilities(abilities_section: LexborNode) -> list[dict]: + carousel_section_div = abilities_section.css_first("blz-carousel-section") + abilities_list_div = carousel_section_div.css_first("blz-carousel") abilities_desc = [ ( - desc_div.find("blz-header") - .find("span") - .get_text() + desc_div.css_first("blz-header span") + .text() .strip() .replace("\r", "") .replace("\n", " ") ) - for desc_div in abilities_list_div.find_all("blz-feature") + for desc_div in abilities_list_div.css("blz-feature") ] abilities_videos = [ { - "thumbnail": video_div["poster"], + "thumbnail": video_div.attributes["poster"], "link": { - "mp4": video_div["mp4"], - "webm": video_div["webm"], + "mp4": video_div.attributes["mp4"], + "webm": video_div.attributes["webm"], }, } - for video_div in abilities_section.find( - "blz-carousel-section", - recursive=False, - ).find_all("blz-video", recursive=False) + for video_div in carousel_section_div.css("blz-video") ] return [ { - "name": ability_div["label"], + "name": ability_div.attributes["label"], "description": abilities_desc[ability_index].strip(), - "icon": ability_div.find("blz-image")["src"], + "icon": ability_div.css_first("blz-image").attributes["src"], "video": abilities_videos[ability_index], } for ability_index, ability_div in enumerate( - abilities_list_div.find("blz-tab-controls").find_all("blz-tab-control"), + abilities_list_div.css_first("blz-tab-controls").css("blz-tab-control"), ) ] - def __get_story(self, lore_section: Tag) -> dict: - showcase_section = lore_section.find("blz-showcase", recursive=False) + def __get_story(self, lore_section: LexborNode) -> dict: + showcase_section = lore_section.css_first("blz-showcase") return { "summary": ( - showcase_section.find("blz-header") - .find("p") - .get_text() + showcase_section.css_first("blz-header p") + .text() .strip() .replace("\n", "") ), "media": self.__get_media(showcase_section), "chapters": self.__get_story_chapters( - lore_section.find("blz-accordion-section", recursive=False).find( - "blz-accordion", - recursive=False, - ), + lore_section.css_first("blz-accordion-section blz-accordion") ), } - def __get_media(self, showcase_section: Tag) -> dict | None: - if video := showcase_section.find("blz-video"): + def __get_media(self, showcase_section: LexborNode) -> dict | None: + if video := showcase_section.css_first("blz-video"): return { "type": MediaType.VIDEO, - "link": f"https://youtu.be/{video['youtube-id']}", + "link": f"https://youtu.be/{video.attributes['youtube-id']}", } - if button := showcase_section.find("blz-button"): + if button := showcase_section.css_first("blz-button"): return { "type": ( MediaType.SHORT_STORY - if button["analytics-label"] == "short-story" + if button.attributes["analytics-label"] == "short-story" else MediaType.COMIC ), - "link": self.__get_full_url(button["href"]), + "link": self.__get_full_url(button.attributes["href"]), } return None @@ -199,33 +186,24 @@ def __get_full_url(url: str) -> str: return f"{settings.blizzard_host}{url}" if url.startswith("/") else url @staticmethod - def __get_story_chapters(accordion: Tag) -> list[dict]: + def __get_story_chapters(accordion: LexborNode) -> list[dict]: chapters_content = [ ( " ".join( - [ - paragraph.get_text() - for paragraph in content_container.find_all(["p", "pr"]) - ], + [paragraph.text() for paragraph in content_container.css("p,pr")], ).strip() ) - for content_container in accordion.find_all( - "div", - slot="content", - recursive=False, - ) + for content_container in accordion.css("div[slot=content]") ] chapters_picture = [ - picture["src"] for picture in accordion.find_all("blz-image") + picture.attributes["src"] for picture in accordion.css("blz-image") ] return [ { - "title": title_span.get_text().capitalize().strip(), + "title": title_span.text().capitalize().strip(), "content": chapters_content[title_index], "picture": chapters_picture[title_index], } - for title_index, title_span in enumerate( - accordion.find_all("span", recursive=False), - ) + for title_index, title_span in enumerate(accordion.css("span")) ] diff --git a/app/heroes/parsers/heroes_parser.py b/app/heroes/parsers/heroes_parser.py index 75eb61da..279c7169 100644 --- a/app/heroes/parsers/heroes_parser.py +++ b/app/heroes/parsers/heroes_parser.py @@ -13,14 +13,12 @@ def parse_data(self) -> list[dict]: return sorted( [ { - "key": hero["data-hero-id"], - "name": hero["hero-name"], - "portrait": hero.find("blz-image")["src"], - "role": hero["data-role"], + "key": hero.attributes["data-hero-id"], + "name": hero.attributes["hero-name"], + "portrait": hero.css_first("blz-image").attributes["src"], + "role": hero.attributes["data-role"], } - for hero in self.root_tag.find("blz-media-gallery").find_all( - "blz-hero-card", - ) + for hero in self.root_tag.css("blz-media-gallery blz-hero-card") ], key=lambda hero: hero["key"], ) diff --git a/app/main.py b/app/main.py index dc8ed223..393ee426 100644 --- a/app/main.py +++ b/app/main.py @@ -48,7 +48,7 @@ async def lifespan(_: FastAPI): # pragma: no cover app = FastAPI(title="OverFast API", docs_url=None, redoc_url=None, lifespan=lifespan) description = f"""OverFast API provides comprehensive data on Overwatch 2 heroes, game modes, maps, and player statistics by scraping Blizzard pages. Developed with -the efficiency of **FastAPI** and **Beautiful Soup**, it leverages **nginx** as a +the efficiency of **FastAPI** and **Selectolax**, it leverages **nginx** as a reverse proxy and **Redis** for caching. Its tailored caching mechanism significantly reduces calls to Blizzard pages, ensuring swift and precise data delivery to users. diff --git a/app/parsers.py b/app/parsers.py index 3b40c5f8..35463030 100644 --- a/app/parsers.py +++ b/app/parsers.py @@ -2,8 +2,8 @@ from typing import ClassVar import httpx -from bs4 import BeautifulSoup from fastapi import status +from selectolax.lexbor import LexborHTMLParser from .cache_manager import CacheManager from .config import settings @@ -134,24 +134,15 @@ def parse_response_data(self) -> None: class HTMLParser(APIParser): - @property - def root_tag_params(self) -> dict: - """Returns the BeautifulSoup params kwargs, used to find the root Tag - on the page which will be used for searching data. - """ - return {"name": "main", "class_": "main-content", "recursive": False} - def store_response_data(self, response: httpx.Response) -> None: - """Initialize BeautifulSoup object with Blizzard response""" - self.create_bs_tag(response.text) + """Initialize parser tag with Blizzard response""" + self.create_parser_tag(response.text) - def create_bs_tag(self, html_content: str) -> None: - self.root_tag = BeautifulSoup(html_content, "lxml").body.find( - **self.root_tag_params, - ) + def create_parser_tag(self, html_content: str) -> None: + self.root_tag = LexborHTMLParser(html_content).css_first("main") class JSONParser(APIParser): def store_response_data(self, response: httpx.Response) -> None: - """Initialize BeautifulSoup object with Blizzard response""" + """Initialize object with Blizzard response""" self.json_data = response.json() diff --git a/app/players/helpers.py b/app/players/helpers.py index bd5a7cfd..b19c8349 100644 --- a/app/players/helpers.py +++ b/app/players/helpers.py @@ -118,11 +118,13 @@ def get_role_key_from_icon(icon_url: str) -> CompetitiveRole: ) -def get_stats_hero_class(hero_classes: list[str]) -> str: +def get_stats_hero_class(hero_class: str) -> str: """Extract the specific classname from the classes list for a given hero.""" - return next( - classname for classname in hero_classes if classname.startswith("option-") - ) + start_index = hero_class.find("option-") + end_index = start_index + len("option-") + while end_index < len(hero_class) and hero_class[end_index].isdigit(): + end_index += 1 + return hero_class[start_index:end_index] def get_tier_from_icon(tier_url: str) -> int: diff --git a/app/players/parsers/base_player_parser.py b/app/players/parsers/base_player_parser.py index b9774349..28172c22 100644 --- a/app/players/parsers/base_player_parser.py +++ b/app/players/parsers/base_player_parser.py @@ -49,7 +49,7 @@ async def parse(self) -> None: == self.player_data["summary"]["lastUpdated"] ): logger.info("Player Cache found and up-to-date, using it") - self.create_bs_tag(player_cache["profile"]) + self.create_parser_tag(player_cache["profile"]) self.parse_response_data() return diff --git a/app/players/parsers/player_career_parser.py b/app/players/parsers/player_career_parser.py index 37036fc9..e3b0d187 100644 --- a/app/players/parsers/player_career_parser.py +++ b/app/players/parsers/player_career_parser.py @@ -2,8 +2,8 @@ from typing import ClassVar -from bs4 import Tag from fastapi import status +from selectolax.lexbor import LexborNode from app.config import settings from app.exceptions import ParserBlizzardError @@ -144,7 +144,7 @@ def _filter_all_stats_data(self) -> dict: def parse_data(self) -> dict: # We must check if we have the expected section for profile. If not, # it means the player doesn't exist or hasn't been found. - if not self.root_tag.find("blz-section", class_="Profile-masthead"): + if not self.root_tag.css_first("blz-section.Profile-masthead"): raise ParserBlizzardError( status_code=status.HTTP_404_NOT_FOUND, message="Player not found", @@ -157,32 +157,18 @@ def __get_summary(self) -> dict: if self.filters["stats"]: return {} - profile_div = self.root_tag.find( - "blz-section", - class_="Profile-masthead", - recursive=False, - ).find("div", class_="Profile-player", recursive=False) - summary_div = profile_div.find( - "div", - class_="Profile-player--summaryWrapper", - recursive=False, - ) - progression_div = profile_div.find( - "div", - class_="Profile-player--info", - recursive=False, + profile_div = self.root_tag.css_first( + "blz-section.Profile-masthead > div.Profile-player" ) + summary_div = profile_div.css_first("div.Profile-player--summaryWrapper") + progression_div = profile_div.css_first("div.Profile-player--info") return { - "username": str( - summary_div.find("h1", class_="Profile-player--name").contents[0] - ), + "username": summary_div.css_first("h1.Profile-player--name").text(), "avatar": ( - summary_div.find( - "img", - class_="Profile-player--portrait", - recursive=False, - ).get("src") + summary_div.css_first("img.Profile-player--portrait").attributes.get( + "src" + ) ), "namecard": self.__get_namecard_url(), "title": self.__get_title(profile_div), @@ -202,44 +188,35 @@ def __get_last_updated_at_value(self) -> int | None: ) @staticmethod - def __get_title(profile_div: Tag) -> str | None: + def __get_title(profile_div: LexborNode) -> str | None: # We return None is there isn't any player title div - if not ( - title_tag := profile_div.find( - "h2", - class_="Profile-player--title", - recursive=False, - ) - ): + if not (title_tag := profile_div.css_first("h2.Profile-player--title")): return None # Retrieve the title text - title = str(title_tag.contents[0]) or None + title = title_tag.text() or None # Special case : the "no title" means there is no title return get_player_title(title) @staticmethod - def __get_endorsement(progression_div: Tag) -> dict | None: - endorsement_span = progression_div.find( - "span", - class_="Profile-player--endorsementWrapper", - recursive=False, + def __get_endorsement(progression_div: LexborNode) -> dict | None: + endorsement_span = progression_div.css_first( + "span.Profile-player--endorsementWrapper" ) if not endorsement_span: return None - endorsement_frame_url = endorsement_span.find( - "img", - class_="Profile-playerSummary--endorsement", - )["src"] + endorsement_frame_url = endorsement_span.css_first( + "img.Profile-playerSummary--endorsement" + ).attributes["src"] return { "level": get_endorsement_value_from_frame(endorsement_frame_url), "frame": endorsement_frame_url, } - def __get_competitive_ranks(self, progression_div: Tag) -> dict | None: + def __get_competitive_ranks(self, progression_div: LexborNode) -> dict | None: competitive_ranks = { platform.value: self.__get_platform_competitive_ranks( progression_div, @@ -253,18 +230,13 @@ def __get_competitive_ranks(self, progression_div: Tag) -> dict | None: def __get_platform_competitive_ranks( self, - progression_div: Tag, + progression_div: LexborNode, platform_class: str, ) -> dict | None: last_season_played = self.__get_last_season_played(platform_class) - competitive_rank_div = progression_div.select_one( - f"div.Profile-playerSummary--rankWrapper.{platform_class}", - ) - role_wrappers = competitive_rank_div.find_all( - "div", - class_="Profile-playerSummary--roleWrapper", - recursive=False, + role_wrappers = progression_div.css( + f"div.Profile-playerSummary--rankWrapper.{platform_class} > div.Profile-playerSummary--roleWrapper", ) if not role_wrappers and not last_season_played: return None @@ -275,10 +247,11 @@ def __get_platform_competitive_ranks( role_icon = self.__get_role_icon(role_wrapper) role_key = get_role_key_from_icon(role_icon).value - rank_tier_icons = role_wrapper.find_all( - "img", class_="Profile-playerSummary--rank" + rank_tier_icons = role_wrapper.css("img.Profile-playerSummary--rank") + rank_icon, tier_icon = ( + rank_tier_icons[0].attributes["src"], + rank_tier_icons[1].attributes["src"], ) - rank_icon, tier_icon = rank_tier_icons[0]["src"], rank_tier_icons[1]["src"] competitive_ranks[role_key] = { "division": get_division_from_icon(rank_icon).value, @@ -297,33 +270,30 @@ def __get_platform_competitive_ranks( return competitive_ranks def __get_last_season_played(self, platform_class: str) -> int | None: - profile_section = self.root_tag.find( - "div", - class_=platform_class, - recursive=False, - ) - if not profile_section: + if not (profile_section := self.__get_profile_view_section(platform_class)): return None - statistics_section = profile_section.find( - "blz-section", - class_="competitive-view", - recursive=False, + statistics_section = profile_section.css_first( + "blz-section.stats.competitive-view" + ) + last_season_played = statistics_section.attributes.get( + "data-latestherostatrankseasonow2" ) - - last_season_played = statistics_section.get("data-latestherostatrankseasonow2") return int(last_season_played) if last_season_played else None + def __get_profile_view_section(self, platform_class: str) -> LexborNode: + return self.root_tag.css_first(f"div.Profile-view.{platform_class}") + @staticmethod - def __get_role_icon(role_wrapper: Tag) -> str: + def __get_role_icon(role_wrapper: LexborNode) -> str: """The role icon format may differ depending on the platform : img for PC players, svg for console players """ - if role_div := role_wrapper.find("div", class_="Profile-playerSummary--role"): - return role_div.find("img")["src"] + if role_div := role_wrapper.css_first("div.Profile-playerSummary--role"): + return role_div.css_first("img").attributes["src"] - role_svg = role_wrapper.find("svg", class_="Profile-playerSummary--role") - return role_svg.find("use")["xlink:href"] + role_svg = role_wrapper.css_first("svg.Profile-playerSummary--role") + return role_svg.css_first("use").attributes["xlink:href"] def get_stats(self) -> dict | None: # If the user filtered the page on summary, no need to parse the stats @@ -345,9 +315,7 @@ def __get_platform_stats( if self.filters["platform"] and self.filters["platform"] != platform: return None - statistics_section = self.root_tag.find( - "div", class_=platform_class, recursive=False - ) + statistics_section = self.__get_profile_view_section(platform_class) gamemodes_infos = { gamemode.value: self.__get_gamemode_infos(statistics_section, gamemode) for gamemode in PlayerGamemode @@ -359,7 +327,7 @@ def __get_platform_stats( def __get_gamemode_infos( self, - statistics_section: Tag, + statistics_section: LexborNode, gamemode: PlayerGamemode, ) -> dict | None: # If the user decided to filter on another gamemode, stop here @@ -369,77 +337,53 @@ def __get_gamemode_infos( if not statistics_section: return None - top_heroes_section = statistics_section.find( - "blz-section", - class_="Profile-heroSummary", - recursive=False, - ).find( - "div", - class_=gamemodes_div_mapping[gamemode], - recursive=False, + top_heroes_section = statistics_section.first_child.css_first( + f"div.{gamemodes_div_mapping[gamemode]}" ) # Check if we can find a select in the section. If not, it means there is # no data to show for this gamemode and platform, return nothing. - if not top_heroes_section.find("select"): + if not top_heroes_section.css_first("select"): return None - career_stats_section = statistics_section.find( - "blz-section", - class_=gamemodes_div_mapping[gamemode], - recursive=False, + career_stats_section = statistics_section.css_first( + f"blz-section.{gamemodes_div_mapping[gamemode]}" ) - return { "heroes_comparisons": self.__get_heroes_comparisons(top_heroes_section), "career_stats": self.__get_career_stats(career_stats_section), } - def __get_heroes_comparisons(self, top_heroes_section: Tag) -> dict: - categories = { - option["value"]: option["option-id"] - for option in ( - top_heroes_section.find( - "div", - class_="Profile-heroSummary--header", - recursive=False, - ) - .find("select", recursive=False) - .children - ) - if option.get("option-id") - } + def __get_heroes_comparisons(self, top_heroes_section: LexborNode) -> dict: + categories = self.__get_heroes_options(top_heroes_section) heroes_comparisons = { string_to_snakecase( - get_real_category_name(categories[category["data-category-id"]]), + get_real_category_name( + categories[category.attributes["data-category-id"]] + ), ): { "label": get_real_category_name( - categories[category["data-category-id"]], + categories[category.attributes["data-category-id"]], ), "values": [ { - # First div is "Profile-progressBar--bar" - "hero": progress_bar_container.contents[0]["data-hero-id"], - # Second div is "Profile-progressBar--textWrapper" + "hero": progress_bar_container.first_child.attributes[ + "data-hero-id" + ], "value": get_computed_stat_value( - # Second div is "Profile-progressBar-description" - str( - progress_bar_container.contents[1] - .contents[1] - .contents[0] - ), + progress_bar_container.last_child.last_child.text() ), } - for progress_bar in category.children - for progress_bar_container in progress_bar.children - if progress_bar_container.name == "div" + for progress_bar in category.iter() + for progress_bar_container in progress_bar.iter() + if progress_bar_container.tag == "div" ], } - for category in top_heroes_section.children + for category in top_heroes_section.iter() if ( - "Profile-progressBars" in category["class"] - and category["data-category-id"] in categories + "Profile-progressBars" in category.attributes["class"] + and category.attributes["data-category-id"] in categories ) } @@ -454,30 +398,19 @@ def __get_heroes_comparisons(self, top_heroes_section: Tag) -> dict: return heroes_comparisons - @staticmethod - def __get_career_stats(career_stats_section: Tag) -> dict: - heroes_options = { - f"option-{option['value']}": option["option-id"] - for option in ( - career_stats_section.find( - "div", - class_="Profile-heroSummary--header", - recursive=False, - ) - .find("select", recursive=False) - .children - ) - if option.get("option-id") - } + def __get_career_stats(self, career_stats_section: LexborNode) -> dict: + heroes_options = self.__get_heroes_options( + career_stats_section, key_prefix="option-" + ) career_stats = {} - for hero_container in career_stats_section.children: + for hero_container in career_stats_section.iter(): # Hero container should be span with "stats-container" class - if hero_container.name != "span": + if hero_container.tag != "span": continue - stats_hero_class = get_stats_hero_class(hero_container["class"]) + stats_hero_class = get_stats_hero_class(hero_container.attributes["class"]) # Sometimes, Blizzard makes some weird things and options don't # have any label, so we can't know for sure which hero it is about. @@ -489,12 +422,12 @@ def __get_career_stats(career_stats_section: Tag) -> dict: career_stats[hero_key] = [] # Hero container children are div with "category" class - for card_stat in hero_container.children: + for card_stat in hero_container.iter(): # Content div should be the only child ("content" class) - content_div = card_stat.contents[0] + content_div = card_stat.first_child # Label should be the first div within content ("header" class) - category_label = str(content_div.contents[0].contents[0].contents[0]) + category_label = content_div.first_child.first_child.text() career_stats[hero_key].append( { @@ -503,17 +436,17 @@ def __get_career_stats(career_stats_section: Tag) -> dict: "stats": [], }, ) - for stat_row in content_div.children: - if "stat-item" not in stat_row["class"]: + for stat_row in content_div.iter(): + if "stat-item" not in stat_row.attributes["class"]: continue - stat_name = str(stat_row.contents[0].contents[0]) + stat_name = stat_row.first_child.text() career_stats[hero_key][-1]["stats"].append( { "key": get_plural_stat_key(string_to_snakecase(stat_name)), "label": stat_name, "value": get_computed_stat_value( - str(stat_row.contents[1].contents[0]), + stat_row.last_child.text() ), }, ) @@ -525,3 +458,17 @@ def __get_career_stats(career_stats_section: Tag) -> dict: del career_stats[hero_key] return career_stats + + @staticmethod + def __get_heroes_options( + parent_section: LexborNode, key_prefix: str = "" + ) -> dict[str, str]: + return { + f"{key_prefix}{option.attributes['value']}": option.attributes["option-id"] + for option in ( + parent_section.css_first( + "div.Profile-heroSummary--header > select" + ).iter() + ) + if option.attributes.get("option-id") + } diff --git a/app/players/parsers/player_career_stats_parser.py b/app/players/parsers/player_career_stats_parser.py index 44a9274b..a75a6345 100644 --- a/app/players/parsers/player_career_stats_parser.py +++ b/app/players/parsers/player_career_stats_parser.py @@ -16,7 +16,7 @@ def filter_request_using_query(self, **_) -> dict: def parse_data(self) -> dict | None: # We must check if we have the expected section for profile. If not, # it means the player doesn't exist or hasn't been found. - if not self.root_tag.find("blz-section", class_="Profile-masthead"): + if not self.root_tag.css_first("blz-section.Profile-masthead"): raise ParserBlizzardError( status_code=status.HTTP_404_NOT_FOUND, message="Player not found", diff --git a/app/players/parsers/player_stats_summary_parser.py b/app/players/parsers/player_stats_summary_parser.py index 8543bc9a..6ae0dd43 100644 --- a/app/players/parsers/player_stats_summary_parser.py +++ b/app/players/parsers/player_stats_summary_parser.py @@ -75,7 +75,7 @@ def filter_request_using_query(self, **kwargs) -> dict: def parse_data(self) -> dict | None: # We must check if we have the expected section for profile. If not, # it means the player doesn't exist or hasn't been found. - if not self.root_tag.find("blz-section", class_="Profile-masthead"): + if not self.root_tag.css_first("blz-section.Profile-masthead"): raise ParserBlizzardError( status_code=status.HTTP_404_NOT_FOUND, message="Player not found", diff --git a/app/roles/parsers/roles_parser.py b/app/roles/parsers/roles_parser.py index d0532bda..0cb9727a 100644 --- a/app/roles/parsers/roles_parser.py +++ b/app/roles/parsers/roles_parser.py @@ -12,29 +12,25 @@ class RolesParser(HTMLParser): root_path = settings.home_path def parse_data(self) -> list[dict]: - roles_container = self.root_tag.find( - "div", - class_="homepage-features-heroes", - recursive=False, - ).find("blz-feature-carousel-section", recursive=False) + roles_container = self.root_tag.css_first( + "div.homepage-features-heroes blz-feature-carousel-section" + ) roles_icons = [ - role_icon_div.find("blz-image")["src"] - for role_icon_div in roles_container.find("blz-tab-controls").find_all( - "blz-tab-control", + role_icon_div.css_first("blz-image").attributes["src"] + for role_icon_div in roles_container.css_first("blz-tab-controls").css( + "blz-tab-control" ) ] return [ { "key": get_role_from_icon_url(roles_icons[role_index]), - "name": role_div.find("blz-header").find("h3").get_text().capitalize(), + "name": role_div.css_first("blz-header h3").text().capitalize(), "icon": roles_icons[role_index], - "description": ( - role_div.find("blz-header").find("div").get_text().strip() - ), + "description": (role_div.css_first("blz-header div").text().strip()), } for role_index, role_div in list( - enumerate(roles_container.find_all("blz-feature")), + enumerate(roles_container.css("blz-feature")) )[:3] ] diff --git a/pyproject.toml b/pyproject.toml index a00cb3ec..5306bf2c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "overfast-api" -version = "3.2.1" +version = "3.3.0" description = "Overwatch API giving data about heroes, maps, and players statistics." license = {file = "LICENSE"} authors = [ @@ -9,7 +9,6 @@ authors = [ readme = "README.md" requires-python = ">=3.12" dependencies = [ - "beautifulsoup4==4.12.*", "fastapi[standard]==0.115.*", "httpx[http2]==0.27.*", "loguru==0.7.*", @@ -17,6 +16,7 @@ dependencies = [ "redis==5.2.*", "pydantic==2.9.*", "pydantic-settings==2.6.*", + "selectolax==0.3.*", ] [project.urls] diff --git a/tests/players/parsers/test_player_career_parser.py b/tests/players/parsers/test_player_career_parser.py index 1e28edbe..e3823a36 100644 --- a/tests/players/parsers/test_player_career_parser.py +++ b/tests/players/parsers/test_player_career_parser.py @@ -165,7 +165,7 @@ async def test_player_career_parser_parsing_error_attribute_error( assert ( error.value.message - == "AttributeError(\"'NoneType' object has no attribute 'find'\")" + == "AttributeError(\"'NoneType' object has no attribute 'css_first'\")" ) @@ -201,38 +201,3 @@ async def test_player_career_parser_parsing_error_key_error( await player_career_parser.parse() assert error.value.message == "KeyError('src')" - - -@pytest.mark.parametrize( - ("player_career_parser", "player_html_data"), - [("TeKrop-2217", "TeKrop-2217")], - indirect=["player_career_parser", "player_html_data"], -) -@pytest.mark.asyncio -async def test_player_career_parser_parsing_error_type_error( - player_career_parser: PlayerCareerParser, - player_html_data: str, - player_search_response_mock: Mock, -): - player_type_error = player_html_data.replace( - 'class="Profile-playerSummary--endorsement"', - "", - ) - - with ( - patch( - "httpx.AsyncClient.get", - side_effect=[ - # Players search call first - player_search_response_mock, - # Player profile page - Mock(status_code=status.HTTP_200_OK, text=player_type_error), - ], - ), - pytest.raises(ParserParsingError) as error, - ): - await player_career_parser.parse() - - assert ( - error.value.message == "TypeError(\"'NoneType' object is not subscriptable\")" - ) diff --git a/tests/players/test_players_helpers.py b/tests/players/test_players_helpers.py index ad97894c..6555745b 100644 --- a/tests/players/test_players_helpers.py +++ b/tests/players/test_players_helpers.py @@ -167,8 +167,8 @@ def test_get_role_key_from_icon(icon_url: str, role: CompetitiveRole): @pytest.mark.parametrize( ("hero_classes", "result"), [ - (["stats-container", "option-0", "is-active"], "option-0"), - (["stats-container", "option-1"], "option-1"), + ("stats-container option-0 is-active", "option-0"), + ("stats-container option-1", "option-1"), ], ) def test_get_stats_hero_class(hero_classes: list[str], result: str): diff --git a/uv.lock b/uv.lock index bfdbf18a..86ec2b22 100644 --- a/uv.lock +++ b/uv.lock @@ -39,18 +39,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/45/86/4736ac618d82a20d87d2f92ae19441ebc7ac9e7a581d7e58bbe79233b24a/asttokens-2.4.1-py2.py3-none-any.whl", hash = "sha256:051ed49c3dcae8913ea7cd08e46a606dba30b79993209636c4875bc1d637bc24", size = 27764 }, ] -[[package]] -name = "beautifulsoup4" -version = "4.12.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "soupsieve" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b3/ca/824b1195773ce6166d388573fc106ce56d4a805bd7427b624e063596ec58/beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051", size = 581181 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b1/fe/e8c672695b37eecc5cbf43e1d0638d88d66ba3a44c4d321c796f4e59167f/beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed", size = 147925 }, -] - [[package]] name = "certifi" version = "2024.7.4" @@ -610,7 +598,6 @@ name = "overfast-api" version = "3.2.1" source = { virtual = "." } dependencies = [ - { name = "beautifulsoup4" }, { name = "fastapi", extra = ["standard"] }, { name = "httpx", extra = ["http2"] }, { name = "loguru" }, @@ -618,6 +605,7 @@ dependencies = [ { name = "pydantic" }, { name = "pydantic-settings" }, { name = "redis" }, + { name = "selectolax" }, ] [package.dependency-groups] @@ -638,7 +626,6 @@ dev = [ [package.metadata] requires-dist = [ - { name = "beautifulsoup4", specifier = "==4.12.*" }, { name = "fastapi", extras = ["standard"], specifier = "==0.115.*" }, { name = "httpx", extras = ["http2"], specifier = "==0.27.*" }, { name = "loguru", specifier = "==0.7.*" }, @@ -646,6 +633,7 @@ requires-dist = [ { name = "pydantic", specifier = "==2.9.*" }, { name = "pydantic-settings", specifier = "==2.6.*" }, { name = "redis", specifier = "==5.2.*" }, + { name = "selectolax", specifier = "==0.3.*" }, ] [package.metadata.dependency-groups] @@ -1013,6 +1001,34 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/79/7b/884553415e9f0a9bf358ed52fb68b934e67ef6c5a62397ace924a1afdf9a/ruff-0.7.1-py3-none-win_arm64.whl", hash = "sha256:19aa200ec824c0f36d0c9114c8ec0087082021732979a359d6f3c390a6ff2a37", size = 8717402 }, ] +[[package]] +name = "selectolax" +version = "0.3.25" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/26/f2/e498a6b3723da1ed1bda39165f9ef436d2f6eaf39435ed3145bccdab2a98/selectolax-0.3.25.tar.gz", hash = "sha256:55aee394fe9d69c81d2c6dd246fc21a822aa8d030e3d0dc1d92f2e8fc68b0f5a", size = 3610575 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/45/2c/cce2bcc9b1455a2472af072e7be43714269bec8be43ff576bd01b9dd8ce5/selectolax-0.3.25-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:5aede75c36d30e0a9bfff83e125c092c393b1a4c037f7110563e5c88d0b5dc3a", size = 5850901 }, + { url = "https://files.pythonhosted.org/packages/a1/9c/a8edbb57a4a2c73e56142fbc896c7f9de1ff527f9609f9dd620afaf5362a/selectolax-0.3.25-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:92db0b937378d739e005042ac00b391e9617e2b5dd64b296e6a435a1080f1283", size = 3162812 }, + { url = "https://files.pythonhosted.org/packages/3d/f1/065c06c7b3d0de0a5dcaa13b0b184c8e8d728a950a2e15ffda9bfd5e17e4/selectolax-0.3.25-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:957b45f269b917d313a36619443edfea5a57fee608856a1be6599b6f7ce267c8", size = 7656632 }, + { url = "https://files.pythonhosted.org/packages/fa/de/f18bef7159fcc9108cc7cc9a7d7267cc497e6c27a633f16424d00a3a7e13/selectolax-0.3.25-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d0343ec3dc01ced50a431d179d1fc95ed2fe4bcc9face188f87188e0ce1b2950", size = 7698864 }, + { url = "https://files.pythonhosted.org/packages/5b/30/8a7eccc04950b2de9cbe83053efbde3fe3d616dc221493491f8a3c69311c/selectolax-0.3.25-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:944f02184690f06d78a716ec4495f932303c1f49bd01a6c6bc05cd1f22469fa5", size = 7084492 }, + { url = "https://files.pythonhosted.org/packages/57/6a/883d1d000d15707f4ac079e4446a19ccb3ea88f55ee0eebff310c17250d6/selectolax-0.3.25-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:c95b8a1d51bd7bfb78fdccb06b70110b4da86c9175058f9918de5d4c7a652ad7", size = 7495514 }, + { url = "https://files.pythonhosted.org/packages/45/fe/17f6cef5a1267ade1a20607bb4d173704e3bbbe9a0e7088f130eee28d4d4/selectolax-0.3.25-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:1940a551491ac0df39adc02edb17399badbc38d6a76e67e30007ed1b9379a921", size = 7086332 }, + { url = "https://files.pythonhosted.org/packages/84/e4/390220deb87d057c4e87d0cd84187db798b50a5251fe300b38a070dcc911/selectolax-0.3.25-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3e6babbd266db0cb720bb60e9c936c8fe2e6b44401acaf0c7d41c06bee4f7289", size = 7604787 }, + { url = "https://files.pythonhosted.org/packages/c3/03/318083e04d2efe3840a85c11e60231ae213f7a31b9ae51c807f72ad2d92c/selectolax-0.3.25-cp312-cp312-win32.whl", hash = "sha256:50534cd3f49fa88b1fccb98dd56ef3cbbdca04a4fd57744e698f03879e563e3b", size = 2335417 }, + { url = "https://files.pythonhosted.org/packages/de/87/c278f20a6425ba3a4f4186806af8e7bdcb679efe8fa3a24c3b27281f274d/selectolax-0.3.25-cp312-cp312-win_amd64.whl", hash = "sha256:e0b7b66be57d644b978a696cb133275d8d53d83a59a62567aacfeb121416171d", size = 2483965 }, + { url = "https://files.pythonhosted.org/packages/1a/4d/b83484798cb1ada24c0956e456ddbd510e20324eedb6991ec47c28618cf4/selectolax-0.3.25-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:d0ac08f8e9eee4a33cfe8080cc52f6dbf73f640e95ca24c36e55ef50dd0be759", size = 5848591 }, + { url = "https://files.pythonhosted.org/packages/13/1b/2ecf54dddae8e517bcef20db627f177ed2cfa70f76f161cca58b0c9a0e29/selectolax-0.3.25-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:d4bba7a6dfbd0c5b7eec2a1ab5423b3d3d013a7d12fba52275c5e71d9debc0dd", size = 3160818 }, + { url = "https://files.pythonhosted.org/packages/a9/12/51a76c6bd97194340f8c41799746bcb78b81fa39bf04fae1d5c7cf36c7fb/selectolax-0.3.25-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c692d008a3998fd7f8e0574e1483893aa9fc4713594dbd4b055c8d65c59cebe9", size = 7628568 }, + { url = "https://files.pythonhosted.org/packages/87/76/c17d6a8fa64b35a2e7b932b2ebe32735b25012c4b3951b38b536a72261e6/selectolax-0.3.25-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:be987250e7f87ef021b390ea158e34f0fff343e445440b7810d33bc4d2b58d44", size = 7670508 }, + { url = "https://files.pythonhosted.org/packages/4f/91/cebefcb56ea17603292783f2b8dd66681833510fbb48c7073d55d94d55d5/selectolax-0.3.25-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2957587198ee3909783f5bcf8ded4f2254de1297dcb96fb623661cfe8d79734c", size = 7057731 }, + { url = "https://files.pythonhosted.org/packages/a1/a1/dbf4a29afb3bee21e0057dc76856825e5790c5d875f4cfe7b46b65d3fa4c/selectolax-0.3.25-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:2846a1ecd4733db7c71f973370d639c6ca104390252241db3f94f69f16aaa9b8", size = 7440039 }, + { url = "https://files.pythonhosted.org/packages/e8/85/4f63f207d3943f3495994f0fecb8c6282c0146568293cdaa88ff4c70a570/selectolax-0.3.25-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:d56ebb0ffbf3680290a65aaeb57059533669f59002dcaa77c25688559cd6d8d6", size = 7011379 }, + { url = "https://files.pythonhosted.org/packages/c6/ca/8c4719701fa3d9b9b7b6d47dc92e2f757d2e6cca156c9eba289b36be878e/selectolax-0.3.25-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:48895dd8bb664e534b1955cf8bc32574ba96566cd92fe143da4cdf64c2f91756", size = 7542571 }, + { url = "https://files.pythonhosted.org/packages/01/e3/f9bd51b501b9506f338e6dab051cf28cbea3548ca42f9a2ceda604474dd7/selectolax-0.3.25-cp313-cp313-win32.whl", hash = "sha256:2cf7920130d87243114194b41a343aa7970f7efd830e60bad8199cfc7d48381b", size = 2335018 }, + { url = "https://files.pythonhosted.org/packages/0e/a7/6e1d8110c8750cf3a0c5346e95add856fe8a927c0bc044840f3ab30bc7a8/selectolax-0.3.25-cp313-cp313-win_amd64.whl", hash = "sha256:b83a3580a3af4d0a94f9c01d2164ecb5f56925630e2914cddf0f1b8873bbf9f9", size = 2484258 }, +] + [[package]] name = "shellingham" version = "1.5.4" @@ -1049,15 +1065,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/32/46/9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621ce/sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0", size = 29575 }, ] -[[package]] -name = "soupsieve" -version = "2.6" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d7/ce/fbaeed4f9fb8b2daa961f90591662df6a86c1abf25c548329a86920aedfb/soupsieve-2.6.tar.gz", hash = "sha256:e2e68417777af359ec65daac1057404a3c8a5455bb8abc36f1a9866ab1a51abb", size = 101569 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d1/c2/fe97d779f3ef3b15f05c94a2f1e3d21732574ed441687474db9d342a7315/soupsieve-2.6-py3-none-any.whl", hash = "sha256:e72c4ff06e4fb6e4b5a9f0f55fe6e81514581fca1515028625d0f299c602ccc9", size = 36186 }, -] - [[package]] name = "stack-data" version = "0.6.3"