Skip to content

Commit

Permalink
feat: replaced Beautiful Soup by selectolax to enhance performance (#213)
Browse files Browse the repository at this point in the history
  • Loading branch information
TeKrop authored Nov 9, 2024
1 parent 7e60c1d commit 9916b04
Show file tree
Hide file tree
Showing 16 changed files with 209 additions and 321 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
[![License: MIT](https://img.shields.io/github/license/TeKrop/overfast-api)](https://github.com/TeKrop/overfast-api/blob/master/LICENSE)
![Mockup OverFast API](https://files.tekrop.fr/overfast_api_logo_full_1000.png)

> OverFast API provides comprehensive data on Overwatch 2 heroes, game modes, maps, and player statistics by scraping Blizzard pages. Developed with the efficiency of **FastAPI** and **Beautiful Soup**, it leverages **nginx** as a reverse proxy and **Redis** for caching. Its tailored caching mechanism significantly reduces calls to Blizzard pages, ensuring swift and precise data delivery to users.
> OverFast API provides comprehensive data on Overwatch 2 heroes, game modes, maps, and player statistics by scraping Blizzard pages. Developed with the efficiency of **FastAPI** and **Selectolax**, it leverages **nginx** as a reverse proxy and **Redis** for caching. Its tailored caching mechanism significantly reduces calls to Blizzard pages, ensuring swift and precise data delivery to users.
## Table of contents
* [✨ Live instance](#-live-instance)
Expand Down
8 changes: 6 additions & 2 deletions app/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,12 @@ def overfast_internal_error(url: str, error: Exception) -> HTTPException:
str(error),
)

# If Discord Webhook configuration is enabled, send a message to the
# given channel using Discord Webhook URL
# If we're using a profiler, it means we're debugging, raise the error
# directly in order to have proper backtrace in logs
if settings.profiler:
raise error

# Else, send a message to the given channel using Discord Webhook URL
send_discord_webhook_message(
f"* **URL** : {url}\n"
f"* **Error type** : {type(error).__name__}\n"
Expand Down
112 changes: 45 additions & 67 deletions app/heroes/parsers/hero_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
import re
from typing import ClassVar

from bs4 import Tag
from fastapi import status
from selectolax.lexbor import LexborNode

from app.config import settings
from app.enums import Locale
Expand Down Expand Up @@ -34,40 +34,37 @@ def get_blizzard_url(self, **kwargs) -> str:
def parse_data(self) -> dict:
# We must check if we have the expected section for hero. If not,
# it means the hero hasn't been found and/or released yet.
if not self.root_tag.find("div", class_="abilities-container", recursive=False):
if not (
abilities_section := self.root_tag.css_first("div.abilities-container")
):
raise ParserBlizzardError(
status_code=status.HTTP_404_NOT_FOUND,
message="Hero not found or not released yet",
)

overview_section = self.root_tag.find("blz-page-header", recursive=False)
abilities_section = self.root_tag.find(
"div",
class_="abilities-container",
recursive=False,
)
lore_section = self.root_tag.find("blz-section", class_="lore", recursive=False)
overview_section = self.root_tag.css_first("blz-page-header")
lore_section = self.root_tag.css_first("blz-section.lore")

return {
**self.__get_summary(overview_section),
"abilities": self.__get_abilities(abilities_section),
"story": self.__get_story(lore_section),
}

def __get_summary(self, overview_section: Tag) -> dict:
header_section = overview_section.find("blz-header")
extra_list_items = overview_section.find("blz-list").find_all("blz-list-item")
def __get_summary(self, overview_section: LexborNode) -> dict:
header_section = overview_section.css_first("blz-header")
extra_list_items = overview_section.css_first("blz-list").css("blz-list-item")
birthday, age = self.__get_birthday_and_age(
text=extra_list_items[2].get_text(), locale=self.locale
text=extra_list_items[2].css_first("p").text(), locale=self.locale
)

return {
"name": header_section.find("h2").get_text(),
"description": (
header_section.find("p", slot="description").get_text().strip()
"name": header_section.css_first("h2").text(),
"description": header_section.css_first("p").text().strip(),
"role": get_role_from_icon_url(
extra_list_items[0].css_first("image").attributes.get("href")
),
"role": get_role_from_icon_url(extra_list_items[0].find("image")["href"]),
"location": extra_list_items[1].get_text().strip(),
"location": extra_list_items[1].text().strip(),
"birthday": birthday,
"age": age,
}
Expand Down Expand Up @@ -109,85 +106,75 @@ def __get_birthday_and_age(
return birthday, age

@staticmethod
def __get_abilities(abilities_section: Tag) -> list[dict]:
abilities_list_div = abilities_section.find(
"blz-carousel-section",
recursive=False,
).find("blz-carousel", recursive=False)
def __get_abilities(abilities_section: LexborNode) -> list[dict]:
carousel_section_div = abilities_section.css_first("blz-carousel-section")
abilities_list_div = carousel_section_div.css_first("blz-carousel")

abilities_desc = [
(
desc_div.find("blz-header")
.find("span")
.get_text()
desc_div.css_first("blz-header span")
.text()
.strip()
.replace("\r", "")
.replace("\n", " ")
)
for desc_div in abilities_list_div.find_all("blz-feature")
for desc_div in abilities_list_div.css("blz-feature")
]

abilities_videos = [
{
"thumbnail": video_div["poster"],
"thumbnail": video_div.attributes["poster"],
"link": {
"mp4": video_div["mp4"],
"webm": video_div["webm"],
"mp4": video_div.attributes["mp4"],
"webm": video_div.attributes["webm"],
},
}
for video_div in abilities_section.find(
"blz-carousel-section",
recursive=False,
).find_all("blz-video", recursive=False)
for video_div in carousel_section_div.css("blz-video")
]

return [
{
"name": ability_div["label"],
"name": ability_div.attributes["label"],
"description": abilities_desc[ability_index].strip(),
"icon": ability_div.find("blz-image")["src"],
"icon": ability_div.css_first("blz-image").attributes["src"],
"video": abilities_videos[ability_index],
}
for ability_index, ability_div in enumerate(
abilities_list_div.find("blz-tab-controls").find_all("blz-tab-control"),
abilities_list_div.css_first("blz-tab-controls").css("blz-tab-control"),
)
]

def __get_story(self, lore_section: Tag) -> dict:
showcase_section = lore_section.find("blz-showcase", recursive=False)
def __get_story(self, lore_section: LexborNode) -> dict:
showcase_section = lore_section.css_first("blz-showcase")

return {
"summary": (
showcase_section.find("blz-header")
.find("p")
.get_text()
showcase_section.css_first("blz-header p")
.text()
.strip()
.replace("\n", "")
),
"media": self.__get_media(showcase_section),
"chapters": self.__get_story_chapters(
lore_section.find("blz-accordion-section", recursive=False).find(
"blz-accordion",
recursive=False,
),
lore_section.css_first("blz-accordion-section blz-accordion")
),
}

def __get_media(self, showcase_section: Tag) -> dict | None:
if video := showcase_section.find("blz-video"):
def __get_media(self, showcase_section: LexborNode) -> dict | None:
if video := showcase_section.css_first("blz-video"):
return {
"type": MediaType.VIDEO,
"link": f"https://youtu.be/{video['youtube-id']}",
"link": f"https://youtu.be/{video.attributes['youtube-id']}",
}

if button := showcase_section.find("blz-button"):
if button := showcase_section.css_first("blz-button"):
return {
"type": (
MediaType.SHORT_STORY
if button["analytics-label"] == "short-story"
if button.attributes["analytics-label"] == "short-story"
else MediaType.COMIC
),
"link": self.__get_full_url(button["href"]),
"link": self.__get_full_url(button.attributes["href"]),
}

return None
Expand All @@ -199,33 +186,24 @@ def __get_full_url(url: str) -> str:
return f"{settings.blizzard_host}{url}" if url.startswith("/") else url

@staticmethod
def __get_story_chapters(accordion: Tag) -> list[dict]:
def __get_story_chapters(accordion: LexborNode) -> list[dict]:
chapters_content = [
(
" ".join(
[
paragraph.get_text()
for paragraph in content_container.find_all(["p", "pr"])
],
[paragraph.text() for paragraph in content_container.css("p,pr")],
).strip()
)
for content_container in accordion.find_all(
"div",
slot="content",
recursive=False,
)
for content_container in accordion.css("div[slot=content]")
]
chapters_picture = [
picture["src"] for picture in accordion.find_all("blz-image")
picture.attributes["src"] for picture in accordion.css("blz-image")
]

return [
{
"title": title_span.get_text().capitalize().strip(),
"title": title_span.text().capitalize().strip(),
"content": chapters_content[title_index],
"picture": chapters_picture[title_index],
}
for title_index, title_span in enumerate(
accordion.find_all("span", recursive=False),
)
for title_index, title_span in enumerate(accordion.css("span"))
]
12 changes: 5 additions & 7 deletions app/heroes/parsers/heroes_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,12 @@ def parse_data(self) -> list[dict]:
return sorted(
[
{
"key": hero["data-hero-id"],
"name": hero["hero-name"],
"portrait": hero.find("blz-image")["src"],
"role": hero["data-role"],
"key": hero.attributes["data-hero-id"],
"name": hero.attributes["hero-name"],
"portrait": hero.css_first("blz-image").attributes["src"],
"role": hero.attributes["data-role"],
}
for hero in self.root_tag.find("blz-media-gallery").find_all(
"blz-hero-card",
)
for hero in self.root_tag.css("blz-media-gallery blz-hero-card")
],
key=lambda hero: hero["key"],
)
Expand Down
2 changes: 1 addition & 1 deletion app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ async def lifespan(_: FastAPI): # pragma: no cover
app = FastAPI(title="OverFast API", docs_url=None, redoc_url=None, lifespan=lifespan)
description = f"""OverFast API provides comprehensive data on Overwatch 2 heroes,
game modes, maps, and player statistics by scraping Blizzard pages. Developed with
the efficiency of **FastAPI** and **Beautiful Soup**, it leverages **nginx** as a
the efficiency of **FastAPI** and **Selectolax**, it leverages **nginx** as a
reverse proxy and **Redis** for caching. Its tailored caching mechanism significantly
reduces calls to Blizzard pages, ensuring swift and precise data delivery to users.
Expand Down
21 changes: 6 additions & 15 deletions app/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
from typing import ClassVar

import httpx
from bs4 import BeautifulSoup
from fastapi import status
from selectolax.lexbor import LexborHTMLParser

from .cache_manager import CacheManager
from .config import settings
Expand Down Expand Up @@ -134,24 +134,15 @@ def parse_response_data(self) -> None:


class HTMLParser(APIParser):
@property
def root_tag_params(self) -> dict:
"""Returns the BeautifulSoup params kwargs, used to find the root Tag
on the page which will be used for searching data.
"""
return {"name": "main", "class_": "main-content", "recursive": False}

def store_response_data(self, response: httpx.Response) -> None:
"""Initialize BeautifulSoup object with Blizzard response"""
self.create_bs_tag(response.text)
"""Initialize parser tag with Blizzard response"""
self.create_parser_tag(response.text)

def create_bs_tag(self, html_content: str) -> None:
self.root_tag = BeautifulSoup(html_content, "lxml").body.find(
**self.root_tag_params,
)
def create_parser_tag(self, html_content: str) -> None:
self.root_tag = LexborHTMLParser(html_content).css_first("main")


class JSONParser(APIParser):
def store_response_data(self, response: httpx.Response) -> None:
"""Initialize BeautifulSoup object with Blizzard response"""
"""Initialize object with Blizzard response"""
self.json_data = response.json()
10 changes: 6 additions & 4 deletions app/players/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,11 +118,13 @@ def get_role_key_from_icon(icon_url: str) -> CompetitiveRole:
)


def get_stats_hero_class(hero_classes: list[str]) -> str:
def get_stats_hero_class(hero_class: str) -> str:
"""Extract the specific classname from the classes list for a given hero."""
return next(
classname for classname in hero_classes if classname.startswith("option-")
)
start_index = hero_class.find("option-")
end_index = start_index + len("option-")
while end_index < len(hero_class) and hero_class[end_index].isdigit():
end_index += 1
return hero_class[start_index:end_index]


def get_tier_from_icon(tier_url: str) -> int:
Expand Down
2 changes: 1 addition & 1 deletion app/players/parsers/base_player_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ async def parse(self) -> None:
== self.player_data["summary"]["lastUpdated"]
):
logger.info("Player Cache found and up-to-date, using it")
self.create_bs_tag(player_cache["profile"])
self.create_parser_tag(player_cache["profile"])
self.parse_response_data()
return

Expand Down
Loading

0 comments on commit 9916b04

Please sign in to comment.