38 changes: 38 additions & 0 deletions .github/workflows/ci.yml
@@ -0,0 +1,38 @@
name: CI

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        python-version: ["3.10", "3.11", "3.12"]

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install -r requirements.txt
          python -m pip install -r requirements-dev.txt

      - name: Ruff
        run: ruff check .

      - name: Black
        run: black --check .

      - name: Pytest
        run: pytest -v
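Not part of the PR, but the same three checks can be reproduced locally before pushing. A minimal Python sketch, assuming ruff, black, and pytest from requirements-dev.txt are installed and on PATH:

import subprocess
import sys

# The same commands the CI "test" job runs, in the same order.
CHECKS = [
    ["ruff", "check", "."],
    ["black", "--check", "."],
    ["pytest", "-v"],
]

for cmd in CHECKS:
    print("running:", " ".join(cmd))
    result = subprocess.run(cmd)
    if result.returncode != 0:
        # Stop at the first failing check, mirroring a failed CI step.
        sys.exit(result.returncode)

print("all CI checks passed locally")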
5 changes: 4 additions & 1 deletion examples/scrape_news.py
@@ -82,6 +82,7 @@ def build_client() -> ThordataClient:
# CLI parsing
# -------------------------------------------------------------


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Google News scraper using Thordata SERP API (engine=google_news)",
@@ -172,6 +173,7 @@ def parse_args() -> argparse.Namespace:
# SERP call: engine=google_news
# -------------------------------------------------------------


def search_google_news(
    client: ThordataClient,
    query: str,
@@ -260,6 +262,7 @@ def search_google_news(
# Main
# -------------------------------------------------------------


def main() -> None:
    args = parse_args()
    client = build_client()
@@ -323,4 +326,4 @@ def main() -> None:


if __name__ == "__main__":
    main()
    main()
4 changes: 4 additions & 0 deletions requirements-dev.txt
@@ -0,0 +1,4 @@
pytest>=8.0.0
pytest-httpserver>=1.1.0
ruff>=0.1.0
black>=23.0.0
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,3 +1,3 @@
thordata-sdk>=0.3.0
thordata-sdk>=0.5.0
python-dotenv>=1.0.0
pandas>=2.1.0
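requirements.txt only raises the thordata-sdk floor from 0.3.0 to 0.5.0; the other pins are unchanged. A quick, illustrative way to confirm the installed version locally (standard library only; the distribution name thordata-sdk is taken from requirements.txt):

from importlib.metadata import PackageNotFoundError, version

# Compare the installed thordata-sdk version against the new >=0.5.0 floor.
try:
    print("thordata-sdk", version("thordata-sdk"))
except PackageNotFoundError:
    print("thordata-sdk is not installed; run: python -m pip install -r requirements.txt")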
76 changes: 76 additions & 0 deletions test/test_examples_scrape_news_offline_e2e.py
@@ -0,0 +1,76 @@
import json
import os
import subprocess
import sys
from pathlib import Path
from urllib.parse import parse_qs

from pytest_httpserver import HTTPServer
from werkzeug.wrappers import Request, Response


def test_scrape_news_runs_offline(httpserver: HTTPServer, tmp_path: Path) -> None:
    engines_seen: list[str] = []

    def handler(request: Request) -> Response:
        body = request.get_data(as_text=True) or ""
        form = parse_qs(body)
        engine = (form.get("engine") or [""])[0]
        engines_seen.append(engine)

        payload = {
            "code": 200,
            "news_results": [
                {
                    "title": "Example News",
                    "link": "https://example.com",
                    "source": "ExampleSource",
                    "snippet": "Example snippet",
                    "date": "Today",
                }
            ],
            "organic": [],
        }
        return Response(
            json.dumps(payload), status=200, content_type="application/json"
        )

    httpserver.expect_request("/request", method="POST").respond_with_handler(handler)
    base_url = httpserver.url_for("/").rstrip("/").replace("localhost", "127.0.0.1")

    outfile = tmp_path / "news.csv"

    env = os.environ.copy()
    env["THORDATA_SCRAPER_TOKEN"] = "dummy"
    env["THORDATA_SCRAPERAPI_BASE_URL"] = base_url
    env["PYTHONIOENCODING"] = "utf-8"
    env["PYTHONUTF8"] = "1"
    env["NO_PROXY"] = "127.0.0.1,localhost"
    env["no_proxy"] = env["NO_PROXY"]

    result = subprocess.run(
        [
            sys.executable,
            "examples/scrape_news.py",
            "--query",
            "pizza",
            "--num",
            "1",
            "--gl",
            "us",
            "--hl",
            "en",
            "--outfile",
            str(outfile),
        ],
        env=env,
        capture_output=True,
        text=True,
        encoding="utf-8",
        errors="replace",
        timeout=60,
    )

    assert result.returncode == 0, (result.stdout or "") + "\n" + (result.stderr or "")
    assert "google_news" in engines_seen
    assert outfile.exists()
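The test only asserts that the script exits cleanly, that engine=google_news was sent, and that the outfile exists. As a rough illustration of what the script presumably does with the mocked payload, here is a sketch that flattens news_results into a CSV with pandas (a declared dependency); the real column layout of examples/scrape_news.py is not shown in this diff, so the field-to-column mapping is an assumption:

import pandas as pd

# Same shape as the payload served by the test's mock handler.
payload = {
    "code": 200,
    "news_results": [
        {
            "title": "Example News",
            "link": "https://example.com",
            "source": "ExampleSource",
            "snippet": "Example snippet",
            "date": "Today",
        }
    ],
}

# One row per news item; columns come straight from the result keys.
df = pd.DataFrame(payload["news_results"])
df.to_csv("news.csv", index=False)  # hypothetical path, mirrors --outfile in the test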