# Build data
#
# Workflow file for the run "Build data #595".

# This workflow is started manually (workflow_dispatch) or by the daily
# schedule below; it is not triggered by pushes.
on:
  workflow_dispatch:
  schedule:
    # Once a day at 03:00 (GitHub evaluates cron schedules in UTC).
    - cron: "0 3 * * *"
# All runs of this workflow share the single concurrency group "data".
concurrency: data
name: Build data
jobs:
  # Job 1: regenerate the repository URL lists, produce the matrix consumed by
  # the `combine` job and create the draft release that assets are uploaded to.
  generate-matrix:
    runs-on: ubuntu-latest
    env:
      UV_CACHE_DIR: /tmp/.uv-cache
    outputs:
      matrix: ${{ steps.groups.outputs.matrix }}
      upload_url: ${{ steps.create-release.outputs.upload_url }}
      release_id: ${{ steps.create-release.outputs.id }}
    steps:
      - name: checkout
        uses: actions/checkout@v4
      - name: Set up uv
        # Install latest uv version using the installer
        run: curl -LsSf https://astral.sh/uv/install.sh | sh
      - name: Restore uv cache
        uses: actions/cache@v4
        with:
          path: /tmp/.uv-cache
          key: uv-${{ runner.os }}-${{ hashFiles('uv.lock') }}
          # Fallback used when there is no cache for the exact uv.lock hash.
          # (A restore key equal to the primary key would be redundant.)
          restore-keys: |
            uv-${{ runner.os }}
      - name: Generate token
        id: generate_token
        uses: pypi-data/github-app-token@v1
        with:
          app_id: ${{ secrets.APP_ID }}
          private_key: ${{ secrets.APP_PRIVATE_KEY }}
      # Write the repository URL lists (HTTPS and SSH variants) into links/.
      - env:
          GITHUB_TOKEN: ${{ steps.generate_token.outputs.token }}
        run: |
          uv run pypi-data print-git-urls > links/repositories.txt
          uv run pypi-data print-git-urls --ssh-urls > links/repositories_ssh.txt
      # Commit and push the two regenerated link files.
      - uses: EndBug/add-and-commit@v9
        with:
          add: |
            links/repositories.txt
            links/repositories_ssh.txt
          author_email: "41898282+github-actions[bot]@users.noreply.github.com"
          author_name: "commit-bot"
          message: "Add repository URLs"
          push: true
          fetch: true
          pull: '--rebase --autostash'
      # Produce output/groups.json (exported as the `matrix` output) and the
      # output/groups/ directory (uploaded as the `groups` artifact below).
      - id: groups
        env:
          GITHUB_TOKEN: ${{ steps.generate_token.outputs.token }}
        run: |
          mkdir -p output/
          uv run pypi-data group-index-urls output/
          # `jq -c` keeps the JSON on one line: a `name=value` entry in
          # $GITHUB_OUTPUT must not span several lines.
          echo "matrix=$(jq -c . output/groups.json)" >> "$GITHUB_OUTPUT"
      - uses: actions/upload-artifact@v4
        with:
          name: groups
          path: output/groups/
          retention-days: 1
      # The timestamp is exposed as the step output `tag_name` and used for
      # both the release name and the tag.
      - name: "Set current date as env variable"
        id: version
        run: |
          echo "tag_name=$(date +'%Y-%m-%d-%H-%M')" >> "$GITHUB_OUTPUT"
      # The release stays a draft until the `makepublic` job publishes it.
      - name: Create Release
        id: create-release
        uses: shogo82148/actions-create-release@v1
        with:
          draft: true
          release_name: ${{ steps.version.outputs.tag_name }}
          tag_name: ${{ steps.version.outputs.tag_name }}
          commitish: ${{ github.sha }}
      - name: Release URL
        # Step outputs live under `.outputs`; without it the expression
        # expands to an empty string.
        run: echo "${{ steps.create-release.outputs.html_url }}"
      - name: Minimize uv cache
        run: uv cache prune --ci

  # Job 2: one matrix entry per element of the `matrix` output. Each entry
  # downloads its parquet files, concatenates and rewrites them, and uploads
  # the result to the draft release.
  combine:
    needs: [generate-matrix]
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        index: ${{ fromJson(needs.generate-matrix.outputs.matrix) }}
    env:
      # The matrix value reaches the shell through the environment instead of
      # being pasted into the script text, so it can be quoted safely.
      GROUP_INDEX: ${{ matrix.index }}
    steps:
      # NOTE(review): `@master` is a mutable ref; consider pinning this action
      # to a release tag or commit SHA.
      - name: Maximize build space
        uses: easimon/maximize-build-space@master
        with:
          remove-dotnet: 'true'
          remove-android: 'true'
          remove-haskell: 'true'
          remove-codeql: 'true'
          remove-docker-images: 'true'
      - name: checkout
        uses: actions/checkout@v4
      # Fetch the `groups` artifact uploaded by generate-matrix; the steps
      # below read the file named after the matrix value from it.
      - uses: actions/download-artifact@v4
        with:
          name: groups
      - uses: actions-rust-lang/setup-rust-toolchain@v1
        with:
          cache-on-failure: 'false'
      - name: Install parquet cli from crates.io
        uses: baptiste0928/cargo-install@v3
        with:
          crate: parquet
          features: cli
      # Only prints the entries of the group file to the log.
      - name: Download links
        run: jq -rc '.[]' "$GROUP_INDEX"
      - name: Debug
        run: |
          echo "Links for $GROUP_INDEX"
          jq -rc '.[] | [.name, .id] | @tsv' "$GROUP_INDEX"
      # Each TSV row is "<name>\t<id>": column 1 is handed to wget as the URL
      # and column 2 becomes the local file name.
      - name: Download
        run: |
          mkdir -p input/
          jq -rc '.[] | [.name, .id] | @tsv' "$GROUP_INDEX" | parallel --colsep '\t' wget --tries=2 -nv -O input/{2}.parquet {1}
      - run: ls -la ${{ github.workspace }}/input/
      - name: Combine
        run: parquet-concat ${{ github.workspace }}/merged.parquet ${{ github.workspace }}/input/*.parquet
      - name: Merged size
        run: du -hs ${{ github.workspace }}/merged.parquet
      - name: Rewrite
        run: |
          parquet-rewrite \
          --compression=zstd \
          --writer-version=2.0 \
          --dictionary-enabled=true \
          --statistics-enabled=page \
          --bloom-filter-enabled=true \
          --bloom-filter-fpp 0.2 \
          --input=${{ github.workspace }}/merged.parquet \
          --output=${{ github.workspace }}/output.parquet
      - name: Output size
        run: du -hs ${{ github.workspace }}/output.parquet
      - name: Upload Assets
        uses: shogo82148/actions-upload-release-asset@v1
        with:
          upload_url: ${{ needs.generate-matrix.outputs.upload_url }}
          asset_path: ${{ github.workspace }}/output.parquet
          asset_name: index-${{ matrix.index }}.parquet

  # Job 3: after every matrix entry has finished, publish the draft release
  # and commit the list of asset download URLs.
  makepublic:
    needs: [generate-matrix, combine]
    runs-on: ubuntu-latest
    steps:
      - name: checkout
        uses: actions/checkout@v4
      # NOTE(review): the action ref was garbled to "[email protected]" in the
      # captured copy of this file; it is restored here as
      # publish-release@v1.1.2 - confirm the version against the repository.
      - name: Publish release
        uses: StuYarrow/publish-release@v1.1.2
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          id: ${{ needs.generate-matrix.outputs.release_id }}
      - name: Get download links
        id: get_download_links
        uses: actions/github-script@v7
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          script: |
            // Paginate so that a release with more assets than fit on one API
            // page is listed completely.
            const assets = await github.paginate(github.rest.repos.listReleaseAssets, {
              owner: context.repo.owner,
              repo: context.repo.repo,
              release_id: ${{ needs.generate-matrix.outputs.release_id }},
              per_page: 100,
            });
            // Save the download links to a file
            require('fs').writeFileSync('links/dataset.txt', assets.map(asset => asset.browser_download_url).join('\n'));
      - run: cat links/dataset.txt
      - uses: EndBug/add-and-commit@v9
        with:
          add: 'links/dataset.txt'
          author_email: "41898282+github-actions[bot]@users.noreply.github.com"
          author_name: "commit-bot"
          message: "Add download links for asset ${{ needs.generate-matrix.outputs.release_id }}"
          push: true
          fetch: true
          pull: '--rebase --autostash'