## TLAiBench Python — GitHub Actions workflow (run #8).
## NOTE(review): removed GitHub web-UI chrome ("Skip to content", run header,
## "Workflow file for this run") that was pasted in along with the file; it is
## not valid YAML and carried no workflow content.
name: TLAiBench Python

## Manually triggered benchmark run. All inputs are surfaced in the
## workflow_dispatch UI; `model` is a fixed choice list, the rest are free-form.
on:
  workflow_dispatch:
    inputs:
      model:
        description: 'Model to use'
        required: true
        type: choice
        default: 'github/gpt-4.1'
        options:
          - 'github/gpt-4.1'
          - 'github/Llama-3.2-11B-Vision-Instruct'
          - 'bedrock/anthropic.claude-sonnet-4-20250514-v1:0'
      model_id:
        description: 'Model ID (required for AWS Bedrock)'
        required: false
        type: string
        default: ''
      puzzle_files:
        description: 'Puzzle files to process (space-separated)'
        required: false
        type: string
        default: '../puzzles/DieHard.md'
      debug:
        description: 'Enable debug logging'
        required: false
        type: boolean
        default: false
## Least-privilege token: read-only except for the OIDC token needed
## to assume the AWS role below.
permissions:
  models: read
  id-token: write # required for OIDC
  contents: read
jobs:
  benchmarks:
    name: TLAiBench Python
    runs-on: ubuntu-latest
    defaults:
      run:
        shell: bash
    steps:
      - name: Clone repo
        uses: actions/checkout@v4
        with:
          ## All history for git diff below to succeed.
          fetch-depth: 0
      - name: Configure AWS credentials (OIDC)
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: arn:aws:iam::024871859028:role/GitHubActionsRole
          aws-region: us-east-1
## Provision VSCode
##
## Note: VSCode is being used to host the TLA+ MCP servers. This choice is a a workaround, as VSCode is fundamentally a desktop/UI application,
## which introduces the complications below. I considered using code-server (https://github.com/coder/code-server), the headless version of VSCode,
## as an alternative. However, I couldn't get it to automatically launch the TLA+ extension and its associated MCP servers. Another potential
## approach would be to refactor the TLA+ VSCode extension to allow the MCP servers to be launched independently—either as standalone processes or
## via GenAIScript.
- name: Install VSCode
run: |
sudo apt-get update
sudo apt-get install --no-install-recommends -y apt-transport-https
curl https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor > microsoft.gpg
sudo mv microsoft.gpg /etc/apt/trusted.gpg.d/microsoft.gpg
echo "deb [arch=amd64] https://packages.microsoft.com/repos/vscode stable main" | sudo tee /etc/apt/sources.list.d/vscode.list
sudo apt-get update
sudo apt-get install --no-install-recommends -y code xvfb
- name: Launch TLA+ VSCode extension hosting our MCP servers
run: |
## Install TLA+ VSCode extension (doesn't require X11).
code --install-extension tlaplus.vscode-ide
## Create X11 server to run VSCode.
export XDG_RUNTIME_DIR=/run/user/$(id -u)
export DBUS_SESSION_BUS_ADDRESS=unix:path=$XDG_RUNTIME_DIR/bus
dbus-daemon --session --address=$DBUS_SESSION_BUS_ADDRESS --nofork --nopidfile --syslog-only &
mkdir -p ~/.vscode && echo '{ "disable-hardware-acceleration": true }' > ~/.vscode/argv.json
/usr/bin/Xvfb :99 -screen 0 1024x768x24 > /dev/null 2>&1 &
## Setting tlaplus.mcp.port and disabling workspace trust are the magic sauce that activates the TLA+ MCP Servers at startup.
mkdir -p ~/.vscode && echo '{ "tlaplus.mcp.port": 59071, "tlaplus.mcp.enableFilesystemTools": true }' > .vscode/settings.json
DISPLAY=:99 code --no-sandbox --disable-gpu --disable-workspace-trust .
## Allow vscode to come up before moving to ps and nc that validate that the MCP servers are up and running.
sleep 15
ps axu|grep vscode || true
nc -vz localhost 59071 || echo "MCP server not yet ready"
## Provision TLA+ Tools
##
- name: Get (nightly) TLC
run: wget https://nightly.tlapl.us/dist/tla2tools.jar
- name: Get (nightly) CommunityModules
run: wget https://github.com/tlaplus/CommunityModules/releases/latest/download/CommunityModules-deps.jar
## Provision Python Environment
##
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.13'
- name: Install Python dependencies
run: |
cd python
pip install -r requirements.txt
- name: Test MCP connection
run: |
cd python
python translate.py --test-mcp
env:
GITHUB_API_KEY: ${{ secrets.GITHUB_TOKEN }}
## Run TLAi benchmarks
##
- name: Run Python translate script on the logic puzzles
run: |
cd python
# Build command arguments
ARGS="--model ${{ github.event.inputs.model }}"
# Add model_id if provided (for AWS Bedrock)
if [ -n "${{ github.event.inputs.model_id }}" ]; then
ARGS="$ARGS --model-id ${{ github.event.inputs.model_id }}"
fi
# Add debug flag if enabled
if [ "${{ github.event.inputs.debug }}" = "true" ]; then
ARGS="$ARGS --debug"
fi
# Add puzzle files
ARGS="$ARGS ${{ github.event.inputs.puzzle_files }}"
echo "Running: python translate.py $ARGS"
python translate.py $ARGS
env:
GITHUB_API_KEY: ${{ secrets.GITHUB_TOKEN }}
## Postprocess the TLAi benchmarks
##
- name: List generated files
if: always()
run: |
echo "=== Generated TLA+ files ==="
ls -la *.tla *.cfg 2>/dev/null || echo "No TLA+ files generated"
echo "=== Python logs ==="
# Show any log files if they exist
find . -name "*.log" -exec echo "=== {} ===" \; -exec cat {} \; 2>/dev/null || true
- name: Generate timestamp for artifact
id: timestamp
run: |
timestamp=$(date +%Y%m%d-%H%M%S)
echo "timestamp=${timestamp}" >> $GITHUB_OUTPUT
- name: Archive generated TLA+ specs (if any)
if: always()
uses: actions/upload-artifact@v4
with:
include-hidden-files: true
name: TLAiBenchResults-Python-${{ steps.timestamp.outputs.timestamp }}
path: |
*.tla
*.cfg
python/*.log
!gold/*.tla
!gold/*.cfg