update stop vllm to use ilab/ilab and rename best_model
Signed-off-by: sallyom <[email protected]>
sallyom committed Oct 18, 2024
1 parent 1d48bfb commit 9d45526
Showing 5 changed files with 348 additions and 338 deletions.
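Both eval components now bind the vLLM server to a dynamically chosen port (via free_tcp_ipv4_port from instructlab.model.backends.common) instead of assuming http://localhost:8000 is free. The sketch below shows, using only the standard library, roughly what such a helper does; it is an illustration of the idea, not the imported implementation, and the helper name here is a placeholder.

import socket


def pick_free_tcp_port(host: str = "127.0.0.1") -> int:
    """Ask the kernel for an unused TCP port by binding to port 0."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.bind((host, 0))          # port 0 lets the OS choose a free port
        return sock.getsockname()[1]  # the port that was actually assigned


if __name__ == "__main__":
    port = pick_free_tcp_port()
    print(f"vLLM could be told to listen on http://127.0.0.1:{port}/v1")

There is a small race between releasing the probe socket and vLLM binding the port, which is one reason the components still poll the server before using it.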
95 changes: 49 additions & 46 deletions eval/final/components.py
@@ -32,29 +32,35 @@ def run_final_eval_op(
):
import json
import os
import subprocess

import torch
from instructlab.eval.mmlu import MMLU_TASKS, MMLUBranchEvaluator
from instructlab.eval.mt_bench import MTBenchBranchEvaluator
from instructlab.model.evaluate import qa_pairs_to_qna_to_avg_scores, sort_score

VLLM_SERVER = "http://localhost:8000/v1"

print("Starting Final Eval...")

def launch_vllm(
model_path: str, gpu_count: int, retries: int = 120, delay: int = 10
):
) -> tuple:
import subprocess
import sys
import time

import requests
from instructlab.model.backends.common import free_tcp_ipv4_port

free_port = free_tcp_ipv4_port("127.0.0.1")
port = str(free_port)
vllm_server = f"http://127.0.0.1:{port}/v1"

command = [
sys.executable,
"-m",
"vllm.entrypoints.openai.api_server",
"--port",
port,
"--model",
model_path,
]
@@ -64,16 +70,16 @@ def launch_vllm(
str(gpu_count),
]

subprocess.Popen(args=command)
process = subprocess.Popen(args=command)

print(f"Waiting for vLLM server to start at {VLLM_SERVER}...")
print(f"Waiting for vLLM server to start at {vllm_server}...")

for attempt in range(retries):
try:
response = requests.get(f"{VLLM_SERVER}/models")
response = requests.get(f"{vllm_server}/models")
if response.status_code == 200:
print(f"vLLM server is up and running at {VLLM_SERVER}.")
return
print(f"vLLM server is up and running at {vllm_server}.")
return process, vllm_server
except requests.ConnectionError:
pass

@@ -83,41 +89,38 @@ def launch_vllm(
time.sleep(delay)

raise RuntimeError(
f"Failed to start vLLM server at {VLLM_SERVER} after {retries} retries."
f"Failed to start vLLM server at {vllm_server} after {retries} retries."
)

# This seems like excessive effort to stop the vllm process, but merely saving & killing the pid doesn't work
# Also, the base image does not include 'pkill' cmd, so can't pkill -f vllm.entrypoints.openai.api_server either
def stop_vllm():
import psutil
def shutdown_vllm(process: subprocess.Popen, timeout: int = 20):
import subprocess

for process in psutil.process_iter(attrs=["pid", "name", "cmdline"]):
cmdline = process.info.get("cmdline")
if cmdline and "vllm.entrypoints.openai.api_server" in cmdline:
print(
f"Found vLLM server process with PID: {process.info['pid']}, terminating..."
)
try:
process.terminate() # Try graceful termination
process.wait(timeout=5) # Wait a bit for it to terminate
if process.is_running():
print(
f"Forcefully killing vLLM server process with PID: {process.info['pid']}"
)
process.kill() # Force kill if it's still running
print(
f"Successfully stopped vLLM server with PID: {process.info['pid']}"
)
except psutil.NoSuchProcess:
print(f"Process with PID {process.info['pid']} no longer exists.")
except psutil.AccessDenied:
print(
f"Access denied when trying to terminate process with PID {process.info['pid']}."
)
except Exception as e:
print(
f"Failed to terminate process with PID {process.info['pid']}. Error: {e}"
)
from instructlab.model.backends.vllm import wait_for_stable_vram

try:
process.terminate()
process.wait(timeout=timeout)

if process.poll() is None:
print(f"Forcefully killing vLLM server process with PID: {process.pid}")
process.kill()

print(f"Successfully stopped vLLM server with PID: {process.pid}")

except subprocess.TimeoutExpired:
print(
f"Timeout expired. Forcefully killing vLLM server with PID: {process.pid}"
)
process.kill() # Force kill the process if over timeout
except subprocess.NoSuchProcess:
print(f"Process with PID {process.pid} no longer exists.")
except Exception as e:
print(f"Failed to stop process with PID {process.pid}. Error: {e}")
# Note from instructlab/model/backends/vllm.py
# vLLM relies on stable VRAM, residual reclamation activity
# can lead to crashes on restart. To prevent this add a
# short delay (typically ~ 10 seconds, max 30) to verify stability.
wait_for_stable_vram(30)

# For standalone mode
if candidate_model is None:
@@ -279,12 +282,12 @@ def find_node_dataset_directories(base_dir: str):
for i, evaluator in enumerate(mmlu_branch_evaluators):
m_path = m_paths[i]
print("Launching Vllm...")
launch_vllm(m_path, gpu_count)
overall_score, individual_scores = evaluator.run(VLLM_SERVER)
vllm_process, vllm_server = launch_vllm(m_path, gpu_count)
overall_score, individual_scores = evaluator.run(vllm_server)
overall_scores.append(overall_score)
individual_scores_list.append(individual_scores)
print("Stopping Vllm")
stop_vllm()
shutdown_vllm(vllm_process)

# TODO: update instructlab/instructlab model/evaluate.py
# so this logic can be imported outside of the CLI
Expand Down Expand Up @@ -386,15 +389,15 @@ def find_node_dataset_directories(base_dir: str):
print(
f"Generating questions and reference answers from qna files for branch {branch}..."
)
launch_vllm(m_path, gpu_count)
vllm_process, vllm_server = launch_vllm(m_path, gpu_count)

evaluator.gen_answers(
server_url=VLLM_SERVER,
server_url=vllm_server,
serving_gpus=gpu_count,
max_workers=max_workers,
)

stop_vllm()
shutdown_vllm(vllm_process)

print(f"Evaluating answers for branch {branch}...")
overall_score, qa_pairs, error_rate = evaluator.judge_answers(
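Because the port is chosen at run time, the server URL is rebuilt on every launch and polled until the OpenAI-compatible /models route answers. Below is a self-contained sketch of that readiness loop; the function name wait_for_vllm is a placeholder, since in the diff the loop lives inline in launch_vllm.

import time

import requests


def wait_for_vllm(server_url: str, retries: int = 120, delay: int = 10) -> None:
    """Poll {server_url}/models until it returns HTTP 200 or retries run out."""
    for attempt in range(retries):
        try:
            if requests.get(f"{server_url}/models", timeout=5).status_code == 200:
                print(f"vLLM server is up and running at {server_url}.")
                return
        except requests.ConnectionError:
            pass
        print(f"Not ready yet (attempt {attempt + 1}/{retries}), retrying in {delay}s...")
        time.sleep(delay)
    raise RuntimeError(f"Failed to reach vLLM server at {server_url} after {retries} retries.")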
91 changes: 47 additions & 44 deletions eval/mt_bench/components.py
@@ -23,25 +23,31 @@ def run_mt_bench_op(
) -> NamedTuple("outputs", best_model=str, best_score=float):
import json
import os
import subprocess

import torch
from instructlab.eval.mt_bench import MTBenchEvaluator

VLLM_SERVER = "http://localhost:8000/v1"

def launch_vllm(
model_path: str, gpu_count: int, retries: int = 120, delay: int = 10
):
) -> tuple:
import subprocess
import sys
import time

import requests
from instructlab.model.backends.common import free_tcp_ipv4_port

free_port = free_tcp_ipv4_port("127.0.0.1")
port = str(free_port)
vllm_server = f"http://127.0.0.1:{port}/v1"

command = [
sys.executable,
"-m",
"vllm.entrypoints.openai.api_server",
"--port",
port,
"--model",
model_path,
]
@@ -51,16 +57,16 @@ def launch_vllm(
str(gpu_count),
]

subprocess.Popen(args=command)
process = subprocess.Popen(args=command)

print(f"Waiting for vLLM server to start at {VLLM_SERVER}...")
print(f"Waiting for vLLM server to start at {vllm_server}...")

for attempt in range(retries):
try:
response = requests.get(f"{VLLM_SERVER}/models")
response = requests.get(f"{vllm_server}/models")
if response.status_code == 200:
print(f"vLLM server is up and running at {VLLM_SERVER}.")
return
print(f"vLLM server is up and running at {vllm_server}.")
return process, vllm_server
except requests.ConnectionError:
pass

@@ -70,41 +76,38 @@ def launch_vllm(
time.sleep(delay)

raise RuntimeError(
f"Failed to start vLLM server at {VLLM_SERVER} after {retries} retries."
f"Failed to start vLLM server at {vllm_server} after {retries} retries."
)

# This seems like excessive effort to stop the vllm process, but merely saving & killing the pid doesn't work
# Also, the base image does not include 'pkill' cmd, so can't pkill -f vllm.entrypoints.openai.api_server either
def stop_vllm():
import psutil

for process in psutil.process_iter(attrs=["pid", "name", "cmdline"]):
cmdline = process.info.get("cmdline")
if cmdline and "vllm.entrypoints.openai.api_server" in cmdline:
print(
f"Found vLLM server process with PID: {process.info['pid']}, terminating..."
)
try:
process.terminate() # Try graceful termination
process.wait(timeout=5) # Wait a bit for it to terminate
if process.is_running():
print(
f"Forcefully killing vLLM server process with PID: {process.info['pid']}"
)
process.kill() # Force kill if it's still running
print(
f"Successfully stopped vLLM server with PID: {process.info['pid']}"
)
except psutil.NoSuchProcess:
print(f"Process with PID {process.info['pid']} no longer exists.")
except psutil.AccessDenied:
print(
f"Access denied when trying to terminate process with PID {process.info['pid']}."
)
except Exception as e:
print(
f"Failed to terminate process with PID {process.info['pid']}. Error: {e}"
)
def shutdown_vllm(process: subprocess.Popen, timeout: int = 20):
import subprocess

from instructlab.model.backends.vllm import wait_for_stable_vram

try:
process.terminate()
process.wait(timeout=timeout)

if process.poll() is None:
print(f"Forcefully killing vLLM server process with PID: {process.pid}")
process.kill()

print(f"Successfully stopped vLLM server with PID: {process.pid}")

except subprocess.TimeoutExpired:
print(
f"Timeout expired. Forcefully killing vLLM server with PID: {process.pid}"
)
process.kill() # Force kill the process if over timeout
except subprocess.NoSuchProcess:
print(f"Process with PID {process.pid} no longer exists.")
except Exception as e:
print(f"Failed to stop process with PID {process.pid}. Error: {e}")
# Note from instructlab/model/backends/vllm.py
# vLLM relies on stable VRAM, residual reclamation activity
# can lead to crashes on restart. To prevent this add a
# short delay (typically ~ 10 seconds, max 30) to verify stability.
wait_for_stable_vram(30)

gpu_available = torch.cuda.is_available()
gpu_name = (
@@ -140,7 +143,7 @@ def stop_vllm():
print(f"Serving candidate model: {model_name}")
model_path = f"{models_path_prefix}/{model_name}"

launch_vllm(model_path, gpu_count)
vllm_process, vllm_server = launch_vllm(model_path, gpu_count)

# model ID is the model_path value in vLLM
evaluator = MTBenchEvaluator(
@@ -151,12 +154,12 @@ def stop_vllm():
)

evaluator.gen_answers(
server_url=VLLM_SERVER,
server_url=vllm_server,
serving_gpus=gpu_count,
max_workers=max_workers,
)

stop_vllm()
shutdown_vllm(vllm_process)

overall_score, qa_pairs, turn_scores, error_rate = evaluator.judge_answers(
server_url=judge_endpoint,
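With the Popen handle returned by launch_vllm, teardown no longer needs to scan the process table with psutil; terminating the tracked child and escalating to a hard kill is enough. A standard-library sketch of that escalation follows (the committed shutdown_vllm additionally calls instructlab's wait_for_stable_vram afterwards so VRAM settles before the next launch); the function and demo here are illustrative, not the committed code.

import subprocess
import sys
import time


def stop_child(process: subprocess.Popen, timeout: int = 20) -> None:
    """Ask a child process to exit, escalating to a hard kill if it does not."""
    process.terminate()                # graceful request first
    try:
        process.wait(timeout=timeout)
    except subprocess.TimeoutExpired:
        process.kill()                 # hard stop if it ignored terminate()
        process.wait()


if __name__ == "__main__":
    # Demo with a throwaway child that would otherwise sleep for a minute.
    child = subprocess.Popen([sys.executable, "-c", "import time; time.sleep(60)"])
    time.sleep(1)
    stop_child(child, timeout=5)
    print(f"child exited with return code {child.returncode}")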
7 changes: 5 additions & 2 deletions pipeline.py
@@ -314,7 +314,7 @@ def pipeline(
use_secret_as_env(run_mt_bench_task, JUDGE_SECRET, {"api_key": "JUDGE_API_KEY"})

final_eval_task = run_final_eval_op(
candidate_model=run_mt_bench_task.outputs["best_model"],
candidate_model="/output/model/hf_format/candidate_model",
taxonomy=git_clone_task.outputs["taxonomy"],
tasks=sdg_task.outputs["sdg"],
# TODO: DO we need both candidate_branch and base_branch
@@ -334,7 +334,9 @@ def pipeline(
)

mount_pvc(
task=final_eval_task, pvc_name=model_pvc_task.output, mount_path="/model"
task=final_eval_task,
pvc_name=model_pvc_task.output,
mount_path="/data",
)

use_config_map_as_env(
@@ -345,6 +347,7 @@ def pipeline(

use_secret_as_env(final_eval_task, JUDGE_SECRET, {"api_key": "JUDGE_API_KEY"})

final_eval_task.after(run_mt_bench_task)
final_eval_task.set_accelerator_type("nvidia.com/gpu")
final_eval_task.set_accelerator_limit(1)

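With candidate_model now a fixed path on shared storage rather than run_mt_bench_task.outputs["best_model"], no output flows between the two tasks, so the ordering is declared explicitly with .after(). A minimal, hypothetical KFP v2 sketch of that pattern; the component bodies and names below are placeholders, not the pipeline's real components.

from kfp import dsl


@dsl.component
def run_mt_bench() -> str:
    # Placeholder: the real component serves candidate models and scores them.
    return "best_model_name"


@dsl.component
def run_final_eval(candidate_model: str):
    # Placeholder: the real component runs the MMLU/MT-Bench branch evaluation.
    print(f"evaluating {candidate_model}")


@dsl.pipeline(name="ordering-sketch")
def ordering_sketch():
    mt_bench_task = run_mt_bench()
    # No output is consumed, so KFP sees no data dependency; order it explicitly,
    # mirroring final_eval_task.after(run_mt_bench_task) in the diff.
    final_eval_task = run_final_eval(
        candidate_model="/output/model/hf_format/candidate_model"
    )
    final_eval_task.after(mt_bench_task)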