Warning
The Autoscaler is currently in Beta and may experience changes, quirks, and downtime.
The https://run.vast.ai/route/ endpoint retrieves a worker to process a request.
Inputs:

endpoint: The name of the endpoint group to use.
api_key: The api_key from Vast.ai associated with the account that controls the endpoint group.
cost: The estimated compute resources for the request.

The response contains the following fields:

url: The address of the worker instance to send the request to.
reqnum: The request number corresponding to this worker instance. Workers expect to receive requests in approximately the same order as these reqnums, but some flexibility is allowed because concurrency or small delays on the proxy server can deliver requests out of order (a tolerance sketch follows the sample response below).
signature: The signature produced by the private key corresponding to the autoscaler's public key, which is available at https://run.vast.ai/pubkey/ (see the verification sketch below).
endpoint: Same as the input parameter to /route/.
cost: Same as the input parameter to /route/.
status: The breakdown of workers in your endpoint group by status.
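Workers can use that public key to check that a routing message really originated from the autoscaler. The sketch below is illustrative only: it assumes the key is served PEM-encoded and that the signature is base64-encoded RSA (PKCS#1 v1.5 with SHA-256) over the JSON serialization of the remaining fields; none of these details are specified here.

import base64
import json

import requests
from cryptography.exceptions import InvalidSignature
from cryptography.hazmat.primitives import hashes, serialization
from cryptography.hazmat.primitives.asymmetric import padding

# Fetch the autoscaler's public key (PEM encoding is an assumption).
pubkey_pem = requests.get("https://run.vast.ai/pubkey/", timeout=4).text
public_key = serialization.load_pem_public_key(pubkey_pem.encode())

def verify_route_message(message):
    # Hypothetical scheme: RSA PKCS#1 v1.5 / SHA-256 over the JSON of the
    # non-signature fields. The actual signed-payload format may differ.
    signature = base64.b64decode(message["signature"])
    signed_fields = {k: v for k, v in message.items() if k != "signature"}
    payload = json.dumps(signed_fields).encode()
    try:
        public_key.verify(signature, payload, padding.PKCS1v15(), hashes.SHA256())
        return True
    except InvalidSignature:
        return False

Putting it together in Python, the example below requests a worker from /route/ and then forwards the signed message to that worker's /generate endpoint: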
import json
import requests

def call_worker(endpoint_name, api_key, prompt_input, max_tokens=256, stream=False):
    # Ask the autoscaler for a worker to handle a request of this cost.
    route_payload = {
        "endpoint": endpoint_name,
        "api_key": api_key,
        "cost": max_tokens,
    }
    response = requests.post(
        "https://run.vast.ai/route/",
        headers={"Content-Type": "application/json"},
        data=json.dumps(route_payload),
        timeout=4,
    )
    if response.status_code != 200:
        print(f"Failed to get worker address, response.status_code: {response.status_code}")
        return

    # The /route/ response carries the worker URL plus the signed fields
    # (reqnum, signature, endpoint, cost) that the worker checks.
    message = response.json()
    worker_address = message["url"]
    print(f"calling {worker_address}/generate")

    # Forward the whole signed message, adding the generation inputs.
    generate_url = f"{worker_address}/generate"
    generate_payload = message
    generate_payload["inputs"] = prompt_input
    generate_payload["parameters"] = {"max_new_tokens": max_tokens}
    generate_response = requests.post(
        generate_url,
        headers={"Content-Type": "application/json"},
        data=json.dumps(generate_payload),
        stream=stream,
    )
    if generate_response.status_code != 200:
        print(f"Failed to call /generate endpoint for {generate_url} {generate_response.status_code}")
        return
    print(f"Response from {generate_url}:", generate_response.text)
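A minimal invocation, using the endpoint name from the sample response below and a placeholder API key:

call_worker(
    endpoint_name="Llama-2-70b-chat",
    api_key="API_KEY",  # your Vast.ai API key
    prompt_input="What is Deep Learning?",
)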
The same /route/ call with curl:

curl https://run.vast.ai/route/ \
  -X POST \
  -d '{"endpoint": "ENDPOINT_NAME", "api_key": "API_KEY", "cost": 256}' \
  -H 'Content-Type: application/json'
A sample response:
{
"cost": 256.0,
"endpoint": "Llama-2-70b-chat",
"reqnum": 14395,
"signature": "NpSvPfSajCGwox3QUz1svL9eTVbsGNMWWdDUuxKX1552ADdYxAra2rb4vMgdOwUOmVFngAyIwNZXxjyThXyOZDAnpS+TzZC1LNtvhGmb/wpkBpTjsxkwOGrW0LSq1DIn0usBGhibBIWjFm9/FvGRJvOau9i75gh1+ErEE2H3iVPBtNERQcjMUbflbe0qIr0j2OPIFjiUOwvHooVccI+yX0HOeuUUPG3+8vu4Ek6N/Vmb91rz9VvlAnALImq89zn0U9bYU1wZ6gUI/JkNaz+QKaUpvvcr569jMOCtSmiap+BAQD5113AxexVfNIVDv/x60A3UQS7wrlC2t9qWaTuaJw==",
"url": "http://86.125.47.146:50698"
}
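As noted above, workers see reqnums in roughly, but not exactly, increasing order. Purely as an illustration of that contract (the ReqnumTracker class and its window size are hypothetical, not part of the documented protocol), a worker-side handler could accept slightly stale reqnums like this:

class ReqnumTracker:
    """Accept requests whose reqnum is at or slightly behind the newest seen."""

    def __init__(self, window=32):  # tolerance window is an assumed value
        self.window = window
        self.highest_seen = -1

    def accept(self, reqnum):
        if reqnum >= self.highest_seen:
            self.highest_seen = reqnum
            return True
        # Stragglers delayed by concurrency or the proxy server are still
        # accepted as long as they are not too far behind.
        return self.highest_seen - reqnum <= self.window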
For a more complete test example, see the test_LLM.py script here:
python3 test_LLM.py "https://run.vast.ai" API_KEY "Llama-2-70b-chat" "What is Deep Learning?" 1 --generate_stream