From 2f81a2d4db480b61607871624c5c03269167625c Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Mon, 28 Oct 2024 22:54:57 +0000 Subject: [PATCH 01/27] first commit --- .../container/benchmark_serving.py | 50 ++++++++++++++----- .../container/requirements.txt | 1 + 2 files changed, 38 insertions(+), 13 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index 37ecdb570..e2a28c579 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -20,6 +20,8 @@ import aiohttp import numpy as np +from sympy import symbols +from sympy.parsing.sympy_parser import parse_expr from transformers import AutoTokenizer from transformers import PreTrainedTokenizerBase @@ -96,18 +98,24 @@ def sample_requests( async def get_request( input_requests: List[Tuple[str, int, int]], - request_rate: float, + request_rate_expr: str, + start_time: float, ) -> AsyncGenerator[Tuple[str, int, int], None]: """Gets request async.""" - input_requests = iter(input_requests) for request in input_requests: yield request - if request_rate == float("inf"): + if request_rate_expr == float("inf"): # If the request rate is infinity, then we don't need to wait. continue + + # Evaluate the reqest rate at this point in time + t = symbols('t') + expr_parsed = parse_expr(request_rate_expr, transformations="all", local_dict={"t": t}) + request_rate_at_t = expr_parsed.subs(t, ((time.time_ns() - start_time) / 1000000000)) + # Sample the request interval from the exponential distribution. - interval = np.random.exponential(1.0 / request_rate) + interval = np.random.exponential(1.0 / request_rate_at_t) # The next request will be sent after the interval. 
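A minimal standalone sketch of the arrival-time sampling this hunk introduces, assuming only the pieces shown above (the helper name next_interval is illustrative, not part of the patch): the rate expression f(t) is evaluated at the elapsed time and used as the instantaneous rate of an exponential inter-arrival draw, which, repeated per request, approximates a non-homogeneous Poisson process.

    import time
    import numpy as np
    from sympy import symbols
    from sympy.parsing.sympy_parser import parse_expr

    def next_interval(request_rate_expr: str, start_time_ns: int) -> float:
        # Evaluate the rate expression f(t) at the elapsed time in seconds.
        t = symbols("t")
        expr = parse_expr(request_rate_expr, transformations="all", local_dict={"t": t})
        elapsed_s = (time.time_ns() - start_time_ns) / 1_000_000_000
        rate_now = float(expr.subs(t, elapsed_s))
        # Wait time before the next request: exponential with mean 1 / current rate.
        return float(np.random.exponential(1.0 / rate_now))

    # e.g. a ramp that starts at 1 request/sec and grows by 1.05 requests/sec each second:
    # interval = next_interval("1+1.05*t", time.time_ns())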
await asyncio.sleep(interval) @@ -134,7 +142,7 @@ async def send_request( tokenizer: PreTrainedTokenizerBase, sax_model: str, model: str, -) -> Tuple[Tuple[int, int, float], Dict[str, int]]: +) -> Tuple[Optional[Tuple[int, int, float]], Optional[Dict[str, int]]]: """Sends request to server.""" request_start_time = time.time() errors = init_errors_map() @@ -291,9 +299,9 @@ async def benchmark( tokenizer, args.use_dummy_text, ) - benchmark_start_time = time.time() + benchmark_start_time = time.time_ns() tasks: List[asyncio.Task] = [] - async for request in get_request(input_requests, args.request_rate): + async for request in get_request(input_requests, args.request_rate, benchmark_start_time): prompt, prompt_len, output_len = request task = asyncio.create_task( send_request( @@ -321,7 +329,7 @@ async def benchmark( for err, count in errors.items(): combined_errors[err] = combined_errors[err] + count - benchmark_duration = time.time() - benchmark_start_time + benchmark_duration = (time.time_ns() - benchmark_start_time) / 1000000000 print_and_save_result(args, benchmark_duration, len(input_requests), model, combined_latencies, combined_errors) return combined_latencies, combined_errors @@ -599,6 +607,22 @@ async def main(args: argparse.Namespace): else args.endpoint ) + # Input assertions + def is_expression_of_t(expression): + # Check if expression uses variables other than 't' + try: + # Attempt to evaluate with only 't' defined + t = symbols('t') + expr_parsed = parse_expr(expression, transformations="all", local_dict={"t": t}) + expr_parsed.subs(t, 1) + return True + except KeyError as e: + # If another variable is required, it will throw a KeyError + return False + if not is_expression_of_t(args.request_rate): + raise ValueError(f"Request rate {args.request_rate}, must be an expression of `t`") + + print(f"Starting Prometheus Server on port {PROMETHEUS_PORT}") start_http_server(PROMETHEUS_PORT) @@ -607,8 +631,8 @@ async def main(args: argparse.Namespace): args.tokenizer, trust_remote_code=args.trust_remote_code ) - benchmark_start_time = time.time() - args.start_datetime = datetime.fromtimestamp(benchmark_start_time) + benchmark_start_time = time.time_ns() + args.start_datetime = datetime.fromtimestamp(benchmark_start_time / 1000000000) results = await asyncio.gather( *[benchmark(args, api_url, tokenizer, model) for model in models] @@ -629,7 +653,7 @@ async def main(args: argparse.Namespace): for k, v in errors.items(): combined_errors[k] = combined_errors[k] + v - benchmark_duration_all_models = time.time() - benchmark_start_time + benchmark_duration_all_models = (time.time_ns() - benchmark_start_time) / 1000000000 if args.save_aggregated_result: print_and_save_result(args, benchmark_duration_all_models, len(models)*args.num_prompts, f"ALL-{len(models)}-MODELS", combined_latencies, combined_errors) @@ -713,8 +737,8 @@ async def main(args: argparse.Namespace): ) parser.add_argument( "--request-rate", - type=float, - default=float("inf"), + type=str, + default="inf", help=( "Number of requests per second. If this is inf, " "then all the requests are sent at time 0. 
" diff --git a/benchmarks/benchmark/tools/profile-generator/container/requirements.txt b/benchmarks/benchmark/tools/profile-generator/container/requirements.txt index a9f6d99a6..df46317a0 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/requirements.txt +++ b/benchmarks/benchmark/tools/profile-generator/container/requirements.txt @@ -24,6 +24,7 @@ psutil ray >= 2.9 sentencepiece # Required for LLaMA tokenizer. numpy < 2.0 +sympy <= 1.13 torch == 2.1.1 transformers >= 4.42.0 # Required for Qwen2 xformers == 0.0.23 From 837554b859e266949d1d08caa0a5b6d033ac7149 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Mon, 28 Oct 2024 22:56:09 +0000 Subject: [PATCH 02/27] nit --- .../tools/profile-generator/container/benchmark_serving.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index e2a28c579..3602bf068 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -609,9 +609,8 @@ async def main(args: argparse.Namespace): # Input assertions def is_expression_of_t(expression): - # Check if expression uses variables other than 't' + # Check if expression uses variables other than 't' by attempting to evaluate with only 't' defined try: - # Attempt to evaluate with only 't' defined t = symbols('t') expr_parsed = parse_expr(expression, transformations="all", local_dict={"t": t}) expr_parsed.subs(t, 1) From fe980bf162dedfd4dd273df5f3ddf3d94bdf53c9 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Mon, 28 Oct 2024 22:58:04 +0000 Subject: [PATCH 03/27] ns to sec constant --- .../tools/profile-generator/container/benchmark_serving.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index 3602bf068..3879dfc37 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -31,6 +31,7 @@ CLIENT_TIMEOUT_SEC = 3 * 60 * 60 NEW_TEXT_KEY = "\nOutput:\n" PROMETHEUS_PORT = 9090 +NS_IN_SEC = 1000 * 1000 * 1000 # Prometheus Metrics prompt_length_metric = Histogram("LatencyProfileGenerator:prompt_length", "Input prompt length", buckets=[2**i for i in range(1, 16)]) @@ -329,7 +330,7 @@ async def benchmark( for err, count in errors.items(): combined_errors[err] = combined_errors[err] + count - benchmark_duration = (time.time_ns() - benchmark_start_time) / 1000000000 + benchmark_duration = (time.time_ns() - benchmark_start_time) / NS_IN_SEC print_and_save_result(args, benchmark_duration, len(input_requests), model, combined_latencies, combined_errors) return combined_latencies, combined_errors @@ -631,7 +632,7 @@ def is_expression_of_t(expression): ) benchmark_start_time = time.time_ns() - args.start_datetime = datetime.fromtimestamp(benchmark_start_time / 1000000000) + args.start_datetime = datetime.fromtimestamp(benchmark_start_time / NS_IN_SEC) results = await asyncio.gather( *[benchmark(args, api_url, tokenizer, model) for model in models] @@ -652,7 +653,7 @@ def is_expression_of_t(expression): for k, v in errors.items(): combined_errors[k] = combined_errors[k] + v - benchmark_duration_all_models = (time.time_ns() - 
benchmark_start_time) / 1000000000 + benchmark_duration_all_models = (time.time_ns() - benchmark_start_time) / NS_IN_SEC if args.save_aggregated_result: print_and_save_result(args, benchmark_duration_all_models, len(models)*args.num_prompts, f"ALL-{len(models)}-MODELS", combined_latencies, combined_errors) From 28dcf340d10464b36ddb2be4ee7668ad1a62204f Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Mon, 28 Oct 2024 23:02:24 +0000 Subject: [PATCH 04/27] properly handle infinity --- .../tools/profile-generator/container/benchmark_serving.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index 3879dfc37..d312cf52b 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -619,7 +619,10 @@ def is_expression_of_t(expression): except KeyError as e: # If another variable is required, it will throw a KeyError return False - if not is_expression_of_t(args.request_rate): + + if args.request_rate == "inf": + args.request_rate = "oo" + if and not is_expression_of_t(args.request_rate): raise ValueError(f"Request rate {args.request_rate}, must be an expression of `t`") From 3715a2ace6028ea5ac8fc551f8eb88c23aedf4b8 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Mon, 28 Oct 2024 23:03:23 +0000 Subject: [PATCH 05/27] nit --- .../tools/profile-generator/container/benchmark_serving.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index d312cf52b..293addca0 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -106,7 +106,7 @@ async def get_request( for request in input_requests: yield request - if request_rate_expr == float("inf"): + if request_rate_expr == "oo": # If the request rate is infinity, then we don't need to wait. 
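The is_expression_of_t guard above can also be written with sympy's free_symbols instead of relying on a raised exception; this is a sketch of an equivalent check, not the patch's implementation (function name illustrative). It accepts expressions that are constant or depend only on t, including the "oo" string that "inf" is normalized to:

    from sympy import symbols
    from sympy.parsing.sympy_parser import parse_expr

    def depends_only_on_t(expression: str) -> bool:
        t = symbols("t")
        try:
            expr = parse_expr(expression, local_dict={"t": t})
        except Exception:
            return False  # not parseable at all
        # Reject expressions that reference any symbol other than t.
        return expr.free_symbols.issubset({t})

    assert depends_only_on_t("1+1.05*t")
    assert depends_only_on_t("oo")            # sympy infinity, what "inf" is mapped to
    assert not depends_only_on_t("2*x + t")   # x is an unknown symbol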
continue @@ -622,10 +622,9 @@ def is_expression_of_t(expression): if args.request_rate == "inf": args.request_rate = "oo" - if and not is_expression_of_t(args.request_rate): + if not is_expression_of_t(args.request_rate): raise ValueError(f"Request rate {args.request_rate}, must be an expression of `t`") - print(f"Starting Prometheus Server on port {PROMETHEUS_PORT}") start_http_server(PROMETHEUS_PORT) From dab6d67aaf10f94370f8158a324607e337f8ee9c Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 29 Oct 2024 16:15:18 +0000 Subject: [PATCH 06/27] ns in s --- .../tools/profile-generator/container/benchmark_serving.py | 4 ++-- manifest.yaml | 0 2 files changed, 2 insertions(+), 2 deletions(-) create mode 100644 manifest.yaml diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index 293addca0..e1981e8cc 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -31,7 +31,7 @@ CLIENT_TIMEOUT_SEC = 3 * 60 * 60 NEW_TEXT_KEY = "\nOutput:\n" PROMETHEUS_PORT = 9090 -NS_IN_SEC = 1000 * 1000 * 1000 +NS_IN_SEC = 1_000_000_000 # Prometheus Metrics prompt_length_metric = Histogram("LatencyProfileGenerator:prompt_length", "Input prompt length", buckets=[2**i for i in range(1, 16)]) @@ -113,7 +113,7 @@ async def get_request( # Evaluate the reqest rate at this point in time t = symbols('t') expr_parsed = parse_expr(request_rate_expr, transformations="all", local_dict={"t": t}) - request_rate_at_t = expr_parsed.subs(t, ((time.time_ns() - start_time) / 1000000000)) + request_rate_at_t = expr_parsed.subs(t, ((time.time_ns() - start_time) / NS_IN_SEC)) # Sample the request interval from the exponential distribution. 
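The timing changes in this commit follow one pattern: timestamps are captured with time.time_ns() and divided by the named constant only where a value in seconds is needed. A tiny illustration (variable names illustrative):

    import time

    NS_IN_SEC = 1_000_000_000

    step_start = time.time_ns()
    # ... issue requests ...
    elapsed_s = (time.time_ns() - step_start) / NS_IN_SEC  # duration in seconds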
interval = np.random.exponential(1.0 / request_rate_at_t) diff --git a/manifest.yaml b/manifest.yaml new file mode 100644 index 000000000..e69de29bb From 4bcd688e15585a9d52230426b04fd18fb8ed564e Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 29 Oct 2024 16:15:41 +0000 Subject: [PATCH 07/27] remove manifest.yaml --- manifest.yaml | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 manifest.yaml diff --git a/manifest.yaml b/manifest.yaml deleted file mode 100644 index e69de29bb..000000000 From 68c3283455c83b23c3d3d5d9d93eb3b0ab0550e1 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 29 Oct 2024 16:16:29 +0000 Subject: [PATCH 08/27] better comment --- .../tools/profile-generator/container/benchmark_serving.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index e1981e8cc..c77d7be97 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -617,7 +617,7 @@ def is_expression_of_t(expression): expr_parsed.subs(t, 1) return True except KeyError as e: - # If another variable is required, it will throw a KeyError + # If another variable is required, throw a KeyError return False if args.request_rate == "inf": From b631214b6f42d4e33bbc8f2d23b4b89e845a0e6f Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 29 Oct 2024 16:29:45 +0000 Subject: [PATCH 09/27] better flag message --- .../profile-generator/container/benchmark_serving.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index c77d7be97..4c2a6d764 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -742,11 +742,12 @@ def is_expression_of_t(expression): type=str, default="inf", help=( - "Number of requests per second. If this is inf, " - "then all the requests are sent at time 0. " - "Otherwise, we use Poisson process to synthesize " - "the request arrival times." - ), + "Specifies the request rate as a function of time, f(t)." + " Example format: '1+1.05*t', where 't' represents seconds." + " If set to 'inf', all requests are sent at time 0. Otherwise," + " the function is interpreted to generate a Poisson process" + " for request arrival times based on the provided rate expression." 
+ ), ) parser.add_argument("--seed", type=int, default=int(time.time())) parser.add_argument( From 0cb280855486aae3c29384d35c553547716cc718 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 29 Oct 2024 16:46:44 +0000 Subject: [PATCH 10/27] remove request rate from filename --- .../tools/profile-generator/container/benchmark_serving.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index 4c2a6d764..fbea2ea08 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -420,7 +420,7 @@ def save_json_results(args: argparse.Namespace, benchmark_result, server_metrics # Save to file model_without_slash = model.replace("/","-") file_name = ( - f"{args.file_prefix}-{args.backend}-{args.request_rate}qps-{args.start_datetime.strftime('%Y%m%d-%H%M%S')}-{model_without_slash}.json" + f"{args.file_prefix}-{args.backend}-{args.start_datetime.strftime('%Y%m%d-%H%M%S')}-{model_without_slash}.json" ) with open(file_name, "w", encoding="utf-8") as outfile: json.dump(final_json, outfile) From 4e7cfada7b004fb0ea6b4bec4d3db8e67bf15e57 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 29 Oct 2024 16:49:13 +0000 Subject: [PATCH 11/27] tweak description --- .../profile-generator/container/benchmark_serving.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index fbea2ea08..c90979c95 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -743,10 +743,11 @@ def is_expression_of_t(expression): default="inf", help=( "Specifies the request rate as a function of time, f(t)." - " Example format: '1+1.05*t', where 't' represents seconds." - " If set to 'inf', all requests are sent at time 0. Otherwise," - " the function is interpreted to generate a Poisson process" - " for request arrival times based on the provided rate expression." + " Example format: '1+1.05*t', where 't' represents seconds from" + " start. If set to 'inf', all requests are sent at time 0." + " Otherwise, the function is interpreted to generate a Poisson" + " process for request arrival times based on the provided rate" + " expression." ), ) parser.add_argument("--seed", type=int, default=int(time.time())) From d10d86023d6e4eb8330737e7610c96afaea3777c Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 29 Oct 2024 16:54:34 +0000 Subject: [PATCH 12/27] typo --- .../tools/profile-generator/container/benchmark_serving.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index c90979c95..a4d253b6c 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -110,7 +110,7 @@ async def get_request( # If the request rate is infinity, then we don't need to wait. 
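As a worked example of the rate-function format documented in the --request-rate help above, using only the documented sample expression: f(t) = 1 + 1.05*t starts at 1 request/sec, reaches 64 requests/sec at t = 60, and its integral gives the number of requests such a ramp would be expected to generate over a one-minute step.

    from sympy import integrate, symbols

    t = symbols("t")
    rate = 1 + 1.05 * t
    print(rate.subs(t, 0), rate.subs(t, 60))  # instantaneous rate at t=0 and t=60: 1 and 64 req/s
    print(integrate(rate, (t, 0, 60)))        # expected requests over a 60 s step: 1950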
continue - # Evaluate the reqest rate at this point in time + # Evaluate the request rate at this point in time t = symbols('t') expr_parsed = parse_expr(request_rate_expr, transformations="all", local_dict={"t": t}) request_rate_at_t = expr_parsed.subs(t, ((time.time_ns() - start_time) / NS_IN_SEC)) From 53744df402caafe9c7ec74dd9e39d7c75167fd4a Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Mon, 11 Nov 2024 22:44:05 +0000 Subject: [PATCH 13/27] refactoring --- .../container/benchmark_serving.py | 1201 +++++++++++------ 1 file changed, 793 insertions(+), 408 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index a4d253b6c..f7c08cc26 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -5,6 +5,7 @@ It currently supports TGI, vLLM, Triton TensorRT-LLM and Saxml. """ +from abc import ABC, abstractmethod import argparse import asyncio from datetime import datetime @@ -12,7 +13,8 @@ import random import requests import time -from typing import AsyncGenerator, List, Optional, Tuple, Dict +import os +from typing import AsyncGenerator, List, Optional, Tuple, Dict, TypedDict from prometheus_client import start_http_server, Histogram import google.auth @@ -25,8 +27,6 @@ from transformers import AutoTokenizer from transformers import PreTrainedTokenizerBase -from google.protobuf.timestamp_pb2 import Timestamp - MIN_SEQ_LEN = 4 CLIENT_TIMEOUT_SEC = 3 * 60 * 60 NEW_TEXT_KEY = "\nOutput:\n" @@ -37,73 +37,537 @@ prompt_length_metric = Histogram("LatencyProfileGenerator:prompt_length", "Input prompt length", buckets=[2**i for i in range(1, 16)]) response_length_metric = Histogram("LatencyProfileGenerator:response_length", "Response length", buckets=[2**i for i in range(1, 16)]) tpot_metric = Histogram('LatencyProfileGenerator:time_per_output_token', 'Time per output token per request') + +class BenchmarkConfig(TypedDict): + model: str + model_server: str + start_time: float -def sample_requests( - dataset_path: str, - num_requests: int, - max_input_len: int, - max_output_len: int, - tokenizer: PreTrainedTokenizerBase, - use_dummy_text: bool, -) -> List[Tuple[str, int, int]]: - """Samples requests from the dataset or creates dummy requests.""" - if use_dummy_text: - dummy_prompt_token_ids = [0] * max_input_len - dummy_prompt = tokenizer.decode(dummy_prompt_token_ids) - dummy_requests = [( - dummy_prompt, - max_input_len, - max_output_len, - )] * num_requests - return dummy_requests - - # Load the dataset. - with open(dataset_path) as f: - dataset = json.load(f) - # Filter out the conversations with less than 2 turns. - dataset = [data for data in dataset if len(data["conversations"]) >= 2] - # Only keep the first two turns of each conversation. - dataset = [ - (data["conversations"][0]["value"], data["conversations"][1]["value"]) - for data in dataset - ] - - # Tokenize the prompts and completions. - prompts = [prompt for prompt, _ in dataset] - prompt_token_ids = tokenizer(prompts).input_ids - completions = [completion for _, completion in dataset] - completion_token_ids = tokenizer(completions).input_ids - tokenized_dataset = [] - for i in range(len(dataset)): - output_len = len(completion_token_ids[i]) - tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len)) - - # Filter out too long sequences. 
- filtered_dataset: List[Tuple[str, int, int]] = [] - for prompt, prompt_token_ids, output_len in tokenized_dataset: - prompt_len = len(prompt_token_ids) - if prompt_len < MIN_SEQ_LEN or output_len < MIN_SEQ_LEN: - # Prune too short sequences. - # This is because TGI causes errors when the input or output length - # is too short. - continue - if prompt_len > max_input_len or output_len > max_output_len: - # Prune too long sequences. - continue - filtered_dataset.append((prompt, prompt_len, output_len)) +class MetricSummary(TypedDict, total=False): + short_name: Optional[str] + name: str + description: str + mean: float + median: Optional[float] + sd: Optional[float] + min: Optional[float] + max: Optional[float] + p90: Optional[float] + p99: Optional[float] + +class BenchmarkingStepReport(TypedDict): + """Result for one step""" + request_rate: float + timestamp_start: float + timestamp_end: float + num_prompts_attempted: int + latencies: List + local_metrics: List[MetricSummary] + server_metrics: Optional[List[MetricSummary]] + errors: Dict[str, int] + +class BenchmarkingReport(): + """Results for all steps for a single model""" + args: argparse.Namespace + config: BenchmarkConfig + steps: List[BenchmarkingStepReport] + + def __init__(self, args : argparse.Namespace, model: str, start_time: float): + self.args = args + self.config = BenchmarkConfig( + model = model, + model_server = args.backend, + start_time = start_time + ) + self.steps = [] + + def record_metrics_for_step( + self, + request_rate: float, + timestamp_start: float, + timestamp_end: float, + num_prompts_attempted : int, + latencies: List, + errors: Dict[str, int], + ): + def get_metrics_to_scrape(backend: str) -> List[str]: + if backend == "vllm": + return ["vllm:gpu_cache_usage_perc", "vllm:num_requests_waiting"] + elif backend == "jetstream": + return [ + "jetstream_slots_used_percentage", + "jetstream_prefill_backlog_size", + ] + else: + return [] + + def metric_sumamry_from_points(name: str, description: str, points : List[float], short_name: Optional[str] = None) -> MetricSummary: + mean = np.mean(points) if points else 0 + median = np.median(points) if points else 0 + sd = np.std(points) if points else 0 + min = np.min(points) if points else 0 + max = np.max(points) if points else 0 + p90 = np.percentile(points, 90) if points else 0 + p99 = np.percentile(points, 99) if points else 0 + + return MetricSummary( + short_name = short_name if short_name is not None else name, + name = name, + description = description, + mean = float(mean), + median = float(median), + sd = float(sd), + min = float(min), + max = float(max), + p90 = float(p90), + p99 = float(p99) + ) + + total_time = (timestamp_end - timestamp_start)/ NS_IN_SEC + if self.args.scrape_server_metrics: + server_metrics = fetch_metrics_from_gmp(get_metrics_to_scrape(self.args.backend), total_time, self.args.backend) + + self.steps.append(BenchmarkingStepReport( + request_rate = request_rate, + timestamp_start = timestamp_start, + timestamp_end = timestamp_end, + num_prompts_attempted = num_prompts_attempted, + latencies = latencies, + errors = errors, + local_metrics = [ + metric_sumamry_from_points( + name="per_token_latency", + description="seconds/token (includes waiting time on server)", + points=[latency / (prompt_len + output_len) for prompt_len, output_len, latency in latencies]), + metric_sumamry_from_points( + name="latency", + description="milliseconds/request (includes waiting time on server)" , + points=[1000 * latency for _, _, latency in latencies]), 
+ metric_sumamry_from_points( + short_name="tpot", + name="per_output_token_latency", + description="milliseconds/output_token (includes waiting time on server)", + points=[1000 * latency / output_len for _, output_len, latency in latencies]), + metric_sumamry_from_points( + name="input_length", + description="input length", + points=[float(prompt_len) for prompt_len, _, _ in latencies]), + metric_sumamry_from_points( + name="output_length", + description="output length", + points=[float(output_len) for _, output_len, _ in latencies]), + MetricSummary( + name = "throughput", + description = "throughput", + mean = (len(latencies) / ((timestamp_end - timestamp_start) / NS_IN_SEC)), + ), + ], + server_metrics = server_metrics + )) + + # Each element in the output list is a report for each step + def to_text_reports(self, write_to_files: bool = False) -> List[str]: + output : Dict[str, str] = {} + required_stats = ["latency", "throughput", "input_length", "output_length", "per_output_token_latency"] + for step in self.steps: + if not all(required_stat in [metric['name'] for metric in step['local_metrics']] for required_stat in required_stats): + raise Exception(f"All of the following stats must be recorded: {required_stats}") + + for step in self.steps: + step_output : List[str] = [] + total_time = (step['timestamp_end'] - step['timestamp_start']) / NS_IN_SEC + total_output_tokens = np.sum([output_len for _, output_len, _ in step['latencies']]) + output_tokens_per_second = total_output_tokens / total_time + output_tokens_per_min = 60 * output_tokens_per_second + + total_input_tokens = np.sum([prompt_len for prompt_len, _, _ in step['latencies']]) + input_tokens_per_min = 60 * total_input_tokens / total_time + + total_tokens = total_input_tokens + total_output_tokens + tokens_per_min = 60 * total_tokens / total_time + step_output.append(f"====Result for Model: {self.config['model']}====") + step_output.append(f"Errors: {step['errors']}") + step_output.append(f"Total time: {total_time:.2f} s") + step_output.append(f"Successful/total requests: {len(step['latencies'])}/{step['num_prompts_attempted']}") + step_output.append(f"Requests/min: {60 * step['num_prompts_attempted'] / total_time:.2f}") + step_output.append(f"Output_tokens/min: {output_tokens_per_min:.2f}") + step_output.append(f"Input_tokens/min: {input_tokens_per_min:.2f}") + step_output.append(f"Tokens/min: {tokens_per_min:.2f}") + + if self.args.machine_cost: + step_output.append( + f"Cost $/1k tokens: {self.args.machine_cost * 1000 / (60 * output_tokens_per_min)}" + ) + for metric in step['local_metrics']: + step_output.append(f"Average {metric['description']}:" f" {metric['mean']:.2f}") + output_filename = f"latency-profile-{datetime.fromtimestamp(step['timestamp_start'] / NS_IN_SEC).strftime('%Y-%m-%d_%H-%M-%S')}.txt" + output[output_filename] = '\n'.join(step_output) + if write_to_files: + with open(output_filename, 'w') as file: + file.write(output[output_filename]) + return list(output.values()) + + # The output is a a single json summary of all steps + def to_json_report(self, write_to_file: bool = False) -> Dict: + output = { + "config": { + "num_models": len(self.args.models) if self.args.save_aggregated_result else 1, + "start_time": { + "seconds" : self.steps[0]["timestamp_start"] // NS_IN_SEC, + "nanos" : self.steps[0]["timestamp_start"] % NS_IN_SEC, + }, + **self.config, + }, + "summary_stats": { + "stats": [ + { + "request_rate": step["request_rate"], + **{metric["short_name"]: metric for metric in step["local_metrics"] if 
"short_name" in metric}, + "model_server_metrics": [ + {"name": server_metric["name"], **server_metric} + for server_metric in step["server_metrics"] + ] if step["server_metrics"] is not None else [] + } + for step in self.steps + ] + }, + + # Legacy use case, use config if possible + "dimensions": { + "date": self.args.start_datetime.strftime('%Y%m%d-%H%M%S'), + "backend": self.args.backend, + "model_id": self.config['model'], + "tokenizer_id": self.args.tokenizer, + } if len(self.steps) == 1 else None, + # Legacy use case, use summary_stats if possible + "metrics" : { + # Traffic + "num_prompts_attempted": 0, + "num_prompts_succeeded": 0, + "request_rate": self.steps[0]['request_rate'], + } if len(self.steps) == 1 else None, + } + + if write_to_file: + model_without_slash = self.config['model'].replace("/","-") + file_name = ( + f"{self.args.file_prefix}-{self.args.backend}-{self.args.start_datetime.strftime('%Y%m%d-%H%M%S')}-{model_without_slash}.json" + ) + with open(file_name, "w", encoding="utf-8") as outfile: + json.dump(output, outfile) + return output + +class Backend(ABC): + """ + An abstract base class for Backend that defines the interface + for new model server backends. + """ + + def request(self): + print() - # Sample the requests. - sampled_requests = random.sample(filtered_dataset, num_requests) - return sampled_requests + @abstractmethod + def create_request_payload(self, + api_url: str, + prompt: str, + prompt_len: int, + output_len: int, + best_of: int, + use_beam_search: bool, + top_k: int, + tokenizer: PreTrainedTokenizerBase, + sax_model: str, + model: str) -> Dict: + pass + def tokens_from_response(self, response: Dict): + return "" + + @property + @abstractmethod + def server_metrics(self) -> List[str]: + pass -async def get_request( + @property + @abstractmethod + def api_url(self) -> str: + pass + +class vLLMBackend(Backend): + def server_metrics(self) -> List[str]: + return ["vllm:gpu_cache_usage_perc", "vllm:num_requests_waiting"] + def api_url(self) -> str: + return "v1/completions" + def create_request_payload(self, + prompt: str, + output_len: int, + best_of: int, + use_beam_search: bool, + model: str): + return { + "model": model, + "prompt": prompt, + "n": 1, + "best_of": best_of, + "use_beam_search": use_beam_search, + "temperature": 0.0 if use_beam_search else 1.0, + "top_p": 1.0, + "max_tokens": output_len, + "ignore_eos": False, + "stream": False, + } + def tokens_from_response(self, response : Dict): + return response["choices"][0]["text"] + +class JetstreamBackend(Backend): + def server_metrics(self) -> List[str]: + return [ + "jetstream_slots_used_percentage", + "jetstream_prefill_backlog_size", + ] + def api_url(self) -> str: + return "" + def create_request_payload(self, + prompt: str, + output_len: int): + return { + "prompt": prompt, + "max_tokens": output_len, + } + def tokens_from_response(self, response: Dict): + return response["response"] + +class TgiBackend(Backend): + def server_metrics(self) -> List[str]: + return [""] + def api_url(self) -> str: + return "" + def create_request_payload(self, + prompt: str, + output_len: int, + best_of: int): + return { + "inputs": prompt, + "parameters": { + "best_of": best_of, + "max_new_tokens": output_len, + "do_sample": True, + }, + } + def tokens_from_response(self, response: Dict): + return response["generated_text"] + +class NaiveTransformersBackend(Backend): + def server_metrics(self) -> List[str]: + return [""] + def api_url(self) -> str: + return "" + def create_request_payload(self, + prompt: 
str, + output_len: int, + top_k: int,): + return { + "instances": [{ + "prompt": prompt, + "max_length": output_len, + "top_k": top_k, + }] + } + def tokens_from_response(self, response: Dict): + complete_pred = response["predictions"][0][0]["generated_text"] + new_text_start_index = complete_pred.find(NEW_TEXT_KEY) + len(NEW_TEXT_KEY) + return complete_pred[new_text_start_index:] + +class TensorrtLlmTritonBackend(Backend): + def server_metrics(self) -> List[str]: + return [""] + def api_url(self) -> str: + return "" + def create_request_payload(self, + prompt: str, + output_len: int, + best_of: int, + use_beam_search: bool): + return { + "text_input": prompt, + "max_tokens": output_len, + "beam_width": 1 if not use_beam_search else best_of, + "temperature": 0.0 if use_beam_search else 1.0, + "top_p": 1.0, + "bad_words": "", + "stop_words": "", + "stream": False, + } + def tokens_from_response(self, response: Dict): + return response["text_output"] + +class SaxBackend(Backend): + def server_metrics(self) -> List[str]: + return [""] + def api_url(self) -> str: + return "" + def create_request_payload(self, + prompt: str, + output_len: int, + best_of: int, + use_beam_search: bool, + sax_model: str): + return { + "model": sax_model, + "prompt": prompt, + "n": 1, + "best_of": best_of, + "use_beam_search": use_beam_search, + "temperature": 0.0 if use_beam_search else 1.0, + "top_p": 1.0, + "top_k": 50, + "max_tokens": output_len, + "stream": False, + } + def tokens_from_response(self, response: Dict): + return response["choices"][0]["text"] + +def init_errors_map() -> Dict[str, int]: + errors = { + "ClientConnectorError": 0, + "TimeoutError": 0, + "ContentTypeError": 0, + "ClientOSError": 0, + "ServerDisconnectedError": 0, + "unknown_error": 0, + } + return errors + +def getBackend(backend: str) -> Backend: + if backend == "vllm": + return vLLMBackend() + elif backend == "tgi": + return TgiBackend() + elif backend == "naive_transformers": + return NaiveTransformersBackend() + elif backend == "tensorrt_llm_triton": + return TensorrtLlmTritonBackend() + elif backend == "sax": + return SaxBackend() + elif backend == "jetstream": + return JetstreamBackend() + else: + raise ValueError("Unsupported backend") + + +def fetch_metrics_from_gmp(metrics: List[str], duration: float, backend: str) -> List[MetricSummary]: + """Gets summaries for metrics queried from GMP, queries vary per model server""" + + # Creates a credentials object from the default service account file + # Assumes that script has appropriate default credentials set up, ref: + # https://googleapis.dev/python/google-auth/latest/user-guide.html#application-default-credentials + credentials, project_id = google.auth.default() + # Prepare an authentication request - helps format the request auth token + auth_req = google.auth.transport.requests.Request() + + # Request refresh tokens + credentials.refresh(auth_req) + url='https://monitoring.googleapis.com/v1/projects/%s/location/global/prometheus/api/v1/metadata' % (project_id) + headers_api = {'Authorization': 'Bearer ' + credentials.token} + request_post = requests.get(url=url, headers=headers_api) + all_metrics_metadata = request_post.json() + if request_post.ok is not True: + print("HTTP Error: %s" % (all_metrics_metadata)) + return [] + if all_metrics_metadata["status"] != "success": + print("Metadata error response: %s" % all_metrics_metadata["error"]) + return [] + + metrics_list : List[MetricSummary] = [] + for metric in metrics: + print("Metric Name: %s" % (metric)) + + # Find metric 
type + metric_type = all_metrics_metadata['data'][metric] + if all_metrics_metadata['data'][metric] is None: + print("No metric found for: %s" % metric) + return [] + metric_type = metric_type[0]['type'] + + metric_results = {} + # Queries scrape all metrics collected from the last $DURATION seconds from the backend's related + # podmonitoring spec assumed to be named "$BACKEND-podmonitoring" + queries = { + "gauge": { + "Mean": "avg_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), + "Median": "quantile_over_time(0.5, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), + "Sd": "stddev_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), + "Min": "min_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), + "Max": "max_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), + "P90": "quantile_over_time(0.9, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), + "P99": "quantile_over_time(0.99, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), + }, + "histogram": { + "Mean": "sum(rate(%s_sum{job='%s-podmonitoring'}[%.0fs])) / sum(rate(%s_count{job='%s-podmonitoring'}[%.0fs]))" % (metric, backend, duration, metric, backend, duration), + "Median": "histogram_quantile(0.5, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration), + "Min": "histogram_quantile(0, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration), + "Max": "histogram_quantile(1, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration), + "P90": "histogram_quantile(0.9, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration), + "P99": "histogram_quantile(0.99, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration), + } + } + + metric_data : MetricSummary = { + "name": metric, + "description": f"Metrics for {metric} from {backend} backend", + } + for query_name, query in queries[metric_type].items(): + + # Configure respective query + url = f'https://monitoring.googleapis.com/v1/projects/{project_id}/location/global/prometheus/api/v1/query' + headers_api = {'Authorization': f'Bearer {credentials.token}'} + params = {'query': query} + + request_post = requests.get(url=url, headers=headers_api, params=params) + response = request_post.json() + + # handle response + if request_post.ok: + if response["status"] == "success": + metric_results[query_name] = float(response["data"]["result"][0]["value"][1]) + print("%s: %s" % (query_name, response["data"]["result"][0]["value"][1])) + else: + print("Cloud Monitoring PromQL Error: %s" % (response["error"])) + else: + print("HTTP Error: %s" % (response)) + + # Handle response + if request_post.ok and response["status"] == "success": + result_value = float(response["data"]["result"][0]["value"][1]) + if query_name == "Mean": + metric_data["mean"] = result_value + elif query_name == "Median": + metric_data["median"] = result_value + elif query_name == "Sd": + metric_data["sd"] = result_value + elif query_name == "Min": + metric_data["min"] = result_value + elif query_name == "Max": + metric_data["max"] = result_value + elif query_name == "P90": + metric_data["p90"] = result_value + elif query_name == "P99": + metric_data["p99"] = result_value + else: + error_message = response.get("error", "HTTP Error") + print(f"Error fetching {query_name} for {metric}: {error_message}") + + 
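To make the query templating above concrete, this is how one of the gauge queries renders for a vLLM metric; the template and metric name are taken from this file, while the 120-second duration is only an example value:

    metric, backend, duration = "vllm:gpu_cache_usage_perc", "vllm", 120.0
    query = "avg_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration)
    print(query)  # avg_over_time(vllm:gpu_cache_usage_perc{job='vllm-podmonitoring'}[120s])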
metrics_list.append(metric_data) + return metrics_list + +async def generate_next_request( input_requests: List[Tuple[str, int, int]], request_rate_expr: str, start_time: float, ) -> AsyncGenerator[Tuple[str, int, int], None]: """Gets request async.""" - for request in input_requests: + request = random.choice(input_requests) + while True: yield request if request_rate_expr == "oo": @@ -120,17 +584,6 @@ async def get_request( # The next request will be sent after the interval. await asyncio.sleep(interval) -def init_errors_map() -> Dict[str, int]: - errors = { - "ClientConnectorError": 0, - "TimeoutError": 0, - "ContentTypeError": 0, - "ClientOSError": 0, - "ServerDisconnectedError": 0, - "unknown_error": 0, - } - return errors - async def send_request( backend: str, api_url: str, @@ -163,7 +616,6 @@ async def send_request( "stream": False, } elif backend == "tgi": - assert not use_beam_search params = { "best_of": best_of, "max_new_tokens": output_len, @@ -221,7 +673,7 @@ async def send_request( while True: try: async with session.post(api_url, headers=headers, json=pload, ssl=False) as response: - output = await response.json() + output = await response.json() # Re-send the request if it failed. if "error" not in output: @@ -285,316 +737,178 @@ async def send_request( return request_latency, None +def get_filtered_dataset( + dataset_path: str, + max_input_len: int, + max_output_len: int, + tokenizer: PreTrainedTokenizerBase, + use_dummy_text: bool, +) -> List[Tuple[str, int, int]]: + """Gets a subset of the dataset where all elements adhere to the specified constraints""" + if use_dummy_text: + dummy_prompt_token_ids = [0] * max_input_len + dummy_prompt = tokenizer.decode(dummy_prompt_token_ids) + return [( + dummy_prompt, + max_input_len, + max_output_len, + )] + + # Load the dataset. + with open(dataset_path) as f: + dataset = json.load(f) + # Filter out the conversations with less than 2 turns. + dataset = [data for data in dataset if len(data["conversations"]) >= 2] + # Only keep the first two turns of each conversation. + dataset = [ + (data["conversations"][0]["value"], data["conversations"][1]["value"]) + for data in dataset + ] + + # Tokenize the prompts and completions. + prompts = [prompt for prompt, _ in dataset] + prompt_token_ids = tokenizer(prompts).input_ids + completions = [completion for _, completion in dataset] + completion_token_ids = tokenizer(completions).input_ids + tokenized_dataset = [] + for i in range(len(dataset)): + output_len = len(completion_token_ids[i]) + tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len)) + + # Filter out too long sequences. + filtered_dataset: List[Tuple[str, int, int]] = [] + for prompt, prompt_token_ids, output_len in tokenized_dataset: + prompt_len = len(prompt_token_ids) + if prompt_len < MIN_SEQ_LEN or output_len < MIN_SEQ_LEN: + # Prune too short sequences. + # This is because TGI causes errors when the input or output length + # is too short. + continue + if prompt_len > max_input_len or output_len > max_output_len: + # Prune too long sequences. 
+ continue + filtered_dataset.append((prompt, prompt_len, output_len)) + + return filtered_dataset + async def benchmark( args: argparse.Namespace, api_url: str, tokenizer: PreTrainedTokenizerBase, model: str, -) -> Tuple[List[Tuple[int, int, float]], Dict[str, int]]: +) -> BenchmarkingReport: """Runs benchmark with asynchronous requests.""" - input_requests = sample_requests( + input_requests = get_filtered_dataset( args.dataset, - args.num_prompts, args.max_input_length, args.max_output_length, tokenizer, args.use_dummy_text, ) - benchmark_start_time = time.time_ns() - tasks: List[asyncio.Task] = [] - async for request in get_request(input_requests, args.request_rate, benchmark_start_time): - prompt, prompt_len, output_len = request - task = asyncio.create_task( - send_request( - args.backend, - api_url, - prompt, - prompt_len, - output_len, - args.best_of, - args.use_beam_search, - args.top_k, - tokenizer, - args.sax_model, - model, - ) - ) - tasks.append(task) - results = await asyncio.gather(*tasks) - combined_latencies = [] - combined_errors = init_errors_map() - for latency, errors in results: - if latency: - combined_latencies.append(latency) - if errors: - for err, count in errors.items(): - combined_errors[err] = combined_errors[err] + count + benchmark_results = BenchmarkingReport(args, model, time.time_ns()) + + all_steps = {} + if args.job is not None: + all_steps = args.job + elif args.num_prompts is not None: + all_steps = { + "steps": [{ + "rate": args.request_rate, + "max_num_prompts": args.num_prompts, + }] + } - benchmark_duration = (time.time_ns() - benchmark_start_time) / NS_IN_SEC - print_and_save_result(args, benchmark_duration, len(input_requests), model, combined_latencies, combined_errors) - return combined_latencies, combined_errors + for index, step in enumerate(all_steps["steps"]): + + # No need to sleep before running the first step + if 'time_between_steps' in args.job and index != 0: + print(f"Sleeping for {args.job['time_between_steps']} sec...") + await asyncio.sleep(args.job["time_between_steps"]) + max_prompts = f" {step['max_num_prompts']} requests" if 'max_num_prompts' in step else " " + duration = f" {step['time']} sec" if 'time' in step else " " + print(f"Starting benchmarking{max_prompts} at {step['rate']} requests/sec for{duration}") + tasks: List[asyncio.Task] = [] + prompts_sent_this_step: int = 0 + step_start_timestamp = time.time_ns() + async for request in generate_next_request(input_requests, str(step["rate"]), step_start_timestamp): + # Stop conditions + if "max_num_prompts" in step and prompts_sent_this_step >= step["max_num_prompts"]: + break + if "time" in step and ((time.time_ns() - step_start_timestamp ) / NS_IN_SEC) > step["time"]: + break -def save_json_results(args: argparse.Namespace, benchmark_result, server_metrics, model, errors): - # Setup - start_dt_proto = Timestamp() - start_dt_proto.FromDatetime(args.start_datetime) + prompt, prompt_len, output_len = request + task = asyncio.create_task( + send_request( + args.backend, + api_url, + prompt, + prompt_len, + output_len, + args.best_of, + args.use_beam_search, + args.top_k, + tokenizer, + args.sax_model, + model, + ) + ) + tasks.append(task) + prompts_sent_this_step += 1 - final_json = { - # metrics values are numerical - "metrics" : { - # Traffic - "num_prompts_attempted": benchmark_result['num_prompts_attempted'], - "num_prompts_succeeded": benchmark_result['num_prompts_succeeded'], - "request_rate": args.request_rate, - 'server_metrics': { - **server_metrics - }, - 
**benchmark_result, - **errors, - }, - # dimensions values are strings - "dimensions": { - "date": args.start_datetime.strftime('%Y%m%d-%H%M%S'), - "backend": args.backend, - "model_id": model, - "tokenizer_id": args.tokenizer, - **(json.loads(args.additional_metadata_metrics_to_save) if args.additional_metadata_metrics_to_save else {}) - }, - "config": { - "model": model, - "num_models": len(args.models.split(',')), - "model_server": args.backend, - "start_time": { - "seconds" : start_dt_proto.seconds, - "nanos" : start_dt_proto.nanos - } - }, - "summary_stats": { - "stats": [{ - "request_rate": args.request_rate, - "request_latency": { - "mean": benchmark_result["avg_latency"], - "median": benchmark_result["median_latency"], - "sd": benchmark_result["sd_latency"], - "min": benchmark_result["min_latency"], - "max": benchmark_result["max_latency"], - "p90": benchmark_result["p90_latency"], - "p99": benchmark_result["p99_latency"], - }, - "throughput": { - "mean": benchmark_result['throughput'] - }, - "input_length": { - "mean": benchmark_result["avg_input_len"], - "median": benchmark_result["median_input_len"], - "sd": benchmark_result["sd_input_len"], - "min": benchmark_result["min_input_len"], - "max": benchmark_result["max_input_len"], - "p90": benchmark_result["p90_input_len"], - "p99": benchmark_result["p99_input_len"], - }, - "output_length": { - "mean": benchmark_result["avg_output_len"], - "median": benchmark_result["median_output_len"], - "sd": benchmark_result["sd_output_len"], - "min": benchmark_result["min_output_len"], - "max": benchmark_result["max_output_len"], - "p90": benchmark_result["p90_output_len"], - "p99": benchmark_result["p99_output_len"], - }, - "tpot": { - "mean": benchmark_result["avg_per_output_token_latency"], - "median": benchmark_result["median_per_output_token_latency"], - "sd": benchmark_result["sd_per_output_token_latency"], - "min": benchmark_result["min_per_output_token_latency"], - "max": benchmark_result["max_per_output_token_latency"], - "p90": benchmark_result["p90_per_output_token_latency"], - "p99": benchmark_result["p99_per_output_token_latency"], - }, - "model_server_metrics" : [{"Name": name, **metrics} for name, metrics in server_metrics.items()] - }] - } - } + print("All requests sent, awaiting responses...") + results = await asyncio.gather(*tasks) + step_end_timestamp = time.time_ns() + print(f"Finished benchmarking step {index + 1}") + + all_latencies = [] + all_errors = init_errors_map() + for latency, errors in results: + if latency: + all_latencies.append(latency) + if errors: + for err, count in errors.items(): + all_errors[err] = all_errors[err] + count + benchmark_results.record_metrics_for_step(step['rate'], step_start_timestamp, step_end_timestamp, prompts_sent_this_step, all_latencies, all_errors) - # Save to file - model_without_slash = model.replace("/","-") - file_name = ( - f"{args.file_prefix}-{args.backend}-{args.start_datetime.strftime('%Y%m%d-%H%M%S')}-{model_without_slash}.json" - ) - with open(file_name, "w", encoding="utf-8") as outfile: - json.dump(final_json, outfile) + print(f"Completed all steps, generating reports...") + return benchmark_results -def metrics_to_scrape(backend: str) -> List[str]: - # Each key in the map is a metric, it has a corresponding 'stats' object - # It must be populated on the outputs 'metrics' field as 'key':'stats' - # If a value is specified for a given key, it will be populated on the outputs `summary_stats.stats` field as 'value':'stats' as well. 
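Since the refactored benchmark() above drives traffic from a list of steps (args.job) rather than a single request rate, a job description along these lines is what it consumes. The field names below are the ones the code reads; the concrete values are illustrative, and how the job is supplied on the command line is not shown in this part of the series:

    job = {
        "time_between_steps": 30,                   # idle seconds between steps
        "steps": [
            {"rate": "2", "time": 120},             # constant 2 req/s for two minutes
            {"rate": "1+1.05*t", "time": 60},       # ramp, re-evaluated per request
            {"rate": "10", "max_num_prompts": 500}, # stop after a fixed request budget
        ],
    }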
- if backend == "vllm": - return ["vllm:gpu_cache_usage_perc", "vllm:num_requests_waiting"] - elif backend == "jetstream": - return [ - "jetstream_slots_used_percentage", - "jetstream_prefill_backlog_size", - ] - else: - return [] - -def print_metrics(metrics: List[str], duration: float, backend: str): - # Creates a credentials object from the default service account file - # Assumes that script has appropriate default credentials set up, ref: - # https://googleapis.dev/python/google-auth/latest/user-guide.html#application-default-credentials - credentials, project_id = google.auth.default() - # Prepare an authentication request - helps format the request auth token - auth_req = google.auth.transport.requests.Request() - - server_metrics = {} - - # Request refresh tokens - credentials.refresh(auth_req) - url='https://monitoring.googleapis.com/v1/projects/%s/location/global/prometheus/api/v1/metadata' % (project_id) - headers_api = {'Authorization': 'Bearer ' + credentials.token} - request_post = requests.get(url=url, headers=headers_api) - all_metrics_metadata = request_post.json() - if request_post.ok is not True: - print("HTTP Error: %s" % (all_metrics_metadata)) - if all_metrics_metadata["status"] != "success": - print("Metadata error response: %s" % all_metrics_metadata["error"]) - - for metric in metrics: - print("Metric Name: %s" % (metric)) - - # Find metric type - metric_type = all_metrics_metadata['data'][metric] - if all_metrics_metadata['data'][metric] is None: - print("No metric found for: %s" % metric) - return - metric_type = metric_type[0]['type'] - - metric_results = {} - # Queries scrape all metrics collected from the last $DURATION seconds from the backend's related - # podmonitoring spec assumed to be named "$BACKEND-podmonitoring" - queries = { - "gauge": { - "Mean": "avg_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), - "Median": "quantile_over_time(0.5, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), - "Sd": "stddev_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), - "Min": "min_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), - "Max": "max_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), - "P90": "quantile_over_time(0.9, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), - "P99": "quantile_over_time(0.99, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), - }, - "histogram": { - "Mean": "sum(rate(%s_sum{job='%s-podmonitoring'}[%.0fs])) / sum(rate(%s_count{job='%s-podmonitoring'}[%.0fs]))" % (metric, backend, duration, metric, backend, duration), - "Median": "histogram_quantile(0.5, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration), - "Min": "histogram_quantile(0, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration), - "Max": "histogram_quantile(1, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration), - "P90": "histogram_quantile(0.9, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration), - "P99": "histogram_quantile(0.99, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration), - } - } - for query_name, query in queries[metric_type].items(): - # Configure respective query - url='https://monitoring.googleapis.com/v1/projects/%s/location/global/prometheus/api/v1/query' % (project_id) - headers_api = {'Authorization': 
'Bearer ' + credentials.token} - params = {'query': query} - request_post = requests.get(url=url, headers=headers_api, params=params) - response = request_post.json() - - # handle response - if request_post.ok: - if response["status"] == "success": - metric_results[query_name] = float(response["data"]["result"][0]["value"][1]) - print("%s: %s" % (query_name, response["data"]["result"][0]["value"][1])) - else: - print("Cloud Monitoring PromQL Error: %s" % (response["error"])) - else: - print("HTTP Error: %s" % (response)) - server_metrics[metric] = metric_results - return server_metrics - -def get_stats_for_set(name, description, points): - avg = np.mean(points) if points else 0 - median = np.median(points) if points else 0 - sd = np.std(points) if points else 0 - min = np.min(points) if points else 0 - max = np.max(points) if points else 0 - p90 = np.percentile(points, 90) if points else 0 - p99 = np.percentile(points, 99) if points else 0 - - print(f"Average {description}:" f" {avg:.2f}") - - return { - f'avg_{name}': avg, - f'median_{name}': median, - f'sd_{name}': sd, - f'min_{name}': min, - f'max_{name}': max, - f'p90_{name}': p90, - f'p99_{name}': p99, +def aggregate_benchmark_reports(reports: List[BenchmarkingReport]) -> BenchmarkingReport: + """When benchmarking multiple models we will generate a BenchmarkingReport for each.""" + """If `save_aggregated_result` is set, we aggregate these into a single report.""" + + aggregated_step_report = { + "request_rate": reports[0].steps[0]["request_rate"], + "timestamp_start": 0.0, + "timestamp_end": 0.0, + "num_prompts_attempted": 0, + "latencies": [], + "server_metrics": [], + "errors": {}, } -def print_and_save_result(args: argparse.Namespace, benchmark_duration, total_requests, model, request_latencies, errors): - benchmark_result = {} - - print(f"====Result for Model: {model}====") - print(f"Errors: {errors}") - print(f"Total time: {benchmark_duration:.2f} s") - print(f"Successful/total requests: {len(request_latencies)}/{total_requests}") - print(f"Requests/min: {60 * total_requests / benchmark_duration:.2f}") - benchmark_result["num_prompts_attempted"] = total_requests - benchmark_result["num_prompts_succeeded"] = len(request_latencies) - benchmark_result['benchmark_time'] = benchmark_duration - benchmark_result['throughput_rps'] = (args.num_prompts / benchmark_duration) - - total_output_tokens = np.sum([output_len for _, output_len, _ in - request_latencies]) - output_tokens_per_second = total_output_tokens / benchmark_duration - benchmark_result['throughput'] = output_tokens_per_second - - output_tokens_per_min = 60 * output_tokens_per_second - print(f"Output_tokens/min: {output_tokens_per_min:.2f}") - benchmark_result['total_output_token'] = int(total_output_tokens) - benchmark_result['output_tokens_per_min'] = output_tokens_per_min - - total_input_tokens = np.sum([prompt_len for prompt_len, _, _ in - request_latencies]) - input_tokens_per_min = 60 * total_input_tokens / benchmark_duration - print(f"Input_tokens/min: {input_tokens_per_min:.2f}") - benchmark_result['total_input_tokens'] = int(total_input_tokens) - benchmark_result['input_tokens_per_min'] = input_tokens_per_min - - total_tokens = total_input_tokens + total_output_tokens - tokens_per_min = 60 * total_tokens / benchmark_duration - print(f"Tokens/min: {tokens_per_min:.2f}") - benchmark_result['total_tokens'] = int(total_tokens) - benchmark_result['tokens_per_min'] = tokens_per_min - - if args.machine_cost: - print( - "Cost $/1k tokens:" - f" {args.machine_cost * 1000 / 
(60 * output_tokens_per_min)}" - ) + def accumulate_errors(errors_list: List[Dict[str, int]]) -> Dict[str, int]: + accumulated_errors = init_errors_map() + for errors in errors_list: + for error_type, count in errors.items(): + accumulated_errors[error_type] += count + return accumulated_errors - benchmark_result = { - **benchmark_result, - **(get_stats_for_set("per_token_latency", "seconds/token (includes waiting time on server)", [ - latency / (prompt_len + output_len) - for prompt_len, output_len, latency in request_latencies - ])), - - # NOTE: The latency below includes requests awaiting time on server side. - # It's not comparable with the model inference latency for batch size 1. - **(get_stats_for_set("latency", "milliseconds/request (includes waiting time on server)" ,[1000 * latency for _, _, latency in request_latencies])), - **(get_stats_for_set("per_output_token_latency", "milliseconds/output_token (includes waiting time on server)", [1000 * latency / output_len for _, output_len, latency in request_latencies])), - **(get_stats_for_set("input_len", "input length", [float(prompt_len) for prompt_len, _, _ in request_latencies])), - **(get_stats_for_set("output_len", "output length", [float(output_len) for _, output_len, _ in request_latencies])) - } + for report in reports: + # Input metavalidation asserts this report only has one step report + report = report.steps[0] + aggregated_step_report["timestamp_start"] = min(aggregated_step_report["timestamp_start"], report["timestamp_start"]) + aggregated_step_report["timestamp_end"] = max(aggregated_step_report["timestamp_end"], report["timestamp_end"]) + aggregated_step_report["num_prompts_attempted"] += report["num_prompts_attempted"] + aggregated_step_report["latencies"].extend(report["latencies"]) + aggregated_step_report["errors"] = accumulate_errors([aggregated_step_report["errors"], report["errors"]]) - server_metrics = {} - if args.scrape_server_metrics: - server_metrics = print_metrics(metrics_to_scrape(args.backend), benchmark_duration, args.backend) - if args.save_json_results: - save_json_results(args, benchmark_result, server_metrics, model, errors) + aggregated_report = BenchmarkingReport(reports[0].args, f"ALL-{len(reports)}-MODELS", aggregated_step_report["timestamp_start"]) + aggregated_report.record_metrics_for_step(**aggregated_step_report) + + return aggregated_report async def main(args: argparse.Namespace): print(args) @@ -606,24 +920,7 @@ async def main(args: argparse.Namespace): "v1/completions" if args.backend == "vllm" else args.endpoint -) - - # Input assertions - def is_expression_of_t(expression): - # Check if expression uses variables other than 't' by attempting to evaluate with only 't' defined - try: - t = symbols('t') - expr_parsed = parse_expr(expression, transformations="all", local_dict={"t": t}) - expr_parsed.subs(t, 1) - return True - except KeyError as e: - # If another variable is required, throw a KeyError - return False - - if args.request_rate == "inf": - args.request_rate = "oo" - if not is_expression_of_t(args.request_rate): - raise ValueError(f"Request rate {args.request_rate}, must be an expression of `t`") + ) print(f"Starting Prometheus Server on port {PROMETHEUS_PORT}") start_http_server(PROMETHEUS_PORT) @@ -632,33 +929,33 @@ def is_expression_of_t(expression): tokenizer = AutoTokenizer.from_pretrained( args.tokenizer, trust_remote_code=args.trust_remote_code ) - - benchmark_start_time = time.time_ns() - args.start_datetime = datetime.fromtimestamp(benchmark_start_time / NS_IN_SEC) + 
args.start_datetime = datetime.fromtimestamp(time.time_ns() / NS_IN_SEC) - results = await asyncio.gather( - *[benchmark(args, api_url, tokenizer, model) for model in models] - ) - - # Summarize results - combined_latencies = [] - combined_errors = { - "ClientConnectorError": 0, - "TimeoutError": 0, - "ContentTypeError": 0, - "ClientOSError": 0, - "unknown_error": 0, - "ServerDisconnectedError": 0, - } - for latencies, errors in results: - combined_latencies.extend(latencies) - for k, v in errors.items(): - combined_errors[k] = combined_errors[k] + v - - benchmark_duration_all_models = (time.time_ns() - benchmark_start_time) / NS_IN_SEC + reports : List[BenchmarkingReport] = await asyncio.gather( + *[benchmark(args, api_url, tokenizer, model) for model in models] + ) + if args.save_aggregated_result: - print_and_save_result(args, benchmark_duration_all_models, len(models)*args.num_prompts, f"ALL-{len(models)}-MODELS", combined_latencies, combined_errors) + aggregated_benchmark = aggregate_benchmark_reports(reports) + aggregated_benchmark.to_text_reports(write_to_files=True) + aggregated_benchmark.to_json_report(write_to_file=args.save_json_results) + else: + for report in reports: + report.to_text_reports(write_to_files=True) + report.to_json_report(write_to_file=args.save_json_results) + +def input_metavalidation(args: argparse.Namespace): + """Validate a correct combination of arguments is set""" + + if sum([bool(args.request_rate is not None and args.num_prompts is not None), bool(args.job is not None)]) != 1: + raise ValueError("All args must be set for one and only one of the following sets of arguments: {--request-rate, --num-prompts} or {--job}") + if args.save_aggregated_result and args.benchmark is not None and len(args.benchmark) != 1 and args.models is not None and len(args.models) > 1: + raise ValueError("Multi model benchmarking with multi step benchmarking is not supported yet") + + if args.use_beam_search and args.backend == "tgi": + raise ValueError("Beam search is not supported by TGI") + if __name__ == "__main__": parser = argparse.ArgumentParser( description="Benchmark the online serving throughput." @@ -683,7 +980,6 @@ def is_expression_of_t(expression): help="Model name to send request to at API server for SAX model server.", ) parser.add_argument("--file-prefix", type=str, default="benchmark") - parser.add_argument("--endpoint", type=str, default="generate") parser.add_argument("--host", type=str, default="localhost") parser.add_argument("--port", type=int, default=7080) parser.add_argument("--dataset", type=str, help="Path to the dataset.") @@ -708,7 +1004,7 @@ def is_expression_of_t(expression): parser.add_argument( "--num-prompts", type=int, - default=1000, + default=None, help="Number of prompts to process.", ) parser.add_argument( @@ -737,10 +1033,24 @@ def is_expression_of_t(expression): " LLaMA2 models." ), ) + + # Input assertions + def is_expression_of_t(input_str): + if input_str == "inf": + return "oo" + # Check if expression uses variables other than 't' by attempting to evaluate with only 't' defined + try: + t = symbols('t') + expr_parsed = parse_expr(input_str, transformations="all", local_dict={"t": t}) + expr_parsed.subs(t, 1) + return input_str + except Exception: + raise ValueError(f"Request rate {input_str}, must be an expression of `t`") + parser.add_argument( "--request-rate", - type=str, - default="inf", + type=is_expression_of_t, + default=None, help=( "Specifies the request rate as a function of time, f(t)." 
" Example format: '1+1.05*t', where 't' represents seconds from" @@ -750,6 +1060,80 @@ def is_expression_of_t(expression): " expression." ), ) + + def parse_request_rates(input_str): + if input_str is None: + return None + # Check if input is a filename and load its contents + if os.path.isfile(input_str): + with open(input_str, 'r') as file: + input_str = file.read() + print(input_str) + try: + # Parse the input string as JSON + request_data = json.loads(input_str) + # Validate that the JSON has the correct structure + if not isinstance(request_data, dict): + raise argparse.ArgumentTypeError("Input JSON must be an object containing 'time_between_steps' and 'steps'.") + # Check 'time_between_steps' field + if "time_between_steps" not in request_data or (not isinstance(request_data["time_between_steps"], float) and not isinstance(request_data["time_between_steps"], int)): + raise argparse.ArgumentTypeError("'time_between_steps' must be a float or int.") + # Check 'steps' field + if "steps" not in request_data or not isinstance(request_data["steps"], list): + raise argparse.ArgumentTypeError("'steps' must be a list of objects with 'rate' and 'time'.") + + # Validate each entry in the 'steps' list + for i, rate_entry in enumerate(request_data["steps"]): + if not isinstance(rate_entry, dict): + raise argparse.ArgumentTypeError(f"Entry {i} in 'steps' must be a JSON object.") + + if "rate" not in rate_entry: + raise argparse.ArgumentTypeError(f"Entry {i} in 'steps' must have a 'rate' key.") + if "time" not in rate_entry and "max_num_prompts" not in rate_entry: + raise argparse.ArgumentTypeError(f"Entry {i} in 'steps' must have a 'time' and/or 'max_num_prompts' key.") + + # Validate the 'rate' field to allow for string expressions or floats + if isinstance(rate_entry["rate"], str): + try: + is_expression_of_t(rate_entry["rate"]) # Validate the expression + except Exception as e: + raise argparse.ArgumentTypeError(f"Entry {i} in 'steps': {e}") + # Validate the 'time' field + if not isinstance(rate_entry["time"], (float, int)): + raise argparse.ArgumentTypeError(f"Entry {i} in 'steps': 'time' must be a positive float.") + return request_data + except json.JSONDecodeError as e: + raise argparse.ArgumentTypeError("Invalid JSON format") + + parser.add_argument( + "--job", + type=parse_request_rates, + default=None, + required=False, + help=( + "Specify the benchmark procedure in JSON format, either as raw JSON" + " or as a filename. \n" + " The JSON should have the following structure:\n\n" + " {\n" + " \"time_between_steps\": float (seconds to rest between rates),\n" + " \"rates\": [\n" + " {\n" + " \"rate\": float | str (as would be passed to request-rate),\n" + " \"time\": float (number of seconds for this step)\n" + " \"max_num_prompts\": int (maximum number of prompts for this step)" + " },\n" + " ...\n" + " ]\n" + " }\n\n" + " Example JSON:\n" + " '{\"time_between_steps\": 1.0, \"rates\": [{\"rate\": 2.0, \"time\": 0.0}, {\"rate\": \"1+0.5*t\", \"time\": 5.0}]}'\n\n" + " Each entry should have a 'rate' and/or 'num_prompts' and 'time' value." 
+ " Each rate is finished when \"num_prompts\" prompts are sent" + " (if specified) and \"time\" seconds have passed (if specified)," + " whichever comes last" + ), + ) + parser.add_argument("--seed", type=int, default=int(time.time())) parser.add_argument( "--trust-remote-code", @@ -794,4 +1178,5 @@ def is_expression_of_t(expression): help="Whether to scrape server metrics.", ) cmd_args = parser.parse_args() + input_metavalidation(cmd_args) asyncio.run(main(cmd_args)) \ No newline at end of file From 82425125dcde6de3fe7f5f59f42b7c11f41ce86e Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Mon, 11 Nov 2024 23:57:51 +0000 Subject: [PATCH 14/27] intermediate change --- .../container/benchmark_serving.py | 267 ++++++++++++++---- 1 file changed, 206 insertions(+), 61 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index f7c08cc26..8cd98e956 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -5,7 +5,7 @@ It currently supports TGI, vLLM, Triton TensorRT-LLM and Saxml. """ -from abc import ABC, abstractmethod +from abc import ABC, abstractmethod, abstractproperty import argparse import asyncio from datetime import datetime @@ -263,12 +263,91 @@ class Backend(ABC): for new model server backends. """ - def request(self): - print() + async def send_request( + self, + api_url: str, + prompt: str, + prompt_len: int, + output_len: int, + best_of: int, + use_beam_search: bool, + top_k: int, + tokenizer: PreTrainedTokenizerBase, + sax_model: str, + model: str, + ) -> Tuple[Optional[Tuple[int, int, float]], Optional[Dict[str, int]]]: + """Sends request to server.""" + request_start_time = time.time() + errors = init_errors_map() + + headers = {"User-Agent": "Benchmark Client"} + pload = self.create_request_payload( + prompt=prompt, + prompt_len=prompt_len, + output_len=output_len, + best_of=best_of, + use_beam_search=use_beam_search, + top_k=top_k, + tokenizer=tokenizer, + sax_model=sax_model, + model=model, + ) + + # Set client timeout to be 3 hrs. + timeout = aiohttp.ClientTimeout(total=CLIENT_TIMEOUT_SEC) + async with aiohttp.ClientSession(timeout=timeout,trust_env=True) as session: + while True: + try: + async with session.post(f"{api_url}/{self.get_endpoint()}", headers=headers, json=pload, ssl=False) as response: + output = await response.json() + + # Re-send the request if it failed. 
+ if "error" not in output: + break + except aiohttp.client_exceptions.ClientConnectorError as client_err: + errors["ClientConnectorError"] += 1 + print(f"ClientConnectorError: {client_err}") + return None, errors + except asyncio.TimeoutError as timeout_err: + errors["TimeoutError"] += 1 + print(f"TimeoutError: {timeout_err}") + return None, errors + except aiohttp.client_exceptions.ClientOSError as e: + errors["ClientOSError"] += 1 + print(f"ClientOSError: {e}") + return None, errors + except aiohttp.client_exceptions.ContentTypeError as e: + print(f"ContentTypeError: {e}, response: {response}") + errors["ContentTypeError"] += 1 + return None, errors + except aiohttp.client_exceptions.ServerDisconnectedError as e: + errors["ServerDisconnectedError"] += 1 + print(f"ServerDisconnectedError: {e}") + return None, errors + except Exception as e: + print(f"Unknown error {e}") + errors["unknown_error"] += 1 + return None, errors + request_end_time = time.time() + # Naive HF transformers generation and TensorRT-LLM generation stops at EOS + # tokens and the generation may be shorter than the ground-truth output + # sequence length. + output_len = self.get_response_length( + response=output, + request_len=prompt_len, + tokenizer=tokenizer + ) + + # (prompt len, output len, latency, success) + request_latency = (prompt_len, output_len, (request_end_time - request_start_time)) + tpot_metric.observe((request_end_time - request_start_time) / output_len) + prompt_length_metric.observe(prompt_len) + response_length_metric.observe(output_len) + + return request_latency, None @abstractmethod def create_request_payload(self, - api_url: str, prompt: str, prompt_len: int, output_len: int, @@ -280,29 +359,36 @@ def create_request_payload(self, model: str) -> Dict: pass - def tokens_from_response(self, response: Dict): - return "" + @abstractmethod + def get_response_length( + self, + request_len: int, + response: Dict, + tokenizer: PreTrainedTokenizerBase) -> int: + pass - @property @abstractmethod - def server_metrics(self) -> List[str]: + def get_server_metrics(self) -> List[str]: pass - @property @abstractmethod - def api_url(self) -> str: + def get_endpoint(self) -> str: pass class vLLMBackend(Backend): - def server_metrics(self) -> List[str]: + def get_server_metrics(self) -> List[str]: return ["vllm:gpu_cache_usage_perc", "vllm:num_requests_waiting"] - def api_url(self) -> str: + def get_endpoint(self) -> str: return "v1/completions" def create_request_payload(self, prompt: str, + prompt_len: int, output_len: int, best_of: int, use_beam_search: bool, + top_k: int, + tokenizer: PreTrainedTokenizerBase, + sax_model: str, model: str): return { "model": model, @@ -316,36 +402,60 @@ def create_request_payload(self, "ignore_eos": False, "stream": False, } - def tokens_from_response(self, response : Dict): - return response["choices"][0]["text"] + def get_response_length( + self, + request_len: int, + response: Dict, + tokenizer: PreTrainedTokenizerBase): + print(response) + output_token_ids = tokenizer(response["choices"][0]["text"]).input_ids + return len(output_token_ids) class JetstreamBackend(Backend): - def server_metrics(self) -> List[str]: + def get_server_metrics(self) -> List[str]: return [ "jetstream_slots_used_percentage", "jetstream_prefill_backlog_size", ] - def api_url(self) -> str: + def get_endpoint(self) -> str: return "" def create_request_payload(self, - prompt: str, - output_len: int): + prompt: str, + prompt_len: int, + output_len: int, + best_of: int, + use_beam_search: bool, + top_k: 
int, + tokenizer: PreTrainedTokenizerBase, + sax_model: str, + model: str): return { "prompt": prompt, "max_tokens": output_len, } - def tokens_from_response(self, response: Dict): - return response["response"] + def get_response_length( + self, + request_len: int, + response: Dict, + tokenizer: PreTrainedTokenizerBase): + output_token_ids = tokenizer(response["response"]).input_ids + return len(output_token_ids) class TgiBackend(Backend): - def server_metrics(self) -> List[str]: + def get_server_metrics(self) -> List[str]: return [""] - def api_url(self) -> str: + def get_endpoint(self) -> str: return "" - def create_request_payload(self, - prompt: str, - output_len: int, - best_of: int): + def create_request_payload(self, + prompt: str, + prompt_len: int, + output_len: int, + best_of: int, + use_beam_search: bool, + top_k: int, + tokenizer: PreTrainedTokenizerBase, + sax_model: str, + model: str): return { "inputs": prompt, "parameters": { @@ -354,18 +464,29 @@ def create_request_payload(self, "do_sample": True, }, } - def tokens_from_response(self, response: Dict): - return response["generated_text"] + def get_response_length( + self, + request_len: int, + response: Dict, + tokenizer: PreTrainedTokenizerBase): + output_token_ids = tokenizer(response["generated_text"]).input_ids + return len(output_token_ids) class NaiveTransformersBackend(Backend): - def server_metrics(self) -> List[str]: + def get_server_metrics(self) -> List[str]: return [""] - def api_url(self) -> str: + def get_endpoint(self) -> str: return "" - def create_request_payload(self, - prompt: str, - output_len: int, - top_k: int,): + def create_request_payload(self, + prompt: str, + prompt_len: int, + output_len: int, + best_of: int, + use_beam_search: bool, + top_k: int, + tokenizer: PreTrainedTokenizerBase, + sax_model: str, + model: str): return { "instances": [{ "prompt": prompt, @@ -373,21 +494,32 @@ def create_request_payload(self, "top_k": top_k, }] } - def tokens_from_response(self, response: Dict): + def get_response_length( + self, + request_len: int, + response: Dict, + tokenizer: PreTrainedTokenizerBase): complete_pred = response["predictions"][0][0]["generated_text"] new_text_start_index = complete_pred.find(NEW_TEXT_KEY) + len(NEW_TEXT_KEY) - return complete_pred[new_text_start_index:] + pred = complete_pred[new_text_start_index:] + output_token_ids = tokenizer(pred).input_ids + return len(output_token_ids) - request_len class TensorrtLlmTritonBackend(Backend): - def server_metrics(self) -> List[str]: + def get_server_metrics(self) -> List[str]: return [""] - def api_url(self) -> str: + def get_endpoint(self) -> str: return "" def create_request_payload(self, - prompt: str, - output_len: int, - best_of: int, - use_beam_search: bool): + prompt: str, + prompt_len: int, + output_len: int, + best_of: int, + use_beam_search: bool, + top_k: int, + tokenizer: PreTrainedTokenizerBase, + sax_model: str, + model: str): return { "text_input": prompt, "max_tokens": output_len, @@ -398,20 +530,29 @@ def create_request_payload(self, "stop_words": "", "stream": False, } - def tokens_from_response(self, response: Dict): - return response["text_output"] + def get_response_length( + self, + request_len: int, + response: Dict, + tokenizer: PreTrainedTokenizerBase): + output_token_ids = tokenizer(response["text_output"]).input_ids + return len(output_token_ids) class SaxBackend(Backend): - def server_metrics(self) -> List[str]: + def get_server_metrics(self) -> List[str]: return [""] - def api_url(self) -> str: + def 
get_endpoint(self) -> str: return "" def create_request_payload(self, - prompt: str, - output_len: int, - best_of: int, - use_beam_search: bool, - sax_model: str): + prompt: str, + prompt_len: int, + output_len: int, + best_of: int, + use_beam_search: bool, + top_k: int, + tokenizer: PreTrainedTokenizerBase, + sax_model: str, + model: str): return { "model": sax_model, "prompt": prompt, @@ -424,8 +565,13 @@ def create_request_payload(self, "max_tokens": output_len, "stream": False, } - def tokens_from_response(self, response: Dict): - return response["choices"][0]["text"] + def get_response_length( + self, + request_len: int, + response: Dict, + tokenizer: PreTrainedTokenizerBase): + output_token_ids = tokenizer(response["choices"][0]["text"]).input_ids + return len(output_token_ids) def init_errors_map() -> Dict[str, int]: errors = { @@ -792,8 +938,8 @@ def get_filtered_dataset( return filtered_dataset async def benchmark( - args: argparse.Namespace, - api_url: str, + args: argparse.Namespace, + backend: Backend, tokenizer: PreTrainedTokenizerBase, model: str, ) -> BenchmarkingReport: @@ -817,14 +963,13 @@ async def benchmark( "max_num_prompts": args.num_prompts, }] } - for index, step in enumerate(all_steps["steps"]): # No need to sleep before running the first step if 'time_between_steps' in args.job and index != 0: print(f"Sleeping for {args.job['time_between_steps']} sec...") await asyncio.sleep(args.job["time_between_steps"]) - max_prompts = f" {step['max_num_prompts']} requests" if 'max_num_prompts' in step else " " + max_prompts = f" {step['max_num_prompts']} requests" if 'max_num_prompts' in step else "" duration = f" {step['time']} sec" if 'time' in step else " " print(f"Starting benchmarking{max_prompts} at {step['rate']} requests/sec for{duration}") @@ -840,9 +985,8 @@ async def benchmark( prompt, prompt_len, output_len = request task = asyncio.create_task( - send_request( - args.backend, - api_url, + backend.send_request( + f"http://{args.host}:{args.port}", prompt, prompt_len, output_len, @@ -931,8 +1075,9 @@ async def main(args: argparse.Namespace): ) args.start_datetime = datetime.fromtimestamp(time.time_ns() / NS_IN_SEC) + backend: Backend = getBackend(args.backend) reports : List[BenchmarkingReport] = await asyncio.gather( - *[benchmark(args, api_url, tokenizer, model) for model in models] + *[benchmark(args, backend, tokenizer, model) for model in models] ) if args.save_aggregated_result: From 75f9acd7d4b406f6dcb8cc5fdf0b6779d3529b4d Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 12 Nov 2024 00:00:16 +0000 Subject: [PATCH 15/27] remove print --- .../tools/profile-generator/container/benchmark_serving.py | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index 8cd98e956..910cf03d8 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -407,7 +407,6 @@ def get_response_length( request_len: int, response: Dict, tokenizer: PreTrainedTokenizerBase): - print(response) output_token_ids = tokenizer(response["choices"][0]["text"]).input_ids return len(output_token_ids) From 7208868f998089e24b319cc1d1e652b83ec68eca Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 12 Nov 2024 18:34:16 +0000 Subject: [PATCH 16/27] remove duplicate methods --- .../container/benchmark_serving.py | 803 +++++++----------- 
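For orientation before the consolidation below: once every model server is wrapped in a Backend subclass, main() only needs the factory plus one shared instance. A minimal sketch assembled from calls the patch itself introduces (getBackend, benchmark, and the BenchmarkingReport writers):

    # One Backend object per run, shared by the per-model benchmark tasks.
    backend = getBackend(args.backend)        # e.g. "vllm" -> vLLMBackend()
    reports = await asyncio.gather(
        *[benchmark(args, backend, tokenizer, model) for model in models]
    )
    for report in reports:
        report.to_text_reports(write_to_files=True)
        report.to_json_report(write_to_file=args.save_json_results)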
1 file changed, 317 insertions(+), 486 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index 910cf03d8..c503889e1 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -5,7 +5,7 @@ It currently supports TGI, vLLM, Triton TensorRT-LLM and Saxml. """ -from abc import ABC, abstractmethod, abstractproperty +from abc import ABC, abstractmethod import argparse import asyncio from datetime import datetime @@ -38,225 +38,6 @@ response_length_metric = Histogram("LatencyProfileGenerator:response_length", "Response length", buckets=[2**i for i in range(1, 16)]) tpot_metric = Histogram('LatencyProfileGenerator:time_per_output_token', 'Time per output token per request') -class BenchmarkConfig(TypedDict): - model: str - model_server: str - start_time: float - -class MetricSummary(TypedDict, total=False): - short_name: Optional[str] - name: str - description: str - mean: float - median: Optional[float] - sd: Optional[float] - min: Optional[float] - max: Optional[float] - p90: Optional[float] - p99: Optional[float] - -class BenchmarkingStepReport(TypedDict): - """Result for one step""" - request_rate: float - timestamp_start: float - timestamp_end: float - num_prompts_attempted: int - latencies: List - local_metrics: List[MetricSummary] - server_metrics: Optional[List[MetricSummary]] - errors: Dict[str, int] - -class BenchmarkingReport(): - """Results for all steps for a single model""" - args: argparse.Namespace - config: BenchmarkConfig - steps: List[BenchmarkingStepReport] - - def __init__(self, args : argparse.Namespace, model: str, start_time: float): - self.args = args - self.config = BenchmarkConfig( - model = model, - model_server = args.backend, - start_time = start_time - ) - self.steps = [] - - def record_metrics_for_step( - self, - request_rate: float, - timestamp_start: float, - timestamp_end: float, - num_prompts_attempted : int, - latencies: List, - errors: Dict[str, int], - ): - def get_metrics_to_scrape(backend: str) -> List[str]: - if backend == "vllm": - return ["vllm:gpu_cache_usage_perc", "vllm:num_requests_waiting"] - elif backend == "jetstream": - return [ - "jetstream_slots_used_percentage", - "jetstream_prefill_backlog_size", - ] - else: - return [] - - def metric_sumamry_from_points(name: str, description: str, points : List[float], short_name: Optional[str] = None) -> MetricSummary: - mean = np.mean(points) if points else 0 - median = np.median(points) if points else 0 - sd = np.std(points) if points else 0 - min = np.min(points) if points else 0 - max = np.max(points) if points else 0 - p90 = np.percentile(points, 90) if points else 0 - p99 = np.percentile(points, 99) if points else 0 - - return MetricSummary( - short_name = short_name if short_name is not None else name, - name = name, - description = description, - mean = float(mean), - median = float(median), - sd = float(sd), - min = float(min), - max = float(max), - p90 = float(p90), - p99 = float(p99) - ) - - total_time = (timestamp_end - timestamp_start)/ NS_IN_SEC - if self.args.scrape_server_metrics: - server_metrics = fetch_metrics_from_gmp(get_metrics_to_scrape(self.args.backend), total_time, self.args.backend) - - self.steps.append(BenchmarkingStepReport( - request_rate = request_rate, - timestamp_start = timestamp_start, - timestamp_end = timestamp_end, - 
num_prompts_attempted = num_prompts_attempted, - latencies = latencies, - errors = errors, - local_metrics = [ - metric_sumamry_from_points( - name="per_token_latency", - description="seconds/token (includes waiting time on server)", - points=[latency / (prompt_len + output_len) for prompt_len, output_len, latency in latencies]), - metric_sumamry_from_points( - name="latency", - description="milliseconds/request (includes waiting time on server)" , - points=[1000 * latency for _, _, latency in latencies]), - metric_sumamry_from_points( - short_name="tpot", - name="per_output_token_latency", - description="milliseconds/output_token (includes waiting time on server)", - points=[1000 * latency / output_len for _, output_len, latency in latencies]), - metric_sumamry_from_points( - name="input_length", - description="input length", - points=[float(prompt_len) for prompt_len, _, _ in latencies]), - metric_sumamry_from_points( - name="output_length", - description="output length", - points=[float(output_len) for _, output_len, _ in latencies]), - MetricSummary( - name = "throughput", - description = "throughput", - mean = (len(latencies) / ((timestamp_end - timestamp_start) / NS_IN_SEC)), - ), - ], - server_metrics = server_metrics - )) - - # Each element in the output list is a report for each step - def to_text_reports(self, write_to_files: bool = False) -> List[str]: - output : Dict[str, str] = {} - required_stats = ["latency", "throughput", "input_length", "output_length", "per_output_token_latency"] - for step in self.steps: - if not all(required_stat in [metric['name'] for metric in step['local_metrics']] for required_stat in required_stats): - raise Exception(f"All of the following stats must be recorded: {required_stats}") - - for step in self.steps: - step_output : List[str] = [] - total_time = (step['timestamp_end'] - step['timestamp_start']) / NS_IN_SEC - total_output_tokens = np.sum([output_len for _, output_len, _ in step['latencies']]) - output_tokens_per_second = total_output_tokens / total_time - output_tokens_per_min = 60 * output_tokens_per_second - - total_input_tokens = np.sum([prompt_len for prompt_len, _, _ in step['latencies']]) - input_tokens_per_min = 60 * total_input_tokens / total_time - - total_tokens = total_input_tokens + total_output_tokens - tokens_per_min = 60 * total_tokens / total_time - step_output.append(f"====Result for Model: {self.config['model']}====") - step_output.append(f"Errors: {step['errors']}") - step_output.append(f"Total time: {total_time:.2f} s") - step_output.append(f"Successful/total requests: {len(step['latencies'])}/{step['num_prompts_attempted']}") - step_output.append(f"Requests/min: {60 * step['num_prompts_attempted'] / total_time:.2f}") - step_output.append(f"Output_tokens/min: {output_tokens_per_min:.2f}") - step_output.append(f"Input_tokens/min: {input_tokens_per_min:.2f}") - step_output.append(f"Tokens/min: {tokens_per_min:.2f}") - - if self.args.machine_cost: - step_output.append( - f"Cost $/1k tokens: {self.args.machine_cost * 1000 / (60 * output_tokens_per_min)}" - ) - for metric in step['local_metrics']: - step_output.append(f"Average {metric['description']}:" f" {metric['mean']:.2f}") - output_filename = f"latency-profile-{datetime.fromtimestamp(step['timestamp_start'] / NS_IN_SEC).strftime('%Y-%m-%d_%H-%M-%S')}.txt" - output[output_filename] = '\n'.join(step_output) - if write_to_files: - with open(output_filename, 'w') as file: - file.write(output[output_filename]) - return list(output.values()) - - # The output is a a single 
json summary of all steps - def to_json_report(self, write_to_file: bool = False) -> Dict: - output = { - "config": { - "num_models": len(self.args.models) if self.args.save_aggregated_result else 1, - "start_time": { - "seconds" : self.steps[0]["timestamp_start"] // NS_IN_SEC, - "nanos" : self.steps[0]["timestamp_start"] % NS_IN_SEC, - }, - **self.config, - }, - "summary_stats": { - "stats": [ - { - "request_rate": step["request_rate"], - **{metric["short_name"]: metric for metric in step["local_metrics"] if "short_name" in metric}, - "model_server_metrics": [ - {"name": server_metric["name"], **server_metric} - for server_metric in step["server_metrics"] - ] if step["server_metrics"] is not None else [] - } - for step in self.steps - ] - }, - - # Legacy use case, use config if possible - "dimensions": { - "date": self.args.start_datetime.strftime('%Y%m%d-%H%M%S'), - "backend": self.args.backend, - "model_id": self.config['model'], - "tokenizer_id": self.args.tokenizer, - } if len(self.steps) == 1 else None, - # Legacy use case, use summary_stats if possible - "metrics" : { - # Traffic - "num_prompts_attempted": 0, - "num_prompts_succeeded": 0, - "request_rate": self.steps[0]['request_rate'], - } if len(self.steps) == 1 else None, - } - - if write_to_file: - model_without_slash = self.config['model'].replace("/","-") - file_name = ( - f"{self.args.file_prefix}-{self.args.backend}-{self.args.start_datetime.strftime('%Y%m%d-%H%M%S')}-{model_without_slash}.json" - ) - with open(file_name, "w", encoding="utf-8") as outfile: - json.dump(output, outfile) - return output - class Backend(ABC): """ An abstract base class for Backend that defines the interface @@ -572,6 +353,321 @@ def get_response_length( output_token_ids = tokenizer(response["choices"][0]["text"]).input_ids return len(output_token_ids) +class BenchmarkConfig(TypedDict): + model: str + model_server: str + start_time: float + +class MetricSummary(TypedDict, total=False): + short_name: Optional[str] + name: str + description: str + mean: float + median: Optional[float] + sd: Optional[float] + min: Optional[float] + max: Optional[float] + p90: Optional[float] + p99: Optional[float] + +class BenchmarkingStepReport(TypedDict): + """Result for one step""" + request_rate: float + timestamp_start: float + timestamp_end: float + num_prompts_attempted: int + latencies: List + local_metrics: List[MetricSummary] + server_metrics: Optional[List[MetricSummary]] + errors: Dict[str, int] + +class BenchmarkingReport(): + """Results for all steps for a single model""" + args: argparse.Namespace + config: BenchmarkConfig + steps: List[BenchmarkingStepReport] + + def __init__(self, args : argparse.Namespace, model: str, start_time: float): + self.args = args + self.config = BenchmarkConfig( + model = model, + model_server = args.backend, + start_time = start_time + ) + self.steps = [] + + def record_metrics_for_step( + self, + request_rate: float, + timestamp_start: float, + timestamp_end: float, + num_prompts_attempted : int, + latencies: List, + errors: Dict[str, int], + backend: Backend, + ): + + def fetch_metrics_from_gmp(backend: Backend, duration: float) -> List[MetricSummary]: + """Gets summaries for metrics queried from GMP, queries vary per model server""" + + # Creates a credentials object from the default service account file + # Assumes that script has appropriate default credentials set up, ref: + # https://googleapis.dev/python/google-auth/latest/user-guide.html#application-default-credentials + credentials, project_id = 
google.auth.default() + # Prepare an authentication request - helps format the request auth token + auth_req = google.auth.transport.requests.Request() + + # Request refresh tokens + credentials.refresh(auth_req) + url='https://monitoring.googleapis.com/v1/projects/%s/location/global/prometheus/api/v1/metadata' % (project_id) + headers_api = {'Authorization': 'Bearer ' + credentials.token} + request_post = requests.get(url=url, headers=headers_api) + all_metrics_metadata = request_post.json() + if request_post.ok is not True: + print("HTTP Error: %s" % (all_metrics_metadata)) + return [] + if all_metrics_metadata["status"] != "success": + print("Metadata error response: %s" % all_metrics_metadata["error"]) + return [] + + metrics_list : List[MetricSummary] = [] + for metric in backend.get_server_metrics(): + print("Metric Name: %s" % (metric)) + + # Find metric type + metric_type = all_metrics_metadata['data'][metric] + if all_metrics_metadata['data'][metric] is None: + print("No metric found for: %s" % metric) + return [] + metric_type = metric_type[0]['type'] + + metric_results = {} + # Queries scrape all metrics collected from the last $DURATION seconds from the backend's related + # podmonitoring spec assumed to be named "$BACKEND-podmonitoring" + queries = { + "gauge": { + "Mean": "avg_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, self.args.backend, duration), + "Median": "quantile_over_time(0.5, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, self.args.backend, duration), + "Sd": "stddev_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, self.args.backend, duration), + "Min": "min_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, self.args.backend, duration), + "Max": "max_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, self.args.backend, duration), + "P90": "quantile_over_time(0.9, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, self.args.backend, duration), + "P99": "quantile_over_time(0.99, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, self.args.backend, duration), + }, + "histogram": { + "Mean": "sum(rate(%s_sum{job='%s-podmonitoring'}[%.0fs])) / sum(rate(%s_count{job='%s-podmonitoring'}[%.0fs]))" % (metric, self.args.backend, duration, metric, self.args.backend, duration), + "Median": "histogram_quantile(0.5, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, self.args.backend, duration), + "Min": "histogram_quantile(0, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, self.args.backend, duration), + "Max": "histogram_quantile(1, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, self.args.backend, duration), + "P90": "histogram_quantile(0.9, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, self.args.backend, duration), + "P99": "histogram_quantile(0.99, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, self.args.backend, duration), + } + } + + metric_data : MetricSummary = { + "name": metric, + "description": f"Metrics for {metric} from {self.args.backend} backend", + } + for query_name, query in queries[metric_type].items(): + + # Configure respective query + url = f'https://monitoring.googleapis.com/v1/projects/{project_id}/location/global/prometheus/api/v1/query' + headers_api = {'Authorization': f'Bearer {credentials.token}'} + params = {'query': query} + + request_post = requests.get(url=url, headers=headers_api, params=params) + response = request_post.json() + + # handle response + if request_post.ok: + if response["status"] == 
"success": + metric_results[query_name] = float(response["data"]["result"][0]["value"][1]) + print("%s: %s" % (query_name, response["data"]["result"][0]["value"][1])) + else: + print("Cloud Monitoring PromQL Error: %s" % (response["error"])) + else: + print("HTTP Error: %s" % (response)) + + # Handle response + if request_post.ok and response["status"] == "success": + result_value = float(response["data"]["result"][0]["value"][1]) + if query_name == "Mean": + metric_data["mean"] = result_value + elif query_name == "Median": + metric_data["median"] = result_value + elif query_name == "Sd": + metric_data["sd"] = result_value + elif query_name == "Min": + metric_data["min"] = result_value + elif query_name == "Max": + metric_data["max"] = result_value + elif query_name == "P90": + metric_data["p90"] = result_value + elif query_name == "P99": + metric_data["p99"] = result_value + else: + error_message = response.get("error", "HTTP Error") + print(f"Error fetching {query_name} for {metric}: {error_message}") + + metrics_list.append(metric_data) + return metrics_list + + def metric_sumamry_from_points(name: str, description: str, points : List[float], short_name: Optional[str] = None) -> MetricSummary: + mean = np.mean(points) if points else 0 + median = np.median(points) if points else 0 + sd = np.std(points) if points else 0 + min = np.min(points) if points else 0 + max = np.max(points) if points else 0 + p90 = np.percentile(points, 90) if points else 0 + p99 = np.percentile(points, 99) if points else 0 + + return MetricSummary( + short_name = short_name if short_name is not None else name, + name = name, + description = description, + mean = float(mean), + median = float(median), + sd = float(sd), + min = float(min), + max = float(max), + p90 = float(p90), + p99 = float(p99) + ) + + total_time = (timestamp_end - timestamp_start)/ NS_IN_SEC + if self.args.scrape_server_metrics: + server_metrics = fetch_metrics_from_gmp(backend, total_time) + + self.steps.append(BenchmarkingStepReport( + request_rate = request_rate, + timestamp_start = timestamp_start, + timestamp_end = timestamp_end, + num_prompts_attempted = num_prompts_attempted, + latencies = latencies, + errors = errors, + local_metrics = [ + metric_sumamry_from_points( + name="per_token_latency", + description="seconds/token (includes waiting time on server)", + points=[latency / (prompt_len + output_len) for prompt_len, output_len, latency in latencies]), + metric_sumamry_from_points( + name="latency", + description="milliseconds/request (includes waiting time on server)" , + points=[1000 * latency for _, _, latency in latencies]), + metric_sumamry_from_points( + short_name="tpot", + name="per_output_token_latency", + description="milliseconds/output_token (includes waiting time on server)", + points=[1000 * latency / output_len for _, output_len, latency in latencies]), + metric_sumamry_from_points( + name="input_length", + description="input length", + points=[float(prompt_len) for prompt_len, _, _ in latencies]), + metric_sumamry_from_points( + name="output_length", + description="output length", + points=[float(output_len) for _, output_len, _ in latencies]), + MetricSummary( + name = "throughput", + description = "throughput", + mean = (len(latencies) / ((timestamp_end - timestamp_start) / NS_IN_SEC)), + ), + ], + server_metrics = server_metrics + )) + + # Each element in the output list is a report for each step + def to_text_reports(self, write_to_files: bool = False) -> List[str]: + output : Dict[str, str] = {} + required_stats = 
["latency", "throughput", "input_length", "output_length", "per_output_token_latency"] + for step in self.steps: + if not all(required_stat in [metric['name'] for metric in step['local_metrics']] for required_stat in required_stats): + raise Exception(f"All of the following stats must be recorded: {required_stats}") + + for step in self.steps: + step_output : List[str] = [] + total_time = (step['timestamp_end'] - step['timestamp_start']) / NS_IN_SEC + total_output_tokens = np.sum([output_len for _, output_len, _ in step['latencies']]) + output_tokens_per_second = total_output_tokens / total_time + output_tokens_per_min = 60 * output_tokens_per_second + + total_input_tokens = np.sum([prompt_len for prompt_len, _, _ in step['latencies']]) + input_tokens_per_min = 60 * total_input_tokens / total_time + + total_tokens = total_input_tokens + total_output_tokens + tokens_per_min = 60 * total_tokens / total_time + step_output.append(f"====Result for Model: {self.config['model']}====") + step_output.append(f"Errors: {step['errors']}") + step_output.append(f"Total time: {total_time:.2f} s") + step_output.append(f"Successful/total requests: {len(step['latencies'])}/{step['num_prompts_attempted']}") + step_output.append(f"Requests/min: {60 * step['num_prompts_attempted'] / total_time:.2f}") + step_output.append(f"Output_tokens/min: {output_tokens_per_min:.2f}") + step_output.append(f"Input_tokens/min: {input_tokens_per_min:.2f}") + step_output.append(f"Tokens/min: {tokens_per_min:.2f}") + + if self.args.machine_cost: + step_output.append( + f"Cost $/1k tokens: {self.args.machine_cost * 1000 / (60 * output_tokens_per_min)}" + ) + for metric in step['local_metrics']: + step_output.append(f"Average {metric['description']}:" f" {metric['mean']:.2f}") + output_filename = f"latency-profile-{datetime.fromtimestamp(step['timestamp_start'] / NS_IN_SEC).strftime('%Y-%m-%d_%H-%M-%S')}.txt" + output[output_filename] = '\n'.join(step_output) + if write_to_files: + with open(output_filename, 'w') as file: + file.write(output[output_filename]) + return list(output.values()) + + # The output is a a single json summary of all steps + def to_json_report(self, write_to_file: bool = False) -> Dict: + output = { + "config": { + "num_models": len(self.args.models) if self.args.save_aggregated_result else 1, + "start_time": { + "seconds" : self.steps[0]["timestamp_start"] // NS_IN_SEC, + "nanos" : self.steps[0]["timestamp_start"] % NS_IN_SEC, + }, + **self.config, + }, + "summary_stats": { + "stats": [ + { + "request_rate": step["request_rate"], + **{metric["short_name"]: metric for metric in step["local_metrics"] if "short_name" in metric}, + "model_server_metrics": [ + {"name": server_metric["name"], **server_metric} + for server_metric in step["server_metrics"] + ] if step["server_metrics"] is not None else [] + } + for step in self.steps + ] + }, + + # Legacy use case, use config if possible + "dimensions": { + "date": self.args.start_datetime.strftime('%Y%m%d-%H%M%S'), + "backend": self.args.backend, + "model_id": self.config['model'], + "tokenizer_id": self.args.tokenizer, + } if len(self.steps) == 1 else None, + # Legacy use case, use summary_stats if possible + "metrics" : { + # Traffic + "num_prompts_attempted": 0, + "num_prompts_succeeded": 0, + "request_rate": self.steps[0]['request_rate'], + } if len(self.steps) == 1 else None, + } + + if write_to_file: + model_without_slash = self.config['model'].replace("/","-") + file_name = ( + 
f"{self.args.file_prefix}-{self.args.backend}-{self.args.start_datetime.strftime('%Y%m%d-%H%M%S')}-{model_without_slash}.json" + ) + with open(file_name, "w", encoding="utf-8") as outfile: + json.dump(output, outfile) + return output + def init_errors_map() -> Dict[str, int]: errors = { "ClientConnectorError": 0, @@ -599,112 +695,6 @@ def getBackend(backend: str) -> Backend: else: raise ValueError("Unsupported backend") - -def fetch_metrics_from_gmp(metrics: List[str], duration: float, backend: str) -> List[MetricSummary]: - """Gets summaries for metrics queried from GMP, queries vary per model server""" - - # Creates a credentials object from the default service account file - # Assumes that script has appropriate default credentials set up, ref: - # https://googleapis.dev/python/google-auth/latest/user-guide.html#application-default-credentials - credentials, project_id = google.auth.default() - # Prepare an authentication request - helps format the request auth token - auth_req = google.auth.transport.requests.Request() - - # Request refresh tokens - credentials.refresh(auth_req) - url='https://monitoring.googleapis.com/v1/projects/%s/location/global/prometheus/api/v1/metadata' % (project_id) - headers_api = {'Authorization': 'Bearer ' + credentials.token} - request_post = requests.get(url=url, headers=headers_api) - all_metrics_metadata = request_post.json() - if request_post.ok is not True: - print("HTTP Error: %s" % (all_metrics_metadata)) - return [] - if all_metrics_metadata["status"] != "success": - print("Metadata error response: %s" % all_metrics_metadata["error"]) - return [] - - metrics_list : List[MetricSummary] = [] - for metric in metrics: - print("Metric Name: %s" % (metric)) - - # Find metric type - metric_type = all_metrics_metadata['data'][metric] - if all_metrics_metadata['data'][metric] is None: - print("No metric found for: %s" % metric) - return [] - metric_type = metric_type[0]['type'] - - metric_results = {} - # Queries scrape all metrics collected from the last $DURATION seconds from the backend's related - # podmonitoring spec assumed to be named "$BACKEND-podmonitoring" - queries = { - "gauge": { - "Mean": "avg_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), - "Median": "quantile_over_time(0.5, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), - "Sd": "stddev_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), - "Min": "min_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), - "Max": "max_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), - "P90": "quantile_over_time(0.9, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), - "P99": "quantile_over_time(0.99, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), - }, - "histogram": { - "Mean": "sum(rate(%s_sum{job='%s-podmonitoring'}[%.0fs])) / sum(rate(%s_count{job='%s-podmonitoring'}[%.0fs]))" % (metric, backend, duration, metric, backend, duration), - "Median": "histogram_quantile(0.5, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration), - "Min": "histogram_quantile(0, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration), - "Max": "histogram_quantile(1, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration), - "P90": "histogram_quantile(0.9, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration), - "P99": 
"histogram_quantile(0.99, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration), - } - } - - metric_data : MetricSummary = { - "name": metric, - "description": f"Metrics for {metric} from {backend} backend", - } - for query_name, query in queries[metric_type].items(): - - # Configure respective query - url = f'https://monitoring.googleapis.com/v1/projects/{project_id}/location/global/prometheus/api/v1/query' - headers_api = {'Authorization': f'Bearer {credentials.token}'} - params = {'query': query} - - request_post = requests.get(url=url, headers=headers_api, params=params) - response = request_post.json() - - # handle response - if request_post.ok: - if response["status"] == "success": - metric_results[query_name] = float(response["data"]["result"][0]["value"][1]) - print("%s: %s" % (query_name, response["data"]["result"][0]["value"][1])) - else: - print("Cloud Monitoring PromQL Error: %s" % (response["error"])) - else: - print("HTTP Error: %s" % (response)) - - # Handle response - if request_post.ok and response["status"] == "success": - result_value = float(response["data"]["result"][0]["value"][1]) - if query_name == "Mean": - metric_data["mean"] = result_value - elif query_name == "Median": - metric_data["median"] = result_value - elif query_name == "Sd": - metric_data["sd"] = result_value - elif query_name == "Min": - metric_data["min"] = result_value - elif query_name == "Max": - metric_data["max"] = result_value - elif query_name == "P90": - metric_data["p90"] = result_value - elif query_name == "P99": - metric_data["p99"] = result_value - else: - error_message = response.get("error", "HTTP Error") - print(f"Error fetching {query_name} for {metric}: {error_message}") - - metrics_list.append(metric_data) - return metrics_list - async def generate_next_request( input_requests: List[Tuple[str, int, int]], request_rate_expr: str, @@ -729,159 +719,6 @@ async def generate_next_request( # The next request will be sent after the interval. await asyncio.sleep(interval) -async def send_request( - backend: str, - api_url: str, - prompt: str, - prompt_len: int, - output_len: int, - best_of: int, - use_beam_search: bool, - top_k: int, - tokenizer: PreTrainedTokenizerBase, - sax_model: str, - model: str, -) -> Tuple[Optional[Tuple[int, int, float]], Optional[Dict[str, int]]]: - """Sends request to server.""" - request_start_time = time.time() - errors = init_errors_map() - - headers = {"User-Agent": "Benchmark Client"} - if backend == "vllm": - pload = { - "model": model, - "prompt": prompt, - "n": 1, - "best_of": best_of, - "use_beam_search": use_beam_search, - "temperature": 0.0 if use_beam_search else 1.0, - "top_p": 1.0, - "max_tokens": output_len, - "ignore_eos": False, - "stream": False, - } - elif backend == "tgi": - params = { - "best_of": best_of, - "max_new_tokens": output_len, - "do_sample": True, - } - pload = { - "inputs": prompt, - "parameters": params, - } - elif backend == "naive_transformers": - # If max_length or top_k is not specified _MAX_LENGTH_DEFAULT = 200 and - # _TOP_K_DEFAULT = 10 in peft/handler.py will be used. 
- pload = { - "instances": [{ - "prompt": prompt, - "max_length": output_len, - "top_k": top_k, - }] - } - elif backend == "tensorrt_llm_triton": - pload = { - "text_input": prompt, - "max_tokens": output_len, - "beam_width": 1 if not use_beam_search else best_of, - "temperature": 0.0 if use_beam_search else 1.0, - "top_p": 1.0, - "bad_words": "", - "stop_words": "", - "stream": False, - } - elif backend == "sax": - pload = { - "model": sax_model, - "prompt": prompt, - "n": 1, - "best_of": best_of, - "use_beam_search": use_beam_search, - "temperature": 0.0 if use_beam_search else 1.0, - "top_p": 1.0, - "top_k": 50, - "max_tokens": output_len, - "stream": False, - } - elif backend == "jetstream": - pload = { - "prompt": prompt, - "max_tokens": output_len, - } - else: - raise ValueError(f"Unknown backend: {backend}") - - # Set client timeout to be 3 hrs. - timeout = aiohttp.ClientTimeout(total=CLIENT_TIMEOUT_SEC) - async with aiohttp.ClientSession(timeout=timeout,trust_env=True) as session: - while True: - try: - async with session.post(api_url, headers=headers, json=pload, ssl=False) as response: - output = await response.json() - - # Re-send the request if it failed. - if "error" not in output: - break - except aiohttp.client_exceptions.ClientConnectorError as client_err: - errors["ClientConnectorError"] += 1 - print(f"ClientConnectorError: {client_err}") - return None, errors - except asyncio.TimeoutError as timeout_err: - errors["TimeoutError"] += 1 - print(f"TimeoutError: {timeout_err}") - return None, errors - except aiohttp.client_exceptions.ClientOSError as e: - errors["ClientOSError"] += 1 - print(f"ClientOSError: {e}") - return None, errors - except aiohttp.client_exceptions.ContentTypeError as e: - print(f"ContentTypeError: {e}, response: {response}") - errors["ContentTypeError"] += 1 - return None, errors - except aiohttp.client_exceptions.ServerDisconnectedError as e: - errors["ServerDisconnectedError"] += 1 - print(f"ServerDisconnectedError: {e}") - return None, errors - except Exception as e: - print(f"Unknown error {e}") - errors["unknown_error"] += 1 - return None, errors - - request_end_time = time.time() - # Naive HF transformers generation and TensorRT-LLM generation stops at EOS - # tokens and the generation may be shorter than the ground-truth output - # sequence length. 
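Because generation can stop at EOS before reaching the requested length, the deleted branches below (and the new get_response_length implementations) re-measure the output with the tokenizer before any per-token math. A minimal sketch for the vLLM-style response shape, using only fields and metrics already present in the patch:

    # Re-measure the realized output length, then derive time per output token.
    output_token_ids = tokenizer(output["choices"][0]["text"]).input_ids
    output_len = len(output_token_ids)
    latency = request_end_time - request_start_time
    tpot_metric.observe(latency / output_len)    # seconds per generated token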
- if backend == "naive_transformers": - complete_pred = output["predictions"][0][0]["generated_text"] - new_text_start_index = complete_pred.find(NEW_TEXT_KEY) + len(NEW_TEXT_KEY) - pred = complete_pred[new_text_start_index:] - output_token_ids = tokenizer(pred).input_ids - output_len = len(output_token_ids) - prompt_len - elif backend == "tensorrt_llm_triton": - output_token_ids = tokenizer(output["text_output"]).input_ids - output_len = len(output_token_ids) - elif backend == "sax": - output_token_ids = tokenizer(output["choices"][0]["text"]).input_ids - output_len = len(output_token_ids) - elif backend == "tgi": - output_token_ids = tokenizer(output["generated_text"]).input_ids - output_len = len(output_token_ids) - elif backend == "vllm": - output_token_ids = tokenizer(output["choices"][0]["text"]).input_ids - output_len = len(output_token_ids) - elif backend == "jetstream": - output_token_ids = tokenizer(output["response"]).input_ids - output_len = len(output_token_ids) - - # (prompt len, output len, latency, success) - request_latency = (prompt_len, output_len, (request_end_time - request_start_time)) - tpot_metric.observe((request_end_time - request_start_time) / output_len) - prompt_length_metric.observe(prompt_len) - response_length_metric.observe(output_len) - - return request_latency, None - def get_filtered_dataset( dataset_path: str, max_input_len: int, @@ -1013,7 +850,7 @@ async def benchmark( if errors: for err, count in errors.items(): all_errors[err] = all_errors[err] + count - benchmark_results.record_metrics_for_step(step['rate'], step_start_timestamp, step_end_timestamp, prompts_sent_this_step, all_latencies, all_errors) + benchmark_results.record_metrics_for_step(step['rate'], step_start_timestamp, step_end_timestamp, prompts_sent_this_step, all_latencies, all_errors, backend) print(f"Completed all steps, generating reports...") return benchmark_results @@ -1059,16 +896,10 @@ async def main(args: argparse.Namespace): print(f"Models to benchmark: {models}") random.seed(args.seed) np.random.seed(args.seed) - endpoint = ( - "v1/completions" - if args.backend == "vllm" - else args.endpoint - ) print(f"Starting Prometheus Server on port {PROMETHEUS_PORT}") start_http_server(PROMETHEUS_PORT) - api_url = f"http://{args.host}:{args.port}/{endpoint}" tokenizer = AutoTokenizer.from_pretrained( args.tokenizer, trust_remote_code=args.trust_remote_code ) From 8517fd9a0a8a94673c3d8897a7cebf82c3f3cfde Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 12 Nov 2024 23:20:17 +0000 Subject: [PATCH 17/27] changes to json report --- .../container/benchmark_serving.py | 45 +++++++++++-------- 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index c503889e1..ce6c44ecd 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -359,7 +359,7 @@ class BenchmarkConfig(TypedDict): start_time: float class MetricSummary(TypedDict, total=False): - short_name: Optional[str] + json_field_name: Optional[str] name: str description: str mean: float @@ -512,7 +512,7 @@ def fetch_metrics_from_gmp(backend: Backend, duration: float) -> List[MetricSumm metrics_list.append(metric_data) return metrics_list - def metric_sumamry_from_points(name: str, description: str, points : List[float], short_name: Optional[str] = None) -> 
MetricSummary: + def metric_sumamry_from_points(name: str, description: str, points : List[float], json_field_name: Optional[str] = None) -> MetricSummary: mean = np.mean(points) if points else 0 median = np.median(points) if points else 0 sd = np.std(points) if points else 0 @@ -522,7 +522,7 @@ def metric_sumamry_from_points(name: str, description: str, points : List[float] p99 = np.percentile(points, 99) if points else 0 return MetricSummary( - short_name = short_name if short_name is not None else name, + json_field_name = json_field_name if json_field_name is not None else name, name = name, description = description, mean = float(mean), @@ -551,21 +551,22 @@ def metric_sumamry_from_points(name: str, description: str, points : List[float] description="seconds/token (includes waiting time on server)", points=[latency / (prompt_len + output_len) for prompt_len, output_len, latency in latencies]), metric_sumamry_from_points( - name="latency", + json_field_name="request_latency", + name="latency", description="milliseconds/request (includes waiting time on server)" , points=[1000 * latency for _, _, latency in latencies]), metric_sumamry_from_points( - short_name="tpot", + json_field_name="tpot", name="per_output_token_latency", description="milliseconds/output_token (includes waiting time on server)", points=[1000 * latency / output_len for _, output_len, latency in latencies]), metric_sumamry_from_points( name="input_length", - description="input length", + description="length of prompt", points=[float(prompt_len) for prompt_len, _, _ in latencies]), metric_sumamry_from_points( name="output_length", - description="output length", + description="length of response", points=[float(output_len) for _, output_len, _ in latencies]), MetricSummary( name = "throughput", @@ -622,18 +623,18 @@ def to_text_reports(self, write_to_files: bool = False) -> List[str]: def to_json_report(self, write_to_file: bool = False) -> Dict: output = { "config": { + **self.config, "num_models": len(self.args.models) if self.args.save_aggregated_result else 1, "start_time": { "seconds" : self.steps[0]["timestamp_start"] // NS_IN_SEC, "nanos" : self.steps[0]["timestamp_start"] % NS_IN_SEC, }, - **self.config, }, "summary_stats": { "stats": [ { "request_rate": step["request_rate"], - **{metric["short_name"]: metric for metric in step["local_metrics"] if "short_name" in metric}, + **{metric["json_field_name"]: metric for metric in step["local_metrics"] if "json_field_name" in metric}, "model_server_metrics": [ {"name": server_metric["name"], **server_metric} for server_metric in step["server_metrics"] @@ -649,14 +650,22 @@ def to_json_report(self, write_to_file: bool = False) -> Dict: "backend": self.args.backend, "model_id": self.config['model'], "tokenizer_id": self.args.tokenizer, - } if len(self.steps) == 1 else None, + } if len(self.args.models.split(',')) == 1 else None, # Legacy use case, use summary_stats if possible - "metrics" : { - # Traffic - "num_prompts_attempted": 0, - "num_prompts_succeeded": 0, - "request_rate": self.steps[0]['request_rate'], - } if len(self.steps) == 1 else None, + "metrics": { + # Traffic metrics + "num_prompts_attempted": self.steps[0]['num_prompts_attempted'], + "num_prompts_succeeded": self.steps[0]['latencies'], + "request_rate": self.steps[0]['request_rate'], + + **{ + f"{stat}_{metric['name']}": value + for metric in self.steps[0]["local_metrics"] + if "json_field_name" in metric + for stat, value in metric.items() + if stat not in ["name", "description", "json_field_name"] and 
value is not None + } + } if len(self.steps) == 1 else None } if write_to_file: @@ -679,7 +688,7 @@ def init_errors_map() -> Dict[str, int]: } return errors -def getBackend(backend: str) -> Backend: +def get_backend(backend: str) -> Backend: if backend == "vllm": return vLLMBackend() elif backend == "tgi": @@ -905,7 +914,7 @@ async def main(args: argparse.Namespace): ) args.start_datetime = datetime.fromtimestamp(time.time_ns() / NS_IN_SEC) - backend: Backend = getBackend(args.backend) + backend: Backend = get_backend(args.backend) reports : List[BenchmarkingReport] = await asyncio.gather( *[benchmark(args, backend, tokenizer, model) for model in models] ) From 2cb77b72c27fad8451b5da7156c0e10c63becfe5 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 12 Nov 2024 23:30:58 +0000 Subject: [PATCH 18/27] nit --- .../tools/profile-generator/container/benchmark_serving.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index ce6c44ecd..6e796dae8 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -570,7 +570,7 @@ def metric_sumamry_from_points(name: str, description: str, points : List[float] points=[float(output_len) for _, output_len, _ in latencies]), MetricSummary( name = "throughput", - description = "throughput", + description = "throughput in requests per second", mean = (len(latencies) / ((timestamp_end - timestamp_start) / NS_IN_SEC)), ), ], @@ -655,7 +655,7 @@ def to_json_report(self, write_to_file: bool = False) -> Dict: "metrics": { # Traffic metrics "num_prompts_attempted": self.steps[0]['num_prompts_attempted'], - "num_prompts_succeeded": self.steps[0]['latencies'], + "num_prompts_succeeded": len(self.steps[0]['latencies']), "request_rate": self.steps[0]['request_rate'], **{ @@ -663,7 +663,6 @@ def to_json_report(self, write_to_file: bool = False) -> Dict: for metric in self.steps[0]["local_metrics"] if "json_field_name" in metric for stat, value in metric.items() - if stat not in ["name", "description", "json_field_name"] and value is not None } } if len(self.steps) == 1 else None } From 2aa34bf9bc9ae92f2c900beae6e172abdedbe629 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 12 Nov 2024 23:33:37 +0000 Subject: [PATCH 19/27] revert --- .../tools/profile-generator/container/benchmark_serving.py | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index 6e796dae8..80daeb2b4 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -663,6 +663,7 @@ def to_json_report(self, write_to_file: bool = False) -> Dict: for metric in self.steps[0]["local_metrics"] if "json_field_name" in metric for stat, value in metric.items() + if stat not in ["name", "description", "json_field_name"] and value is not None } } if len(self.steps) == 1 else None } From b02e1090fade5540282439d108cdf966272a6ddb Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 12 Nov 2024 23:46:32 +0000 Subject: [PATCH 20/27] missing server_metrics in metrics --- .../container/benchmark_serving.py | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 
12 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index 80daeb2b4..b843e7001 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -360,15 +360,15 @@ class BenchmarkConfig(TypedDict): class MetricSummary(TypedDict, total=False): json_field_name: Optional[str] - name: str - description: str - mean: float - median: Optional[float] - sd: Optional[float] - min: Optional[float] - max: Optional[float] - p90: Optional[float] - p99: Optional[float] + name: str + description: str + mean: float + median: Optional[float] + sd: Optional[float] + min: Optional[float] + max: Optional[float] + p90: Optional[float] + p99: Optional[float] class BenchmarkingStepReport(TypedDict): """Result for one step""" @@ -621,6 +621,7 @@ def to_text_reports(self, write_to_files: bool = False) -> List[str]: # The output is a a single json summary of all steps def to_json_report(self, write_to_file: bool = False) -> Dict: + print(self.steps[0]["local_metrics"]) output = { "config": { **self.config, @@ -634,7 +635,7 @@ def to_json_report(self, write_to_file: bool = False) -> Dict: "stats": [ { "request_rate": step["request_rate"], - **{metric["json_field_name"]: metric for metric in step["local_metrics"] if "json_field_name" in metric}, + **{(metric["json_field_name"] if "json_field_name" in metric else metric["name"]): metric for metric in step["local_metrics"]}, "model_server_metrics": [ {"name": server_metric["name"], **server_metric} for server_metric in step["server_metrics"] @@ -657,14 +658,17 @@ def to_json_report(self, write_to_file: bool = False) -> Dict: "num_prompts_attempted": self.steps[0]['num_prompts_attempted'], "num_prompts_succeeded": len(self.steps[0]['latencies']), "request_rate": self.steps[0]['request_rate'], - **{ f"{stat}_{metric['name']}": value for metric in self.steps[0]["local_metrics"] if "json_field_name" in metric for stat, value in metric.items() if stat not in ["name", "description", "json_field_name"] and value is not None - } + }, + "server_metrics": [ + {"name": server_metric["name"], **server_metric} + for server_metric in step["server_metrics"] + ] if self.steps[0]["server_metrics"] is not None else [] } if len(self.steps) == 1 else None } From 4f7af86bd230ed322db8f0cdfe18885c8b6e7b68 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 12 Nov 2024 23:47:55 +0000 Subject: [PATCH 21/27] nit --- .../tools/profile-generator/container/benchmark_serving.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index b843e7001..261ddb102 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -667,7 +667,7 @@ def to_json_report(self, write_to_file: bool = False) -> Dict: }, "server_metrics": [ {"name": server_metric["name"], **server_metric} - for server_metric in step["server_metrics"] + for server_metric in self.steps[0]["server_metrics"] ] if self.steps[0]["server_metrics"] is not None else [] } if len(self.steps) == 1 else None } From 1838f44d7f388b6adbb4cf2524c8dcdc2da7ce77 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 12 Nov 2024 23:50:26 +0000 
Subject: [PATCH 22/27] remove prints --- .../tools/profile-generator/container/benchmark_serving.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index 261ddb102..4a4e9bdf7 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -621,7 +621,6 @@ def to_text_reports(self, write_to_files: bool = False) -> List[str]: # The output is a a single json summary of all steps def to_json_report(self, write_to_file: bool = False) -> Dict: - print(self.steps[0]["local_metrics"]) output = { "config": { **self.config, @@ -1056,7 +1055,6 @@ def parse_request_rates(input_str): if os.path.isfile(input_str): with open(input_str, 'r') as file: input_str = file.read() - print(input_str) try: # Parse the input string as JSON request_data = json.loads(input_str) From 3ba738b59bbafa25b4d07038aa49e4d464941be7 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 12 Nov 2024 23:52:44 +0000 Subject: [PATCH 23/27] tweak fields --- .../tools/profile-generator/container/benchmark_serving.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index 4a4e9bdf7..161c303ad 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -654,8 +654,7 @@ def to_json_report(self, write_to_file: bool = False) -> Dict: # Legacy use case, use summary_stats if possible "metrics": { # Traffic metrics - "num_prompts_attempted": self.steps[0]['num_prompts_attempted'], - "num_prompts_succeeded": len(self.steps[0]['latencies']), + "num_prompts": self.steps[0]['num_prompts_attempted'], "request_rate": self.steps[0]['request_rate'], **{ f"{stat}_{metric['name']}": value From 98785c0bd739c74fc0181a9592b664a24d4cdc17 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Wed, 13 Nov 2024 17:55:11 +0000 Subject: [PATCH 24/27] correct json output --- .../container/benchmark_serving.py | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index 161c303ad..c5ec11718 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -634,7 +634,13 @@ def to_json_report(self, write_to_file: bool = False) -> Dict: "stats": [ { "request_rate": step["request_rate"], - **{(metric["json_field_name"] if "json_field_name" in metric else metric["name"]): metric for metric in step["local_metrics"]}, + **{(metric["json_field_name"] if "json_field_name" in metric else metric["name"]): { + stat: value + for stat, value in metric.items() + if stat not in ["name", "description", "json_field_name"] and value is not None + } + for metric in step["local_metrics"] + }, "model_server_metrics": [ {"name": server_metric["name"], **server_metric} for server_metric in step["server_metrics"] @@ -656,17 +662,20 @@ def to_json_report(self, write_to_file: bool = False) -> Dict: # Traffic metrics "num_prompts": 
self.steps[0]['num_prompts_attempted'], "request_rate": self.steps[0]['request_rate'], + "benchmark_time": (self.steps[0]['timestamp_end'] - self.steps[0]['timestamp_start']) / NS_IN_SEC, + "throughput_rps": (len(self.steps[0]['latencies']) / ((self.steps[0]['timestamp_end'] - self.steps[0]['timestamp_start']) / NS_IN_SEC)), + "throughput": np.sum([output_len for _, output_len, _ in self.steps[0]['latencies']]) / ((self.steps[0]['timestamp_end'] - self.steps[0]['timestamp_start']) / NS_IN_SEC), **{ - f"{stat}_{metric['name']}": value + f"{'avg' if stat == 'mean' else stat}_{metric['name']}": value for metric in self.steps[0]["local_metrics"] if "json_field_name" in metric for stat, value in metric.items() if stat not in ["name", "description", "json_field_name"] and value is not None }, - "server_metrics": [ - {"name": server_metric["name"], **server_metric} - for server_metric in self.steps[0]["server_metrics"] - ] if self.steps[0]["server_metrics"] is not None else [] + "server_metrics": { + server_metric["name"]: {k.capitalize(): v for k, v in server_metric.items() if k != "name"} + for server_metric in self.steps[0]["server_metrics"] + } if self.steps[0]["server_metrics"] is not None else {} } if len(self.steps) == 1 else None } From be0a89e91abea67f7cd0e98fc846da89689f6120 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Wed, 13 Nov 2024 19:01:14 +0000 Subject: [PATCH 25/27] to_dict --- .../container/benchmark_serving.py | 116 ++++++++++-------- 1 file changed, 62 insertions(+), 54 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index c5ec11718..e0989b097 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -37,7 +37,55 @@ prompt_length_metric = Histogram("LatencyProfileGenerator:prompt_length", "Input prompt length", buckets=[2**i for i in range(1, 16)]) response_length_metric = Histogram("LatencyProfileGenerator:response_length", "Response length", buckets=[2**i for i in range(1, 16)]) tpot_metric = Histogram('LatencyProfileGenerator:time_per_output_token', 'Time per output token per request') + +class ErrorsReport(): + ClientConnectorErrors: int + TimeoutErrors: int + ContentTypeErrors: int + ClientOSErrors: int + ServerDisconnectedErrors: int + unknown_errors: int + + def __init__(self): + self.ClientConnectorErrors = 0 + self.TimeoutErrors = 0 + self.ContentTypeErrors = 0 + self.ClientOSErrors = 0 + self.ServerDisconnectedErrors = 0 + self.unknown_errors = 0 + + def to_dict(self) -> dict: + return {k: v for k, v in self.__dict__.items() if isinstance(v, int)} + + def record_error(self, error: Exception): + if isinstance(error, aiohttp.client_exceptions.ClientConnectorError): + self.ClientConnectorErrors += 1 + print(f"ClientConnectorError: {error}") + elif isinstance(error, asyncio.TimeoutError): + self.TimeoutErrors += 1 + print(f"TimeoutError: {error}") + elif isinstance(error, aiohttp.client_exceptions.ContentTypeError): + self.ContentTypeErrors += 1 + print(f"ContentTypeError: {error}") + elif isinstance(error, aiohttp.client_exceptions.ClientOSError): + self.ClientOSErrors += 1 + print(f"ClientOSError: {error}") + elif isinstance(error, aiohttp.client_exceptions.ServerDisconnectedError): + self.ServerDisconnectedErrors += 1 + print(f"ServerDisconnectedError: {error}") + else: + self.unknown_errors += 1 + print(f"Unknown 
error: {error}") + + def append_report(self, report: "ErrorsReport"): + self.ClientConnectorErrors += report.ClientConnectorErrors + self.TimeoutErrors += report.TimeoutErrors + self.ContentTypeErrors += report.ContentTypeErrors + self.ClientOSErrors += report.ClientOSErrors + self.ServerDisconnectedErrors += report.ServerDisconnectedErrors + self.unknown_errors += report.unknown_errors + class Backend(ABC): """ An abstract base class for Backend that defines the interface @@ -56,10 +104,10 @@ async def send_request( tokenizer: PreTrainedTokenizerBase, sax_model: str, model: str, - ) -> Tuple[Optional[Tuple[int, int, float]], Optional[Dict[str, int]]]: + ) -> Tuple[Optional[Tuple[int, int, float]], Optional[ErrorsReport]]: """Sends request to server.""" request_start_time = time.time() - errors = init_errors_map() + errors = ErrorsReport() headers = {"User-Agent": "Benchmark Client"} pload = self.create_request_payload( @@ -85,29 +133,8 @@ async def send_request( # Re-send the request if it failed. if "error" not in output: break - except aiohttp.client_exceptions.ClientConnectorError as client_err: - errors["ClientConnectorError"] += 1 - print(f"ClientConnectorError: {client_err}") - return None, errors - except asyncio.TimeoutError as timeout_err: - errors["TimeoutError"] += 1 - print(f"TimeoutError: {timeout_err}") - return None, errors - except aiohttp.client_exceptions.ClientOSError as e: - errors["ClientOSError"] += 1 - print(f"ClientOSError: {e}") - return None, errors - except aiohttp.client_exceptions.ContentTypeError as e: - print(f"ContentTypeError: {e}, response: {response}") - errors["ContentTypeError"] += 1 - return None, errors - except aiohttp.client_exceptions.ServerDisconnectedError as e: - errors["ServerDisconnectedError"] += 1 - print(f"ServerDisconnectedError: {e}") - return None, errors - except Exception as e: - print(f"Unknown error {e}") - errors["unknown_error"] += 1 + except Exception as e: + errors.record_error(e) return None, errors request_end_time = time.time() # Naive HF transformers generation and TensorRT-LLM generation stops at EOS @@ -379,7 +406,7 @@ class BenchmarkingStepReport(TypedDict): latencies: List local_metrics: List[MetricSummary] server_metrics: Optional[List[MetricSummary]] - errors: Dict[str, int] + errors: ErrorsReport class BenchmarkingReport(): """Results for all steps for a single model""" @@ -403,7 +430,7 @@ def record_metrics_for_step( timestamp_end: float, num_prompts_attempted : int, latencies: List, - errors: Dict[str, int], + errors: ErrorsReport, backend: Backend, ): @@ -432,7 +459,6 @@ def fetch_metrics_from_gmp(backend: Backend, duration: float) -> List[MetricSumm metrics_list : List[MetricSummary] = [] for metric in backend.get_server_metrics(): - print("Metric Name: %s" % (metric)) # Find metric type metric_type = all_metrics_metadata['data'][metric] @@ -482,7 +508,6 @@ def fetch_metrics_from_gmp(backend: Backend, duration: float) -> List[MetricSumm if request_post.ok: if response["status"] == "success": metric_results[query_name] = float(response["data"]["result"][0]["value"][1]) - print("%s: %s" % (query_name, response["data"]["result"][0]["value"][1])) else: print("Cloud Monitoring PromQL Error: %s" % (response["error"])) else: @@ -577,8 +602,9 @@ def metric_sumamry_from_points(name: str, description: str, points : List[float] server_metrics = server_metrics )) - # Each element in the output list is a report for each step def to_text_reports(self, write_to_files: bool = False) -> List[str]: + """Each element in the output 
list is a report for each step""" + output : Dict[str, str] = {} required_stats = ["latency", "throughput", "input_length", "output_length", "per_output_token_latency"] for step in self.steps: @@ -598,7 +624,7 @@ def to_text_reports(self, write_to_files: bool = False) -> List[str]: total_tokens = total_input_tokens + total_output_tokens tokens_per_min = 60 * total_tokens / total_time step_output.append(f"====Result for Model: {self.config['model']}====") - step_output.append(f"Errors: {step['errors']}") + step_output.append(f"Errors: {step['errors'].to_dict()}") step_output.append(f"Total time: {total_time:.2f} s") step_output.append(f"Successful/total requests: {len(step['latencies'])}/{step['num_prompts_attempted']}") step_output.append(f"Requests/min: {60 * step['num_prompts_attempted'] / total_time:.2f}") @@ -688,17 +714,6 @@ def to_json_report(self, write_to_file: bool = False) -> Dict: json.dump(output, outfile) return output -def init_errors_map() -> Dict[str, int]: - errors = { - "ClientConnectorError": 0, - "TimeoutError": 0, - "ContentTypeError": 0, - "ClientOSError": 0, - "ServerDisconnectedError": 0, - "unknown_error": 0, - } - return errors - def get_backend(backend: str) -> Backend: if backend == "vllm": return vLLMBackend() @@ -822,7 +837,7 @@ async def benchmark( for index, step in enumerate(all_steps["steps"]): # No need to sleep before running the first step - if 'time_between_steps' in args.job and index != 0: + if args.job is not None and 'time_between_steps' in args.job and index != 0: print(f"Sleeping for {args.job['time_between_steps']} sec...") await asyncio.sleep(args.job["time_between_steps"]) max_prompts = f" {step['max_num_prompts']} requests" if 'max_num_prompts' in step else "" @@ -863,13 +878,13 @@ async def benchmark( print(f"Finished benchmarking step {index + 1}") all_latencies = [] - all_errors = init_errors_map() + all_errors = ErrorsReport() for latency, errors in results: if latency: all_latencies.append(latency) if errors: for err, count in errors.items(): - all_errors[err] = all_errors[err] + count + all_errors.record_error(err) benchmark_results.record_metrics_for_step(step['rate'], step_start_timestamp, step_end_timestamp, prompts_sent_this_step, all_latencies, all_errors, backend) print(f"Completed all steps, generating reports...") @@ -886,16 +901,9 @@ def aggregate_benchmark_reports(reports: List[BenchmarkingReport]) -> Benchmarki "num_prompts_attempted": 0, "latencies": [], "server_metrics": [], - "errors": {}, + "errors": ErrorsReport(), } - def accumulate_errors(errors_list: List[Dict[str, int]]) -> Dict[str, int]: - accumulated_errors = init_errors_map() - for errors in errors_list: - for error_type, count in errors.items(): - accumulated_errors[error_type] += count - return accumulated_errors - for report in reports: # Input metavalidation asserts this report only has one step report report = report.steps[0] @@ -903,7 +911,7 @@ def accumulate_errors(errors_list: List[Dict[str, int]]) -> Dict[str, int]: aggregated_step_report["timestamp_end"] = max(aggregated_step_report["timestamp_end"], report["timestamp_end"]) aggregated_step_report["num_prompts_attempted"] += report["num_prompts_attempted"] aggregated_step_report["latencies"].extend(report["latencies"]) - aggregated_step_report["errors"] = accumulate_errors([aggregated_step_report["errors"], report["errors"]]) + aggregated_step_report["errors"] = aggregated_step_report["errors"].append_report(report["errors"]) aggregated_report = BenchmarkingReport(reports[0].args, 
f"ALL-{len(reports)}-MODELS", aggregated_step_report["timestamp_start"]) aggregated_report.record_metrics_for_step(**aggregated_step_report) From 6d9b0611cc7a5b9e83c3782f2a7f5f28092fb246 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Wed, 13 Nov 2024 22:48:57 +0000 Subject: [PATCH 26/27] streaming changes --- .../container/benchmark_serving.py | 236 ++++++++---------- 1 file changed, 104 insertions(+), 132 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index afa56834c..10fcbf1c1 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -38,6 +38,7 @@ prompt_length_metric = Histogram("LatencyProfileGenerator:prompt_length", "Input prompt length", buckets=[2**i for i in range(1, 16)]) response_length_metric = Histogram("LatencyProfileGenerator:response_length", "Response length", buckets=[2**i for i in range(1, 16)]) tpot_metric = Histogram('LatencyProfileGenerator:time_per_output_token', 'Time per output token per request') +ttft_metric = Histogram('LatencyProfileGenerator:time_to_first_token', 'Time to first token per request') active_requests_metric = Gauge('LatencyProfileGenerator:active_requests', 'How many requests actively being processed') # Add trace config for monitoring in flight requests @@ -108,90 +109,6 @@ class Backend(ABC): An abstract base class for Backend that defines the interface for new model server backends. """ - - async def send_stream_request( - self, - backend: str, - api_url: str, - prompt: str, - prompt_len: int, - output_len: int, - best_of: int, - use_beam_search: bool, - top_k: int, - tokenizer: PreTrainedTokenizerBase, - sax_model: str, - model: str, - ) -> Tuple[Optional[Tuple[int, int, float]], Optional[float], Optional[ErrorsReport]]: - """Sends stream request to server""" - request_start_time = time.time() - errors = init_errors_map() - - headers = {"User-Agent": "Benchmark Client"} - if backend == "vllm": - pload = { - "model": model, - "prompt": prompt, - "n": 1, - "best_of": best_of, - "use_beam_search": use_beam_search, - "temperature": 0.0 if use_beam_search else 1.0, - "top_p": 1.0, - "max_tokens": output_len, - "ignore_eos": True, - "stream": True, - } - else: - raise ValueError(f"Unknown backend: {backend}") - - ttft = 0.0 - st = time.perf_counter() - output = "" - timeout = aiohttp.ClientTimeout(total=CLIENT_TIMEOUT_SEC) - async with aiohttp.ClientSession(timeout=timeout,trust_env=True) as session: - try: - async with session.post(api_url, headers=headers, json=pload, ssl=False) as response: - async for chunk_bytes in response.content.iter_chunks(): - chunk_bytes = chunk_bytes[0].strip() - if not chunk_bytes: - continue - timestamp = time.perf_counter() - # First token - if ttft == 0.0: - ttft = timestamp - st - - if chunk_bytes.decode("utf-8")[6:] != "[DONE]": - if backend == "vllm": - output += json.loads(chunk_bytes.decode("utf-8")[6:])["choices"][0]["text"] - except aiohttp.client_exceptions.ClientConnectorError as client_err: - errors["ClientConnectorError"] += 1 - print(f"ClientConnectorError: {client_err}") - return None, None, errors - except asyncio.TimeoutError as timeout_err: - errors["TimeoutError"] += 1 - print(f"TimeoutError: {timeout_err}") - return None, None, errors - except aiohttp.client_exceptions.ClientOSError as e: - errors["ClientOSError"] += 1 - print(f"ClientOSError: {e}") 
- return None, None, errors - except aiohttp.client_exceptions.ContentTypeError as e: - print(f"ContentTypeError: {e}, response: {response}") - errors["ContentTypeError"] += 1 - return None, None, errors - except aiohttp.client_exceptions.ServerDisconnectedError as e: - errors["ServerDisconnectedError"] += 1 - print(f"ServerDisconnectedError: {e}") - return None, None, errors - except Exception as e: - print(f"Unknown error {e}") - errors["unknown_error"] += 1 - return None, None, errors - request_end_time = time.time() - output_token_ids = tokenizer(output).input_ids - output_len = len(output_token_ids) - request_latency = (prompt_len, output_len, (request_end_time - request_start_time)) - return request_latency, ttft, None async def send_request( self, @@ -205,7 +122,8 @@ async def send_request( tokenizer: PreTrainedTokenizerBase, sax_model: str, model: str, - ) -> Tuple[Optional[Tuple[int, int, float]], Optional[float], Optional[ErrorsReport]] + streaming: bool, + ) -> Tuple[Optional[Tuple[int, int, float]], Optional[float], Optional[ErrorsReport]]: """Sends request to server.""" request_start_time = time.time() errors = ErrorsReport() @@ -220,22 +138,25 @@ async def send_request( tokenizer=tokenizer, sax_model=sax_model, model=model, + streaming=streaming ) # Set client timeout to be 3 hrs. timeout = aiohttp.ClientTimeout(total=CLIENT_TIMEOUT_SEC) + start_time = time.perf_counter() + output = "" + ttft = 0.0 async with aiohttp.ClientSession(timeout=timeout,trust_env=True) as session: while True: try: async with session.post(f"{api_url}/{self.get_endpoint()}", headers=headers, json=pload, ssl=False) as response: - output = await response.json() - + output, ttft = await self.results_from_response(response, streaming, start_time) # Re-send the request if it failed. 
if "error" not in output: break except Exception as e: errors.record_error(e) - return None, errors + return None, None, errors request_end_time = time.time() # Naive HF transformers generation and TensorRT-LLM generation stops at EOS # tokens and the generation may be shorter than the ground-truth output @@ -251,8 +172,10 @@ async def send_request( tpot_metric.observe((request_end_time - request_start_time) / output_len) prompt_length_metric.observe(prompt_len) response_length_metric.observe(output_len) + if ttft is not None: + ttft_metric.observe(ttft) - return request_latency, None + return request_latency, ttft, None @abstractmethod def create_request_payload(self, @@ -264,7 +187,8 @@ def create_request_payload(self, top_k: int, tokenizer: PreTrainedTokenizerBase, sax_model: str, - model: str) -> Dict: + model: str, + streaming: bool) -> Dict: pass @abstractmethod @@ -283,6 +207,12 @@ def get_server_metrics(self) -> List[str]: def get_endpoint(self) -> str: pass + async def results_from_response(self, response: aiohttp.ClientResponse, streaming: bool, start_time: float) -> Tuple[Dict, Optional[float]]: + if streaming: + raise Exception("This backend does not support parsing streaming responses") + else: + return await response.json() + class vLLMBackend(Backend): def get_server_metrics(self) -> List[str]: return ["vllm:gpu_cache_usage_perc", "vllm:num_requests_waiting"] @@ -297,7 +227,8 @@ def create_request_payload(self, top_k: int, tokenizer: PreTrainedTokenizerBase, sax_model: str, - model: str): + model: str, + streaming: bool): return { "model": model, "prompt": prompt, @@ -308,7 +239,7 @@ def create_request_payload(self, "top_p": 1.0, "max_tokens": output_len, "ignore_eos": False, - "stream": False, + "stream": streaming, } def get_response_length( self, @@ -317,6 +248,34 @@ def get_response_length( tokenizer: PreTrainedTokenizerBase): output_token_ids = tokenizer(response["choices"][0]["text"]).input_ids return len(output_token_ids) + async def results_from_response(self, response: aiohttp.ClientResponse, streaming: bool, start_time: float) -> Tuple[Dict, Optional[float]]: + ttft = 0.0 + + # Make a streaming response look like a non streaming response for detokenizing later + output = { + 'choices': [{ + 'text' : "" + }] + } + if streaming: + async for chunk_bytes in response.content.iter_chunks(): + chunk_bytes = chunk_bytes[0].strip() + if not chunk_bytes: + continue + + timestamp = time.perf_counter() + + # Calculate Time-to-First-Token (TTFT) + if ttft == 0.0: + ttft = timestamp - start_time + + # Process the chunk if it's not the "[DONE]" message + if chunk_bytes.decode("utf-8")[6:] != "[DONE]": + output["choices"][0]["text"] += json.loads(chunk_bytes.decode("utf-8")[6:])["choices"][0]["text"] + return output, ttft + else: + res = await response.json() + return res, None class JetstreamBackend(Backend): def get_server_metrics(self) -> List[str]: @@ -335,7 +294,8 @@ def create_request_payload(self, top_k: int, tokenizer: PreTrainedTokenizerBase, sax_model: str, - model: str): + model: str, + streaming: bool): return { "prompt": prompt, "max_tokens": output_len, @@ -362,7 +322,8 @@ def create_request_payload(self, top_k: int, tokenizer: PreTrainedTokenizerBase, sax_model: str, - model: str): + model: str, + streaming: bool): return { "inputs": prompt, "parameters": { @@ -393,7 +354,8 @@ def create_request_payload(self, top_k: int, tokenizer: PreTrainedTokenizerBase, sax_model: str, - model: str): + model: str, + streaming: bool): return { "instances": [{ "prompt": prompt, @@ 
-426,7 +388,8 @@ def create_request_payload(self, top_k: int, tokenizer: PreTrainedTokenizerBase, sax_model: str, - model: str): + model: str, + streaming: bool): return { "text_input": prompt, "max_tokens": output_len, @@ -435,7 +398,7 @@ def create_request_payload(self, "top_p": 1.0, "bad_words": "", "stop_words": "", - "stream": False, + "stream": streaming, } def get_response_length( self, @@ -459,7 +422,8 @@ def create_request_payload(self, top_k: int, tokenizer: PreTrainedTokenizerBase, sax_model: str, - model: str): + model: str, + streaming: bool): return { "model": sax_model, "prompt": prompt, @@ -470,7 +434,7 @@ def create_request_payload(self, "top_p": 1.0, "top_k": 50, "max_tokens": output_len, - "stream": False, + "stream": streaming, } def get_response_length( self, @@ -531,6 +495,7 @@ def record_metrics_for_step( timestamp_end: float, num_prompts_attempted : int, latencies: List, + ttfts: List[float], errors: ErrorsReport, backend: Backend, ): @@ -664,15 +629,7 @@ def metric_sumamry_from_points(name: str, description: str, points : List[float] if self.args.scrape_server_metrics: server_metrics = fetch_metrics_from_gmp(backend, total_time) - self.steps.append(BenchmarkingStepReport( - request_rate = request_rate, - timestamp_start = timestamp_start, - timestamp_end = timestamp_end, - num_prompts_attempted = num_prompts_attempted, - latencies = latencies, - ttfts = ttfts, - errors = errors, - local_metrics = [ + local_metrics = [ metric_sumamry_from_points( name="per_token_latency", description="seconds/token (includes waiting time on server)", @@ -687,11 +644,7 @@ def metric_sumamry_from_points(name: str, description: str, points : List[float] name="per_output_token_latency", description="milliseconds/output_token (includes waiting time on server)", points=[1000 * latency / output_len for _, output_len, latency in latencies]), - metric_sumamry_from_points( - json_field_name="ttft", - name="time_to_first_token", - description="time to first token in seconds (includes waiting time on server)", - points=[1000 * latency / output_len for _, output_len, latency in latencies]), + metric_sumamry_from_points( name="input_length", description="length of prompt", @@ -705,7 +658,24 @@ def metric_sumamry_from_points(name: str, description: str, points : List[float] description = "throughput in requests per second", mean = (len(latencies) / ((timestamp_end - timestamp_start) / NS_IN_SEC)), ), - ], + ] + if self.args.stream_request: + local_metrics.append(metric_sumamry_from_points( + json_field_name="ttft", + name="time_to_first_token", + description="Time to First Token (s)", + points=ttfts) + ) + + self.steps.append(BenchmarkingStepReport( + request_rate = request_rate, + timestamp_start = timestamp_start, + timestamp_end = timestamp_end, + num_prompts_attempted = num_prompts_attempted, + latencies = latencies, + ttfts = ttfts, + errors = errors, + local_metrics=local_metrics, server_metrics = server_metrics )) @@ -823,8 +793,8 @@ def to_json_report(self, write_to_file: bool = False) -> Dict: with open(file_name, "w", encoding="utf-8") as outfile: json.dump(output, outfile) if gcs_bucket is not None: - gcs_bucket.blob(f"{args.output_bucket_filepath}/{file_name}").upload_from_filename(file_name) - print(f"File {file_name} uploaded to gs://{args.output_bucket}/{args.output_bucket_filepath}") + gcs_bucket.blob(f"{self.args.output_bucket_filepath}/{file_name}").upload_from_filename(file_name) + print(f"File {file_name} uploaded to 
gs://{self.args.output_bucket}/{self.args.output_bucket_filepath}") return output @@ -947,7 +917,7 @@ async def benchmark( "max_num_prompts": args.num_prompts, }] } - benchmark_results = BenchmarkingReport(args, model, time.time_ns()) + benchmark_results = BenchmarkingReport(args, model, time.time_ns()) for index, step in enumerate(all_steps["steps"]): # No need to sleep before running the first step if args.job is not None and 'time_between_steps' in args.job and index != 0: @@ -969,18 +939,19 @@ async def benchmark( prompt, prompt_len, output_len = request task = asyncio.create_task( - backend.send_request( - f"http://{args.host}:{args.port}", - prompt, - prompt_len, - output_len, - args.best_of, - args.use_beam_search, - args.top_k, - tokenizer, - args.sax_model, - model, - ) + backend.send_request( + f"http://{args.host}:{args.port}", + prompt, + prompt_len, + output_len, + args.best_of, + args.use_beam_search, + args.top_k, + tokenizer, + args.sax_model, + model, + args.stream_request, + ) ) tasks.append(task) prompts_sent_this_step += 1 @@ -993,12 +964,11 @@ async def benchmark( all_latencies = [] all_ttfts = [] all_errors = ErrorsReport() - for latency, errors in results: + for latency, ttft, errors in results: if latency: all_latencies.append(latency) if errors: - for err, count in errors.items(): - all_errors.record_error(err) + all_errors.append_report(errors) if ttft: all_ttfts.append(ttft) benchmark_results.record_metrics_for_step(step['rate'], step_start_timestamp, step_end_timestamp, prompts_sent_this_step, all_latencies, all_ttfts, all_errors, backend) @@ -1016,6 +986,7 @@ def aggregate_benchmark_reports(reports: List[BenchmarkingReport]) -> Benchmarki "timestamp_end": 0.0, "num_prompts_attempted": 0, "latencies": [], + "ttfts": [], "server_metrics": [], "errors": ErrorsReport(), } @@ -1027,6 +998,7 @@ def aggregate_benchmark_reports(reports: List[BenchmarkingReport]) -> Benchmarki aggregated_step_report["timestamp_end"] = max(aggregated_step_report["timestamp_end"], report["timestamp_end"]) aggregated_step_report["num_prompts_attempted"] += report["num_prompts_attempted"] aggregated_step_report["latencies"].extend(report["latencies"]) + aggregated_step_report["ttfts"].extend(report["ttfts"]) aggregated_step_report["errors"] = aggregated_step_report["errors"].append_report(report["errors"]) aggregated_report = BenchmarkingReport(reports[0].args, f"ALL-{len(reports)}-MODELS", aggregated_step_report["timestamp_start"]) From a9f620ffb135535a69832249e5357217ae0c2af4 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Wed, 13 Nov 2024 22:57:27 +0000 Subject: [PATCH 27/27] step -> stage --- .../container/benchmark_serving.py | 198 +++++++++--------- 1 file changed, 99 insertions(+), 99 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index 10fcbf1c1..3bad2c9f9 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -461,8 +461,8 @@ class MetricSummary(TypedDict, total=False): p90: Optional[float] p99: Optional[float] -class BenchmarkingStepReport(TypedDict): - """Result for one step""" +class BenchmarkingStageReport(TypedDict): + """Result for one stage""" request_rate: float timestamp_start: float timestamp_end: float @@ -474,10 +474,10 @@ class BenchmarkingStepReport(TypedDict): errors: ErrorsReport class BenchmarkingReport(): - 
"""Results for all steps for a single model""" + """Results for all stages for a single model""" args: argparse.Namespace config: BenchmarkConfig - steps: List[BenchmarkingStepReport] + stages: List[BenchmarkingStageReport] def __init__(self, args : argparse.Namespace, model: str, start_time: float): self.args = args @@ -486,9 +486,9 @@ def __init__(self, args : argparse.Namespace, model: str, start_time: float): model_server = args.backend, start_time = start_time ) - self.steps = [] + self.stages = [] - def record_metrics_for_step( + def record_metrics_for_stage( self, request_rate: float, timestamp_start: float, @@ -667,7 +667,7 @@ def metric_sumamry_from_points(name: str, description: str, points : List[float] points=ttfts) ) - self.steps.append(BenchmarkingStepReport( + self.stages.append(BenchmarkingStageReport( request_rate = request_rate, timestamp_start = timestamp_start, timestamp_end = timestamp_end, @@ -680,43 +680,43 @@ def metric_sumamry_from_points(name: str, description: str, points : List[float] )) def to_text_reports(self, write_to_files: bool = False) -> List[str]: - """Each element in the output list is a report for each step""" + """Each element in the output list is a report for each stage""" output : Dict[str, str] = {} required_stats = ["latency", "throughput", "input_length", "output_length", "per_output_token_latency"] - for step in self.steps: - if not all(required_stat in [metric['name'] for metric in step['local_metrics']] for required_stat in required_stats): + for stage in self.stages: + if not all(required_stat in [metric['name'] for metric in stage['local_metrics']] for required_stat in required_stats): raise Exception(f"All of the following stats must be recorded: {required_stats}") - for step in self.steps: - step_output : List[str] = [] - total_time = (step['timestamp_end'] - step['timestamp_start']) / NS_IN_SEC - total_output_tokens = np.sum([output_len for _, output_len, _ in step['latencies']]) + for stage in self.stages: + stage_output : List[str] = [] + total_time = (stage['timestamp_end'] - stage['timestamp_start']) / NS_IN_SEC + total_output_tokens = np.sum([output_len for _, output_len, _ in stage['latencies']]) output_tokens_per_second = total_output_tokens / total_time output_tokens_per_min = 60 * output_tokens_per_second - total_input_tokens = np.sum([prompt_len for prompt_len, _, _ in step['latencies']]) + total_input_tokens = np.sum([prompt_len for prompt_len, _, _ in stage['latencies']]) input_tokens_per_min = 60 * total_input_tokens / total_time total_tokens = total_input_tokens + total_output_tokens tokens_per_min = 60 * total_tokens / total_time - step_output.append(f"====Result for Model: {self.config['model']}====") - step_output.append(f"Errors: {step['errors'].to_dict()}") - step_output.append(f"Total time: {total_time:.2f} s") - step_output.append(f"Successful/total requests: {len(step['latencies'])}/{step['num_prompts_attempted']}") - step_output.append(f"Requests/min: {60 * step['num_prompts_attempted'] / total_time:.2f}") - step_output.append(f"Output_tokens/min: {output_tokens_per_min:.2f}") - step_output.append(f"Input_tokens/min: {input_tokens_per_min:.2f}") - step_output.append(f"Tokens/min: {tokens_per_min:.2f}") + stage_output.append(f"====Result for Model: {self.config['model']}====") + stage_output.append(f"Errors: {stage['errors'].to_dict()}") + stage_output.append(f"Total time: {total_time:.2f} s") + stage_output.append(f"Successful/total requests: {len(stage['latencies'])}/{stage['num_prompts_attempted']}") + 
stage_output.append(f"Requests/min: {60 * stage['num_prompts_attempted'] / total_time:.2f}") + stage_output.append(f"Output_tokens/min: {output_tokens_per_min:.2f}") + stage_output.append(f"Input_tokens/min: {input_tokens_per_min:.2f}") + stage_output.append(f"Tokens/min: {tokens_per_min:.2f}") if self.args.machine_cost: - step_output.append( + stage_output.append( f"Cost $/1k tokens: {self.args.machine_cost * 1000 / (60 * output_tokens_per_min)}" ) - for metric in step['local_metrics']: - step_output.append(f"Average {metric['description']}:" f" {metric['mean']:.2f}") - output_filename = f"latency-profile-{datetime.fromtimestamp(step['timestamp_start'] / NS_IN_SEC).strftime('%Y-%m-%d_%H-%M-%S')}.txt" - output[output_filename] = '\n'.join(step_output) + for metric in stage['local_metrics']: + stage_output.append(f"Average {metric['description']}:" f" {metric['mean']:.2f}") + output_filename = f"latency-profile-{datetime.fromtimestamp(stage['timestamp_start'] / NS_IN_SEC).strftime('%Y-%m-%d_%H-%M-%S')}.txt" + output[output_filename] = '\n'.join(stage_output) if write_to_files: with open(output_filename, 'w') as file: file.write(output[output_filename]) @@ -725,34 +725,34 @@ def to_text_reports(self, write_to_files: bool = False) -> List[str]: print(f"File {output_filename} uploaded to gs://{args.output_bucket}/{args.output_bucket_filepath}") return list(output.values()) - # The output is a a single json summary of all steps + # The output is a a single json summary of all stages def to_json_report(self, write_to_file: bool = False) -> Dict: output = { "config": { **self.config, "num_models": len(self.args.models) if self.args.save_aggregated_result else 1, "start_time": { - "seconds" : self.steps[0]["timestamp_start"] // NS_IN_SEC, - "nanos" : self.steps[0]["timestamp_start"] % NS_IN_SEC, + "seconds" : self.stages[0]["timestamp_start"] // NS_IN_SEC, + "nanos" : self.stages[0]["timestamp_start"] % NS_IN_SEC, }, }, "summary_stats": { "stats": [ { - "request_rate": step["request_rate"], + "request_rate": stage["request_rate"], **{(metric["json_field_name"] if "json_field_name" in metric else metric["name"]): { stat: value for stat, value in metric.items() if stat not in ["name", "description", "json_field_name"] and value is not None } - for metric in step["local_metrics"] + for metric in stage["local_metrics"] }, "model_server_metrics": [ {"name": server_metric["name"], **server_metric} - for server_metric in step["server_metrics"] - ] if step["server_metrics"] is not None else [] + for server_metric in stage["server_metrics"] + ] if stage["server_metrics"] is not None else [] } - for step in self.steps + for stage in self.stages ] }, @@ -766,23 +766,23 @@ def to_json_report(self, write_to_file: bool = False) -> Dict: # Legacy use case, use summary_stats if possible "metrics": { # Traffic metrics - "num_prompts": self.steps[0]['num_prompts_attempted'], - "request_rate": self.steps[0]['request_rate'], - "benchmark_time": (self.steps[0]['timestamp_end'] - self.steps[0]['timestamp_start']) / NS_IN_SEC, - "throughput_rps": (len(self.steps[0]['latencies']) / ((self.steps[0]['timestamp_end'] - self.steps[0]['timestamp_start']) / NS_IN_SEC)), - "throughput": np.sum([output_len for _, output_len, _ in self.steps[0]['latencies']]) / ((self.steps[0]['timestamp_end'] - self.steps[0]['timestamp_start']) / NS_IN_SEC), + "num_prompts": self.stages[0]['num_prompts_attempted'], + "request_rate": self.stages[0]['request_rate'], + "benchmark_time": (self.stages[0]['timestamp_end'] - 
self.stages[0]['timestamp_start']) / NS_IN_SEC, + "throughput_rps": (len(self.stages[0]['latencies']) / ((self.stages[0]['timestamp_end'] - self.stages[0]['timestamp_start']) / NS_IN_SEC)), + "throughput": np.sum([output_len for _, output_len, _ in self.stages[0]['latencies']]) / ((self.stages[0]['timestamp_end'] - self.stages[0]['timestamp_start']) / NS_IN_SEC), **{ f"{'avg' if stat == 'mean' else stat}_{metric['name']}": value - for metric in self.steps[0]["local_metrics"] + for metric in self.stages[0]["local_metrics"] if "json_field_name" in metric for stat, value in metric.items() if stat not in ["name", "description", "json_field_name"] and value is not None }, "server_metrics": { server_metric["name"]: {k.capitalize(): v for k, v in server_metric.items() if k != "name"} - for server_metric in self.steps[0]["server_metrics"] - } if self.steps[0]["server_metrics"] is not None else {} - } if len(self.steps) == 1 else None + for server_metric in self.stages[0]["server_metrics"] + } if self.stages[0]["server_metrics"] is not None else {} + } if len(self.stages) == 1 else None } if write_to_file: @@ -907,34 +907,34 @@ async def benchmark( args.use_dummy_text, ) - all_steps = {} + all_stages = {} if args.job is not None: - all_steps = args.job + all_stages = args.job elif args.num_prompts is not None: - all_steps = { - "steps": [{ + all_stages = { + "stages": [{ "rate": args.request_rate, "max_num_prompts": args.num_prompts, }] } benchmark_results = BenchmarkingReport(args, model, time.time_ns()) - for index, step in enumerate(all_steps["steps"]): - # No need to sleep before running the first step - if args.job is not None and 'time_between_steps' in args.job and index != 0: - print(f"Sleeping for {args.job['time_between_steps']} sec...") - await asyncio.sleep(args.job["time_between_steps"]) - max_prompts = f" {step['max_num_prompts']} requests" if 'max_num_prompts' in step else "" - duration = f" {step['time']} sec" if 'time' in step else " " - print(f"Starting benchmarking{max_prompts} at {step['rate']} requests/sec for{duration}") + for index, stage in enumerate(all_stages["stages"]): + # No need to sleep before running the first stage + if args.job is not None and 'time_between_stages' in args.job and index != 0: + print(f"Sleeping for {args.job['time_between_stages']} sec...") + await asyncio.sleep(args.job["time_between_stages"]) + max_prompts = f" {stage['max_num_prompts']} requests" if 'max_num_prompts' in stage else "" + duration = f" {stage['time']} sec" if 'time' in stage else " " + print(f"Starting benchmarking{max_prompts} at {stage['rate']} requests/sec for{duration}") tasks: List[asyncio.Task] = [] - prompts_sent_this_step: int = 0 - step_start_timestamp = time.time_ns() - async for request in generate_next_request(input_requests, str(step["rate"]), step_start_timestamp): + prompts_sent_this_stage: int = 0 + stage_start_timestamp = time.time_ns() + async for request in generate_next_request(input_requests, str(stage["rate"]), stage_start_timestamp): # Stop conditions - if "max_num_prompts" in step and prompts_sent_this_step >= step["max_num_prompts"]: + if "max_num_prompts" in stage and prompts_sent_this_stage >= stage["max_num_prompts"]: break - if "time" in step and ((time.time_ns() - step_start_timestamp ) / NS_IN_SEC) > step["time"]: + if "time" in stage and ((time.time_ns() - stage_start_timestamp ) / NS_IN_SEC) > stage["time"]: break prompt, prompt_len, output_len = request @@ -954,12 +954,12 @@ async def benchmark( ) ) tasks.append(task) - prompts_sent_this_step += 1 
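# Editor's note: an illustrative sketch (not the benchmark's own code) of how a multi-stage
# job description is consumed after the step -> stage rename: the driver sleeps
# `time_between_stages` between stages and ends each stage once `max_num_prompts` requests
# have been sent or `time` seconds have elapsed, whichever comes first. `run_stages`,
# `send_one_request`, and `example_job` are illustrative names; the job structure matches
# what parse_request_rates validates.
import asyncio
import time

NS_IN_SEC = 1_000_000_000

example_job = {
    "time_between_stages": 1.0,
    "stages": [
        {"rate": 2.0, "max_num_prompts": 100},
        {"rate": "1+0.5*t", "time": 30.0},
    ],
}


async def run_stages(job, send_one_request):
    for index, stage in enumerate(job["stages"]):
        if index != 0:
            # No need to sleep before the first stage.
            await asyncio.sleep(job["time_between_stages"])
        sent = 0
        stage_start = time.time_ns()
        while True:
            # Stop conditions: request budget exhausted or stage duration elapsed.
            if "max_num_prompts" in stage and sent >= stage["max_num_prompts"]:
                break
            if "time" in stage and (time.time_ns() - stage_start) / NS_IN_SEC > stage["time"]:
                break
            await send_one_request(stage["rate"])  # rate-expression handling elided in this sketch
            sent += 1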
+ prompts_sent_this_stage += 1 print("All requests sent, awaiting responses...") results = await asyncio.gather(*tasks) - step_end_timestamp = time.time_ns() - print(f"Finished benchmarking step {index + 1}") + stage_end_timestamp = time.time_ns() + print(f"Finished benchmarking stage {index + 1}") all_latencies = [] all_ttfts = [] @@ -971,17 +971,17 @@ async def benchmark( all_errors.append_report(errors) if ttft: all_ttfts.append(ttft) - benchmark_results.record_metrics_for_step(step['rate'], step_start_timestamp, step_end_timestamp, prompts_sent_this_step, all_latencies, all_ttfts, all_errors, backend) + benchmark_results.record_metrics_for_stage(stage['rate'], stage_start_timestamp, stage_end_timestamp, prompts_sent_this_stage, all_latencies, all_ttfts, all_errors, backend) - print(f"Completed all steps, generating reports...") + print(f"Completed all stages, generating reports...") return benchmark_results def aggregate_benchmark_reports(reports: List[BenchmarkingReport]) -> BenchmarkingReport: """When benchmarking multiple models we will generate a BenchmarkingReport for each.""" """If `save_aggregated_result` is set, we aggregate these into a single report.""" - aggregated_step_report = { - "request_rate": reports[0].steps[0]["request_rate"], + aggregated_stage_report = { + "request_rate": reports[0].stages[0]["request_rate"], "timestamp_start": 0.0, "timestamp_end": 0.0, "num_prompts_attempted": 0, @@ -992,17 +992,17 @@ def aggregate_benchmark_reports(reports: List[BenchmarkingReport]) -> Benchmarki } for report in reports: - # Input metavalidation asserts this report only has one step report - report = report.steps[0] - aggregated_step_report["timestamp_start"] = min(aggregated_step_report["timestamp_start"], report["timestamp_start"]) - aggregated_step_report["timestamp_end"] = max(aggregated_step_report["timestamp_end"], report["timestamp_end"]) - aggregated_step_report["num_prompts_attempted"] += report["num_prompts_attempted"] - aggregated_step_report["latencies"].extend(report["latencies"]) - aggregated_step_report["ttfts"].extend(report["ttfts"]) - aggregated_step_report["errors"] = aggregated_step_report["errors"].append_report(report["errors"]) - - aggregated_report = BenchmarkingReport(reports[0].args, f"ALL-{len(reports)}-MODELS", aggregated_step_report["timestamp_start"]) - aggregated_report.record_metrics_for_step(**aggregated_step_report) + # Input metavalidation asserts this report only has one stage report + report = report.stages[0] + aggregated_stage_report["timestamp_start"] = min(aggregated_stage_report["timestamp_start"], report["timestamp_start"]) + aggregated_stage_report["timestamp_end"] = max(aggregated_stage_report["timestamp_end"], report["timestamp_end"]) + aggregated_stage_report["num_prompts_attempted"] += report["num_prompts_attempted"] + aggregated_stage_report["latencies"].extend(report["latencies"]) + aggregated_stage_report["ttfts"].extend(report["ttfts"]) + aggregated_stage_report["errors"] = aggregated_stage_report["errors"].append_report(report["errors"]) + + aggregated_report = BenchmarkingReport(reports[0].args, f"ALL-{len(reports)}-MODELS", aggregated_stage_report["timestamp_start"]) + aggregated_report.record_metrics_for_stage(**aggregated_stage_report) return aggregated_report @@ -1055,7 +1055,7 @@ def input_metavalidation(args: argparse.Namespace): raise ValueError("All args must be set for one and only one of the following sets of arguments: {--request-rate, --num-prompts} or {--job}") if args.save_aggregated_result and args.benchmark is 
not None and len(args.benchmark) != 1 and args.models is not None and len(args.models) > 1: - raise ValueError("Multi model benchmarking with multi step benchmarking is not supported yet") + raise ValueError("Multi model benchmarking with multi stage benchmarking is not supported yet") if args.use_beam_search and args.backend == "tgi": raise ValueError("Beam search is not supported by TGI") @@ -1182,33 +1182,33 @@ def parse_request_rates(input_str): request_data = json.loads(input_str) # Validate that the JSON has the correct structure if not isinstance(request_data, dict): - raise argparse.ArgumentTypeError("Input JSON must be an object containing 'time_between_steps' and 'steps'.") - # Check 'time_between_steps' field - if "time_between_steps" not in request_data or (not isinstance(request_data["time_between_steps"], float) and not isinstance(request_data["time_between_steps"], int)): - raise argparse.ArgumentTypeError("'time_between_steps' must be a float or int.") - # Check 'steps' field - if "steps" not in request_data or not isinstance(request_data["steps"], list): - raise argparse.ArgumentTypeError("'steps' must be a list of objects with 'rate' and 'time'.") + raise argparse.ArgumentTypeError("Input JSON must be an object containing 'time_between_stages' and 'stages'.") + # Check 'time_between_stages' field + if "time_between_stages" not in request_data or (not isinstance(request_data["time_between_stages"], float) and not isinstance(request_data["time_between_stages"], int)): + raise argparse.ArgumentTypeError("'time_between_stages' must be a float or int.") + # Check 'stages' field + if "stages" not in request_data or not isinstance(request_data["stages"], list): + raise argparse.ArgumentTypeError("'stages' must be a list of objects with 'rate' and 'time'.") - # Validate each entry in the 'steps' list - for i, rate_entry in enumerate(request_data["steps"]): + # Validate each entry in the 'stages' list + for i, rate_entry in enumerate(request_data["stages"]): if not isinstance(rate_entry, dict): - raise argparse.ArgumentTypeError(f"Entry {i} in 'steps' must be a JSON object.") + raise argparse.ArgumentTypeError(f"Entry {i} in 'stages' must be a JSON object.") if "rate" not in rate_entry: - raise argparse.ArgumentTypeError(f"Entry {i} in 'steps' must have a 'rate' key.") + raise argparse.ArgumentTypeError(f"Entry {i} in 'stages' must have a 'rate' key.") if "time" not in rate_entry and "max_num_prompts" not in rate_entry: - raise argparse.ArgumentTypeError(f"Entry {i} in 'steps' must have a 'time' and/or 'max_num_prompts' key.") + raise argparse.ArgumentTypeError(f"Entry {i} in 'stages' must have a 'time' and/or 'max_num_prompts' key.") # Validate the 'rate' field to allow for string expressions or floats if isinstance(rate_entry["rate"], str): try: is_expression_of_t(rate_entry["rate"]) # Validate the expression except Exception as e: - raise argparse.ArgumentTypeError(f"Entry {i} in 'steps': {e}") + raise argparse.ArgumentTypeError(f"Entry {i} in 'stages': {e}") # Validate the 'time' field if not isinstance(rate_entry["time"], (float, int)): - raise argparse.ArgumentTypeError(f"Entry {i} in 'steps': 'time' must be a positive float.") + raise argparse.ArgumentTypeError(f"Entry {i} in 'stages': 'time' must be a positive float.") return request_data except json.JSONDecodeError as e: raise argparse.ArgumentTypeError("Invalid JSON format") @@ -1223,18 +1223,18 @@ def parse_request_rates(input_str): " or as a filename. 
\n" " The JSON should have the following structure:\n\n" " {\n" - " \"time_between_steps\": float (seconds to rest between rates),\n" + " \"time_between_stages\": float (seconds to rest between rates),\n" " \"rates\": [\n" " {\n" " \"rate\": float | str (as would be passed to request-rate),\n" - " \"time\": float (number of seconds for this step)\n" - " \"max_num_prompts\": int (maximum number of prompts for this step)" + " \"time\": float (number of seconds for this stage)\n" + " \"max_num_prompts\": int (maximum number of prompts for this stage)" " },\n" " ...\n" " ]\n" " }\n\n" " Example JSON:\n" - " '{\"time_between_steps\": 1.0, \"rates\": [{\"rate\": 2.0, \"time\": 0.0}, {\"rate\": \"1+0.5*t\", \"time\": 5.0}]}'\n\n" + " '{\"time_between_stages\": 1.0, \"rates\": [{\"rate\": 2.0, \"time\": 0.0}, {\"rate\": \"1+0.5*t\", \"time\": 5.0}]}'\n\n" " Each entry should have a 'rate' and/or 'num_prompts' and 'time' value." " Each rate is finished when \"num_prompts\" prompts are sent" " (if specified) and \"time\" seconds have passed (if specified),"