From 2f81a2d4db480b61607871624c5c03269167625c Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Mon, 28 Oct 2024 22:54:57 +0000 Subject: [PATCH 01/27] first commit --- .../container/benchmark_serving.py | 50 ++++++++++++++----- .../container/requirements.txt | 1 + 2 files changed, 38 insertions(+), 13 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index 37ecdb570..e2a28c579 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -20,6 +20,8 @@ import aiohttp import numpy as np +from sympy import symbols +from sympy.parsing.sympy_parser import parse_expr from transformers import AutoTokenizer from transformers import PreTrainedTokenizerBase @@ -96,18 +98,24 @@ def sample_requests( async def get_request( input_requests: List[Tuple[str, int, int]], - request_rate: float, + request_rate_expr: str, + start_time: float, ) -> AsyncGenerator[Tuple[str, int, int], None]: """Gets request async.""" - input_requests = iter(input_requests) for request in input_requests: yield request - if request_rate == float("inf"): + if request_rate_expr == float("inf"): # If the request rate is infinity, then we don't need to wait. continue + + # Evaluate the reqest rate at this point in time + t = symbols('t') + expr_parsed = parse_expr(request_rate_expr, transformations="all", local_dict={"t": t}) + request_rate_at_t = expr_parsed.subs(t, ((time.time_ns() - start_time) / 1000000000)) + # Sample the request interval from the exponential distribution. - interval = np.random.exponential(1.0 / request_rate) + interval = np.random.exponential(1.0 / request_rate_at_t) # The next request will be sent after the interval. 
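A minimal standalone sketch of the arrival-time sampling this hunk introduces, assuming only the pieces shown above (the helper name next_interval is illustrative, not part of the patch): the rate expression f(t) is evaluated at the elapsed time and used as the instantaneous rate of an exponential inter-arrival draw, which, repeated per request, approximates a non-homogeneous Poisson process.

    import time
    import numpy as np
    from sympy import symbols
    from sympy.parsing.sympy_parser import parse_expr

    def next_interval(request_rate_expr: str, start_time_ns: int) -> float:
        # Evaluate the rate expression f(t) at the elapsed time in seconds.
        t = symbols("t")
        expr = parse_expr(request_rate_expr, transformations="all", local_dict={"t": t})
        elapsed_s = (time.time_ns() - start_time_ns) / 1_000_000_000
        rate_now = float(expr.subs(t, elapsed_s))
        # Wait time before the next request: exponential with mean 1 / current rate.
        return float(np.random.exponential(1.0 / rate_now))

    # e.g. a ramp that starts at 1 request/sec and grows by 1.05 requests/sec each second:
    # interval = next_interval("1+1.05*t", time.time_ns())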
await asyncio.sleep(interval) @@ -134,7 +142,7 @@ async def send_request( tokenizer: PreTrainedTokenizerBase, sax_model: str, model: str, -) -> Tuple[Tuple[int, int, float], Dict[str, int]]: +) -> Tuple[Optional[Tuple[int, int, float]], Optional[Dict[str, int]]]: """Sends request to server.""" request_start_time = time.time() errors = init_errors_map() @@ -291,9 +299,9 @@ async def benchmark( tokenizer, args.use_dummy_text, ) - benchmark_start_time = time.time() + benchmark_start_time = time.time_ns() tasks: List[asyncio.Task] = [] - async for request in get_request(input_requests, args.request_rate): + async for request in get_request(input_requests, args.request_rate, benchmark_start_time): prompt, prompt_len, output_len = request task = asyncio.create_task( send_request( @@ -321,7 +329,7 @@ async def benchmark( for err, count in errors.items(): combined_errors[err] = combined_errors[err] + count - benchmark_duration = time.time() - benchmark_start_time + benchmark_duration = (time.time_ns() - benchmark_start_time) / 1000000000 print_and_save_result(args, benchmark_duration, len(input_requests), model, combined_latencies, combined_errors) return combined_latencies, combined_errors @@ -599,6 +607,22 @@ async def main(args: argparse.Namespace): else args.endpoint ) + # Input assertions + def is_expression_of_t(expression): + # Check if expression uses variables other than 't' + try: + # Attempt to evaluate with only 't' defined + t = symbols('t') + expr_parsed = parse_expr(expression, transformations="all", local_dict={"t": t}) + expr_parsed.subs(t, 1) + return True + except KeyError as e: + # If another variable is required, it will throw a KeyError + return False + if not is_expression_of_t(args.request_rate): + raise ValueError(f"Request rate {args.request_rate}, must be an expression of `t`") + + print(f"Starting Prometheus Server on port {PROMETHEUS_PORT}") start_http_server(PROMETHEUS_PORT) @@ -607,8 +631,8 @@ async def main(args: argparse.Namespace): args.tokenizer, trust_remote_code=args.trust_remote_code ) - benchmark_start_time = time.time() - args.start_datetime = datetime.fromtimestamp(benchmark_start_time) + benchmark_start_time = time.time_ns() + args.start_datetime = datetime.fromtimestamp(benchmark_start_time / 1000000000) results = await asyncio.gather( *[benchmark(args, api_url, tokenizer, model) for model in models] @@ -629,7 +653,7 @@ async def main(args: argparse.Namespace): for k, v in errors.items(): combined_errors[k] = combined_errors[k] + v - benchmark_duration_all_models = time.time() - benchmark_start_time + benchmark_duration_all_models = (time.time_ns() - benchmark_start_time) / 1000000000 if args.save_aggregated_result: print_and_save_result(args, benchmark_duration_all_models, len(models)*args.num_prompts, f"ALL-{len(models)}-MODELS", combined_latencies, combined_errors) @@ -713,8 +737,8 @@ async def main(args: argparse.Namespace): ) parser.add_argument( "--request-rate", - type=float, - default=float("inf"), + type=str, + default="inf", help=( "Number of requests per second. If this is inf, " "then all the requests are sent at time 0. 
" diff --git a/benchmarks/benchmark/tools/profile-generator/container/requirements.txt b/benchmarks/benchmark/tools/profile-generator/container/requirements.txt index a9f6d99a6..df46317a0 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/requirements.txt +++ b/benchmarks/benchmark/tools/profile-generator/container/requirements.txt @@ -24,6 +24,7 @@ psutil ray >= 2.9 sentencepiece # Required for LLaMA tokenizer. numpy < 2.0 +sympy <= 1.13 torch == 2.1.1 transformers >= 4.42.0 # Required for Qwen2 xformers == 0.0.23 From 837554b859e266949d1d08caa0a5b6d033ac7149 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Mon, 28 Oct 2024 22:56:09 +0000 Subject: [PATCH 02/27] nit --- .../tools/profile-generator/container/benchmark_serving.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index e2a28c579..3602bf068 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -609,9 +609,8 @@ async def main(args: argparse.Namespace): # Input assertions def is_expression_of_t(expression): - # Check if expression uses variables other than 't' + # Check if expression uses variables other than 't' by attempting to evaluate with only 't' defined try: - # Attempt to evaluate with only 't' defined t = symbols('t') expr_parsed = parse_expr(expression, transformations="all", local_dict={"t": t}) expr_parsed.subs(t, 1) From fe980bf162dedfd4dd273df5f3ddf3d94bdf53c9 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Mon, 28 Oct 2024 22:58:04 +0000 Subject: [PATCH 03/27] ns to sec constant --- .../tools/profile-generator/container/benchmark_serving.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index 3602bf068..3879dfc37 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -31,6 +31,7 @@ CLIENT_TIMEOUT_SEC = 3 * 60 * 60 NEW_TEXT_KEY = "\nOutput:\n" PROMETHEUS_PORT = 9090 +NS_IN_SEC = 1000 * 1000 * 1000 # Prometheus Metrics prompt_length_metric = Histogram("LatencyProfileGenerator:prompt_length", "Input prompt length", buckets=[2**i for i in range(1, 16)]) @@ -329,7 +330,7 @@ async def benchmark( for err, count in errors.items(): combined_errors[err] = combined_errors[err] + count - benchmark_duration = (time.time_ns() - benchmark_start_time) / 1000000000 + benchmark_duration = (time.time_ns() - benchmark_start_time) / NS_IN_SEC print_and_save_result(args, benchmark_duration, len(input_requests), model, combined_latencies, combined_errors) return combined_latencies, combined_errors @@ -631,7 +632,7 @@ def is_expression_of_t(expression): ) benchmark_start_time = time.time_ns() - args.start_datetime = datetime.fromtimestamp(benchmark_start_time / 1000000000) + args.start_datetime = datetime.fromtimestamp(benchmark_start_time / NS_IN_SEC) results = await asyncio.gather( *[benchmark(args, api_url, tokenizer, model) for model in models] @@ -652,7 +653,7 @@ def is_expression_of_t(expression): for k, v in errors.items(): combined_errors[k] = combined_errors[k] + v - benchmark_duration_all_models = (time.time_ns() - 
benchmark_start_time) / 1000000000 + benchmark_duration_all_models = (time.time_ns() - benchmark_start_time) / NS_IN_SEC if args.save_aggregated_result: print_and_save_result(args, benchmark_duration_all_models, len(models)*args.num_prompts, f"ALL-{len(models)}-MODELS", combined_latencies, combined_errors) From 28dcf340d10464b36ddb2be4ee7668ad1a62204f Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Mon, 28 Oct 2024 23:02:24 +0000 Subject: [PATCH 04/27] properly handle infinity --- .../tools/profile-generator/container/benchmark_serving.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index 3879dfc37..d312cf52b 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -619,7 +619,10 @@ def is_expression_of_t(expression): except KeyError as e: # If another variable is required, it will throw a KeyError return False - if not is_expression_of_t(args.request_rate): + + if args.request_rate == "inf": + args.request_rate = "oo" + if and not is_expression_of_t(args.request_rate): raise ValueError(f"Request rate {args.request_rate}, must be an expression of `t`") From 3715a2ace6028ea5ac8fc551f8eb88c23aedf4b8 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Mon, 28 Oct 2024 23:03:23 +0000 Subject: [PATCH 05/27] nit --- .../tools/profile-generator/container/benchmark_serving.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index d312cf52b..293addca0 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -106,7 +106,7 @@ async def get_request( for request in input_requests: yield request - if request_rate_expr == float("inf"): + if request_rate_expr == "oo": # If the request rate is infinity, then we don't need to wait. 
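The is_expression_of_t guard above can also be written with sympy's free_symbols instead of relying on a raised exception; this is a sketch of an equivalent check, not the patch's implementation (function name illustrative). It accepts expressions that are constant or depend only on t, including the "oo" string that "inf" is normalized to:

    from sympy import symbols
    from sympy.parsing.sympy_parser import parse_expr

    def depends_only_on_t(expression: str) -> bool:
        t = symbols("t")
        try:
            expr = parse_expr(expression, local_dict={"t": t})
        except Exception:
            return False  # not parseable at all
        # Reject expressions that reference any symbol other than t.
        return expr.free_symbols.issubset({t})

    assert depends_only_on_t("1+1.05*t")
    assert depends_only_on_t("oo")            # sympy infinity, what "inf" is mapped to
    assert not depends_only_on_t("2*x + t")   # x is an unknown symbol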
continue @@ -622,10 +622,9 @@ def is_expression_of_t(expression): if args.request_rate == "inf": args.request_rate = "oo" - if and not is_expression_of_t(args.request_rate): + if not is_expression_of_t(args.request_rate): raise ValueError(f"Request rate {args.request_rate}, must be an expression of `t`") - print(f"Starting Prometheus Server on port {PROMETHEUS_PORT}") start_http_server(PROMETHEUS_PORT) From dab6d67aaf10f94370f8158a324607e337f8ee9c Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 29 Oct 2024 16:15:18 +0000 Subject: [PATCH 06/27] ns in s --- .../tools/profile-generator/container/benchmark_serving.py | 4 ++-- manifest.yaml | 0 2 files changed, 2 insertions(+), 2 deletions(-) create mode 100644 manifest.yaml diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index 293addca0..e1981e8cc 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -31,7 +31,7 @@ CLIENT_TIMEOUT_SEC = 3 * 60 * 60 NEW_TEXT_KEY = "\nOutput:\n" PROMETHEUS_PORT = 9090 -NS_IN_SEC = 1000 * 1000 * 1000 +NS_IN_SEC = 1_000_000_000 # Prometheus Metrics prompt_length_metric = Histogram("LatencyProfileGenerator:prompt_length", "Input prompt length", buckets=[2**i for i in range(1, 16)]) @@ -113,7 +113,7 @@ async def get_request( # Evaluate the reqest rate at this point in time t = symbols('t') expr_parsed = parse_expr(request_rate_expr, transformations="all", local_dict={"t": t}) - request_rate_at_t = expr_parsed.subs(t, ((time.time_ns() - start_time) / 1000000000)) + request_rate_at_t = expr_parsed.subs(t, ((time.time_ns() - start_time) / NS_IN_SEC)) # Sample the request interval from the exponential distribution. 
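The timing changes in this commit follow one pattern: timestamps are captured with time.time_ns() and divided by the named constant only where a value in seconds is needed. A tiny illustration (variable names illustrative):

    import time

    NS_IN_SEC = 1_000_000_000

    step_start = time.time_ns()
    # ... issue requests ...
    elapsed_s = (time.time_ns() - step_start) / NS_IN_SEC  # duration in seconds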
interval = np.random.exponential(1.0 / request_rate_at_t) diff --git a/manifest.yaml b/manifest.yaml new file mode 100644 index 000000000..e69de29bb From 4bcd688e15585a9d52230426b04fd18fb8ed564e Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 29 Oct 2024 16:15:41 +0000 Subject: [PATCH 07/27] remove manifest.yaml --- manifest.yaml | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 manifest.yaml diff --git a/manifest.yaml b/manifest.yaml deleted file mode 100644 index e69de29bb..000000000 From 68c3283455c83b23c3d3d5d9d93eb3b0ab0550e1 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 29 Oct 2024 16:16:29 +0000 Subject: [PATCH 08/27] better comment --- .../tools/profile-generator/container/benchmark_serving.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index e1981e8cc..c77d7be97 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -617,7 +617,7 @@ def is_expression_of_t(expression): expr_parsed.subs(t, 1) return True except KeyError as e: - # If another variable is required, it will throw a KeyError + # If another variable is required, throw a KeyError return False if args.request_rate == "inf": From b631214b6f42d4e33bbc8f2d23b4b89e845a0e6f Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 29 Oct 2024 16:29:45 +0000 Subject: [PATCH 09/27] better flag message --- .../profile-generator/container/benchmark_serving.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index c77d7be97..4c2a6d764 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -742,11 +742,12 @@ def is_expression_of_t(expression): type=str, default="inf", help=( - "Number of requests per second. If this is inf, " - "then all the requests are sent at time 0. " - "Otherwise, we use Poisson process to synthesize " - "the request arrival times." - ), + "Specifies the request rate as a function of time, f(t)." + " Example format: '1+1.05*t', where 't' represents seconds." + " If set to 'inf', all requests are sent at time 0. Otherwise," + " the function is interpreted to generate a Poisson process" + " for request arrival times based on the provided rate expression." 
+ ), ) parser.add_argument("--seed", type=int, default=int(time.time())) parser.add_argument( From 0cb280855486aae3c29384d35c553547716cc718 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 29 Oct 2024 16:46:44 +0000 Subject: [PATCH 10/27] remove request rate from filename --- .../tools/profile-generator/container/benchmark_serving.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index 4c2a6d764..fbea2ea08 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -420,7 +420,7 @@ def save_json_results(args: argparse.Namespace, benchmark_result, server_metrics # Save to file model_without_slash = model.replace("/","-") file_name = ( - f"{args.file_prefix}-{args.backend}-{args.request_rate}qps-{args.start_datetime.strftime('%Y%m%d-%H%M%S')}-{model_without_slash}.json" + f"{args.file_prefix}-{args.backend}-{args.start_datetime.strftime('%Y%m%d-%H%M%S')}-{model_without_slash}.json" ) with open(file_name, "w", encoding="utf-8") as outfile: json.dump(final_json, outfile) From 4e7cfada7b004fb0ea6b4bec4d3db8e67bf15e57 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 29 Oct 2024 16:49:13 +0000 Subject: [PATCH 11/27] tweak description --- .../profile-generator/container/benchmark_serving.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index fbea2ea08..c90979c95 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -743,10 +743,11 @@ def is_expression_of_t(expression): default="inf", help=( "Specifies the request rate as a function of time, f(t)." - " Example format: '1+1.05*t', where 't' represents seconds." - " If set to 'inf', all requests are sent at time 0. Otherwise," - " the function is interpreted to generate a Poisson process" - " for request arrival times based on the provided rate expression." + " Example format: '1+1.05*t', where 't' represents seconds from" + " start. If set to 'inf', all requests are sent at time 0." + " Otherwise, the function is interpreted to generate a Poisson" + " process for request arrival times based on the provided rate" + " expression." ), ) parser.add_argument("--seed", type=int, default=int(time.time())) From d10d86023d6e4eb8330737e7610c96afaea3777c Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 29 Oct 2024 16:54:34 +0000 Subject: [PATCH 12/27] typo --- .../tools/profile-generator/container/benchmark_serving.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index c90979c95..a4d253b6c 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -110,7 +110,7 @@ async def get_request( # If the request rate is infinity, then we don't need to wait. 
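As a worked example of the rate-function format documented in the --request-rate help above, using only the documented sample expression: f(t) = 1 + 1.05*t starts at 1 request/sec, reaches 64 requests/sec at t = 60, and its integral gives the number of requests such a ramp would be expected to generate over a one-minute step.

    from sympy import integrate, symbols

    t = symbols("t")
    rate = 1 + 1.05 * t
    print(rate.subs(t, 0), rate.subs(t, 60))  # instantaneous rate at t=0 and t=60: 1 and 64 req/s
    print(integrate(rate, (t, 0, 60)))        # expected requests over a 60 s step: 1950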
continue - # Evaluate the reqest rate at this point in time + # Evaluate the request rate at this point in time t = symbols('t') expr_parsed = parse_expr(request_rate_expr, transformations="all", local_dict={"t": t}) request_rate_at_t = expr_parsed.subs(t, ((time.time_ns() - start_time) / NS_IN_SEC)) From 53744df402caafe9c7ec74dd9e39d7c75167fd4a Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Mon, 11 Nov 2024 22:44:05 +0000 Subject: [PATCH 13/27] refactoring --- .../container/benchmark_serving.py | 1201 +++++++++++------ 1 file changed, 793 insertions(+), 408 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index a4d253b6c..f7c08cc26 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -5,6 +5,7 @@ It currently supports TGI, vLLM, Triton TensorRT-LLM and Saxml. """ +from abc import ABC, abstractmethod import argparse import asyncio from datetime import datetime @@ -12,7 +13,8 @@ import random import requests import time -from typing import AsyncGenerator, List, Optional, Tuple, Dict +import os +from typing import AsyncGenerator, List, Optional, Tuple, Dict, TypedDict from prometheus_client import start_http_server, Histogram import google.auth @@ -25,8 +27,6 @@ from transformers import AutoTokenizer from transformers import PreTrainedTokenizerBase -from google.protobuf.timestamp_pb2 import Timestamp - MIN_SEQ_LEN = 4 CLIENT_TIMEOUT_SEC = 3 * 60 * 60 NEW_TEXT_KEY = "\nOutput:\n" @@ -37,73 +37,537 @@ prompt_length_metric = Histogram("LatencyProfileGenerator:prompt_length", "Input prompt length", buckets=[2**i for i in range(1, 16)]) response_length_metric = Histogram("LatencyProfileGenerator:response_length", "Response length", buckets=[2**i for i in range(1, 16)]) tpot_metric = Histogram('LatencyProfileGenerator:time_per_output_token', 'Time per output token per request') + +class BenchmarkConfig(TypedDict): + model: str + model_server: str + start_time: float -def sample_requests( - dataset_path: str, - num_requests: int, - max_input_len: int, - max_output_len: int, - tokenizer: PreTrainedTokenizerBase, - use_dummy_text: bool, -) -> List[Tuple[str, int, int]]: - """Samples requests from the dataset or creates dummy requests.""" - if use_dummy_text: - dummy_prompt_token_ids = [0] * max_input_len - dummy_prompt = tokenizer.decode(dummy_prompt_token_ids) - dummy_requests = [( - dummy_prompt, - max_input_len, - max_output_len, - )] * num_requests - return dummy_requests - - # Load the dataset. - with open(dataset_path) as f: - dataset = json.load(f) - # Filter out the conversations with less than 2 turns. - dataset = [data for data in dataset if len(data["conversations"]) >= 2] - # Only keep the first two turns of each conversation. - dataset = [ - (data["conversations"][0]["value"], data["conversations"][1]["value"]) - for data in dataset - ] - - # Tokenize the prompts and completions. - prompts = [prompt for prompt, _ in dataset] - prompt_token_ids = tokenizer(prompts).input_ids - completions = [completion for _, completion in dataset] - completion_token_ids = tokenizer(completions).input_ids - tokenized_dataset = [] - for i in range(len(dataset)): - output_len = len(completion_token_ids[i]) - tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len)) - - # Filter out too long sequences. 
- filtered_dataset: List[Tuple[str, int, int]] = [] - for prompt, prompt_token_ids, output_len in tokenized_dataset: - prompt_len = len(prompt_token_ids) - if prompt_len < MIN_SEQ_LEN or output_len < MIN_SEQ_LEN: - # Prune too short sequences. - # This is because TGI causes errors when the input or output length - # is too short. - continue - if prompt_len > max_input_len or output_len > max_output_len: - # Prune too long sequences. - continue - filtered_dataset.append((prompt, prompt_len, output_len)) +class MetricSummary(TypedDict, total=False): + short_name: Optional[str] + name: str + description: str + mean: float + median: Optional[float] + sd: Optional[float] + min: Optional[float] + max: Optional[float] + p90: Optional[float] + p99: Optional[float] + +class BenchmarkingStepReport(TypedDict): + """Result for one step""" + request_rate: float + timestamp_start: float + timestamp_end: float + num_prompts_attempted: int + latencies: List + local_metrics: List[MetricSummary] + server_metrics: Optional[List[MetricSummary]] + errors: Dict[str, int] + +class BenchmarkingReport(): + """Results for all steps for a single model""" + args: argparse.Namespace + config: BenchmarkConfig + steps: List[BenchmarkingStepReport] + + def __init__(self, args : argparse.Namespace, model: str, start_time: float): + self.args = args + self.config = BenchmarkConfig( + model = model, + model_server = args.backend, + start_time = start_time + ) + self.steps = [] + + def record_metrics_for_step( + self, + request_rate: float, + timestamp_start: float, + timestamp_end: float, + num_prompts_attempted : int, + latencies: List, + errors: Dict[str, int], + ): + def get_metrics_to_scrape(backend: str) -> List[str]: + if backend == "vllm": + return ["vllm:gpu_cache_usage_perc", "vllm:num_requests_waiting"] + elif backend == "jetstream": + return [ + "jetstream_slots_used_percentage", + "jetstream_prefill_backlog_size", + ] + else: + return [] + + def metric_sumamry_from_points(name: str, description: str, points : List[float], short_name: Optional[str] = None) -> MetricSummary: + mean = np.mean(points) if points else 0 + median = np.median(points) if points else 0 + sd = np.std(points) if points else 0 + min = np.min(points) if points else 0 + max = np.max(points) if points else 0 + p90 = np.percentile(points, 90) if points else 0 + p99 = np.percentile(points, 99) if points else 0 + + return MetricSummary( + short_name = short_name if short_name is not None else name, + name = name, + description = description, + mean = float(mean), + median = float(median), + sd = float(sd), + min = float(min), + max = float(max), + p90 = float(p90), + p99 = float(p99) + ) + + total_time = (timestamp_end - timestamp_start)/ NS_IN_SEC + if self.args.scrape_server_metrics: + server_metrics = fetch_metrics_from_gmp(get_metrics_to_scrape(self.args.backend), total_time, self.args.backend) + + self.steps.append(BenchmarkingStepReport( + request_rate = request_rate, + timestamp_start = timestamp_start, + timestamp_end = timestamp_end, + num_prompts_attempted = num_prompts_attempted, + latencies = latencies, + errors = errors, + local_metrics = [ + metric_sumamry_from_points( + name="per_token_latency", + description="seconds/token (includes waiting time on server)", + points=[latency / (prompt_len + output_len) for prompt_len, output_len, latency in latencies]), + metric_sumamry_from_points( + name="latency", + description="milliseconds/request (includes waiting time on server)" , + points=[1000 * latency for _, _, latency in latencies]), 
+ metric_sumamry_from_points( + short_name="tpot", + name="per_output_token_latency", + description="milliseconds/output_token (includes waiting time on server)", + points=[1000 * latency / output_len for _, output_len, latency in latencies]), + metric_sumamry_from_points( + name="input_length", + description="input length", + points=[float(prompt_len) for prompt_len, _, _ in latencies]), + metric_sumamry_from_points( + name="output_length", + description="output length", + points=[float(output_len) for _, output_len, _ in latencies]), + MetricSummary( + name = "throughput", + description = "throughput", + mean = (len(latencies) / ((timestamp_end - timestamp_start) / NS_IN_SEC)), + ), + ], + server_metrics = server_metrics + )) + + # Each element in the output list is a report for each step + def to_text_reports(self, write_to_files: bool = False) -> List[str]: + output : Dict[str, str] = {} + required_stats = ["latency", "throughput", "input_length", "output_length", "per_output_token_latency"] + for step in self.steps: + if not all(required_stat in [metric['name'] for metric in step['local_metrics']] for required_stat in required_stats): + raise Exception(f"All of the following stats must be recorded: {required_stats}") + + for step in self.steps: + step_output : List[str] = [] + total_time = (step['timestamp_end'] - step['timestamp_start']) / NS_IN_SEC + total_output_tokens = np.sum([output_len for _, output_len, _ in step['latencies']]) + output_tokens_per_second = total_output_tokens / total_time + output_tokens_per_min = 60 * output_tokens_per_second + + total_input_tokens = np.sum([prompt_len for prompt_len, _, _ in step['latencies']]) + input_tokens_per_min = 60 * total_input_tokens / total_time + + total_tokens = total_input_tokens + total_output_tokens + tokens_per_min = 60 * total_tokens / total_time + step_output.append(f"====Result for Model: {self.config['model']}====") + step_output.append(f"Errors: {step['errors']}") + step_output.append(f"Total time: {total_time:.2f} s") + step_output.append(f"Successful/total requests: {len(step['latencies'])}/{step['num_prompts_attempted']}") + step_output.append(f"Requests/min: {60 * step['num_prompts_attempted'] / total_time:.2f}") + step_output.append(f"Output_tokens/min: {output_tokens_per_min:.2f}") + step_output.append(f"Input_tokens/min: {input_tokens_per_min:.2f}") + step_output.append(f"Tokens/min: {tokens_per_min:.2f}") + + if self.args.machine_cost: + step_output.append( + f"Cost $/1k tokens: {self.args.machine_cost * 1000 / (60 * output_tokens_per_min)}" + ) + for metric in step['local_metrics']: + step_output.append(f"Average {metric['description']}:" f" {metric['mean']:.2f}") + output_filename = f"latency-profile-{datetime.fromtimestamp(step['timestamp_start'] / NS_IN_SEC).strftime('%Y-%m-%d_%H-%M-%S')}.txt" + output[output_filename] = '\n'.join(step_output) + if write_to_files: + with open(output_filename, 'w') as file: + file.write(output[output_filename]) + return list(output.values()) + + # The output is a a single json summary of all steps + def to_json_report(self, write_to_file: bool = False) -> Dict: + output = { + "config": { + "num_models": len(self.args.models) if self.args.save_aggregated_result else 1, + "start_time": { + "seconds" : self.steps[0]["timestamp_start"] // NS_IN_SEC, + "nanos" : self.steps[0]["timestamp_start"] % NS_IN_SEC, + }, + **self.config, + }, + "summary_stats": { + "stats": [ + { + "request_rate": step["request_rate"], + **{metric["short_name"]: metric for metric in step["local_metrics"] if 
"short_name" in metric}, + "model_server_metrics": [ + {"name": server_metric["name"], **server_metric} + for server_metric in step["server_metrics"] + ] if step["server_metrics"] is not None else [] + } + for step in self.steps + ] + }, + + # Legacy use case, use config if possible + "dimensions": { + "date": self.args.start_datetime.strftime('%Y%m%d-%H%M%S'), + "backend": self.args.backend, + "model_id": self.config['model'], + "tokenizer_id": self.args.tokenizer, + } if len(self.steps) == 1 else None, + # Legacy use case, use summary_stats if possible + "metrics" : { + # Traffic + "num_prompts_attempted": 0, + "num_prompts_succeeded": 0, + "request_rate": self.steps[0]['request_rate'], + } if len(self.steps) == 1 else None, + } + + if write_to_file: + model_without_slash = self.config['model'].replace("/","-") + file_name = ( + f"{self.args.file_prefix}-{self.args.backend}-{self.args.start_datetime.strftime('%Y%m%d-%H%M%S')}-{model_without_slash}.json" + ) + with open(file_name, "w", encoding="utf-8") as outfile: + json.dump(output, outfile) + return output + +class Backend(ABC): + """ + An abstract base class for Backend that defines the interface + for new model server backends. + """ + + def request(self): + print() - # Sample the requests. - sampled_requests = random.sample(filtered_dataset, num_requests) - return sampled_requests + @abstractmethod + def create_request_payload(self, + api_url: str, + prompt: str, + prompt_len: int, + output_len: int, + best_of: int, + use_beam_search: bool, + top_k: int, + tokenizer: PreTrainedTokenizerBase, + sax_model: str, + model: str) -> Dict: + pass + def tokens_from_response(self, response: Dict): + return "" + + @property + @abstractmethod + def server_metrics(self) -> List[str]: + pass -async def get_request( + @property + @abstractmethod + def api_url(self) -> str: + pass + +class vLLMBackend(Backend): + def server_metrics(self) -> List[str]: + return ["vllm:gpu_cache_usage_perc", "vllm:num_requests_waiting"] + def api_url(self) -> str: + return "v1/completions" + def create_request_payload(self, + prompt: str, + output_len: int, + best_of: int, + use_beam_search: bool, + model: str): + return { + "model": model, + "prompt": prompt, + "n": 1, + "best_of": best_of, + "use_beam_search": use_beam_search, + "temperature": 0.0 if use_beam_search else 1.0, + "top_p": 1.0, + "max_tokens": output_len, + "ignore_eos": False, + "stream": False, + } + def tokens_from_response(self, response : Dict): + return response["choices"][0]["text"] + +class JetstreamBackend(Backend): + def server_metrics(self) -> List[str]: + return [ + "jetstream_slots_used_percentage", + "jetstream_prefill_backlog_size", + ] + def api_url(self) -> str: + return "" + def create_request_payload(self, + prompt: str, + output_len: int): + return { + "prompt": prompt, + "max_tokens": output_len, + } + def tokens_from_response(self, response: Dict): + return response["response"] + +class TgiBackend(Backend): + def server_metrics(self) -> List[str]: + return [""] + def api_url(self) -> str: + return "" + def create_request_payload(self, + prompt: str, + output_len: int, + best_of: int): + return { + "inputs": prompt, + "parameters": { + "best_of": best_of, + "max_new_tokens": output_len, + "do_sample": True, + }, + } + def tokens_from_response(self, response: Dict): + return response["generated_text"] + +class NaiveTransformersBackend(Backend): + def server_metrics(self) -> List[str]: + return [""] + def api_url(self) -> str: + return "" + def create_request_payload(self, + prompt: 
str, + output_len: int, + top_k: int,): + return { + "instances": [{ + "prompt": prompt, + "max_length": output_len, + "top_k": top_k, + }] + } + def tokens_from_response(self, response: Dict): + complete_pred = response["predictions"][0][0]["generated_text"] + new_text_start_index = complete_pred.find(NEW_TEXT_KEY) + len(NEW_TEXT_KEY) + return complete_pred[new_text_start_index:] + +class TensorrtLlmTritonBackend(Backend): + def server_metrics(self) -> List[str]: + return [""] + def api_url(self) -> str: + return "" + def create_request_payload(self, + prompt: str, + output_len: int, + best_of: int, + use_beam_search: bool): + return { + "text_input": prompt, + "max_tokens": output_len, + "beam_width": 1 if not use_beam_search else best_of, + "temperature": 0.0 if use_beam_search else 1.0, + "top_p": 1.0, + "bad_words": "", + "stop_words": "", + "stream": False, + } + def tokens_from_response(self, response: Dict): + return response["text_output"] + +class SaxBackend(Backend): + def server_metrics(self) -> List[str]: + return [""] + def api_url(self) -> str: + return "" + def create_request_payload(self, + prompt: str, + output_len: int, + best_of: int, + use_beam_search: bool, + sax_model: str): + return { + "model": sax_model, + "prompt": prompt, + "n": 1, + "best_of": best_of, + "use_beam_search": use_beam_search, + "temperature": 0.0 if use_beam_search else 1.0, + "top_p": 1.0, + "top_k": 50, + "max_tokens": output_len, + "stream": False, + } + def tokens_from_response(self, response: Dict): + return response["choices"][0]["text"] + +def init_errors_map() -> Dict[str, int]: + errors = { + "ClientConnectorError": 0, + "TimeoutError": 0, + "ContentTypeError": 0, + "ClientOSError": 0, + "ServerDisconnectedError": 0, + "unknown_error": 0, + } + return errors + +def getBackend(backend: str) -> Backend: + if backend == "vllm": + return vLLMBackend() + elif backend == "tgi": + return TgiBackend() + elif backend == "naive_transformers": + return NaiveTransformersBackend() + elif backend == "tensorrt_llm_triton": + return TensorrtLlmTritonBackend() + elif backend == "sax": + return SaxBackend() + elif backend == "jetstream": + return JetstreamBackend() + else: + raise ValueError("Unsupported backend") + + +def fetch_metrics_from_gmp(metrics: List[str], duration: float, backend: str) -> List[MetricSummary]: + """Gets summaries for metrics queried from GMP, queries vary per model server""" + + # Creates a credentials object from the default service account file + # Assumes that script has appropriate default credentials set up, ref: + # https://googleapis.dev/python/google-auth/latest/user-guide.html#application-default-credentials + credentials, project_id = google.auth.default() + # Prepare an authentication request - helps format the request auth token + auth_req = google.auth.transport.requests.Request() + + # Request refresh tokens + credentials.refresh(auth_req) + url='https://monitoring.googleapis.com/v1/projects/%s/location/global/prometheus/api/v1/metadata' % (project_id) + headers_api = {'Authorization': 'Bearer ' + credentials.token} + request_post = requests.get(url=url, headers=headers_api) + all_metrics_metadata = request_post.json() + if request_post.ok is not True: + print("HTTP Error: %s" % (all_metrics_metadata)) + return [] + if all_metrics_metadata["status"] != "success": + print("Metadata error response: %s" % all_metrics_metadata["error"]) + return [] + + metrics_list : List[MetricSummary] = [] + for metric in metrics: + print("Metric Name: %s" % (metric)) + + # Find metric 
type + metric_type = all_metrics_metadata['data'][metric] + if all_metrics_metadata['data'][metric] is None: + print("No metric found for: %s" % metric) + return [] + metric_type = metric_type[0]['type'] + + metric_results = {} + # Queries scrape all metrics collected from the last $DURATION seconds from the backend's related + # podmonitoring spec assumed to be named "$BACKEND-podmonitoring" + queries = { + "gauge": { + "Mean": "avg_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), + "Median": "quantile_over_time(0.5, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), + "Sd": "stddev_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), + "Min": "min_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), + "Max": "max_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), + "P90": "quantile_over_time(0.9, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), + "P99": "quantile_over_time(0.99, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), + }, + "histogram": { + "Mean": "sum(rate(%s_sum{job='%s-podmonitoring'}[%.0fs])) / sum(rate(%s_count{job='%s-podmonitoring'}[%.0fs]))" % (metric, backend, duration, metric, backend, duration), + "Median": "histogram_quantile(0.5, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration), + "Min": "histogram_quantile(0, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration), + "Max": "histogram_quantile(1, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration), + "P90": "histogram_quantile(0.9, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration), + "P99": "histogram_quantile(0.99, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration), + } + } + + metric_data : MetricSummary = { + "name": metric, + "description": f"Metrics for {metric} from {backend} backend", + } + for query_name, query in queries[metric_type].items(): + + # Configure respective query + url = f'https://monitoring.googleapis.com/v1/projects/{project_id}/location/global/prometheus/api/v1/query' + headers_api = {'Authorization': f'Bearer {credentials.token}'} + params = {'query': query} + + request_post = requests.get(url=url, headers=headers_api, params=params) + response = request_post.json() + + # handle response + if request_post.ok: + if response["status"] == "success": + metric_results[query_name] = float(response["data"]["result"][0]["value"][1]) + print("%s: %s" % (query_name, response["data"]["result"][0]["value"][1])) + else: + print("Cloud Monitoring PromQL Error: %s" % (response["error"])) + else: + print("HTTP Error: %s" % (response)) + + # Handle response + if request_post.ok and response["status"] == "success": + result_value = float(response["data"]["result"][0]["value"][1]) + if query_name == "Mean": + metric_data["mean"] = result_value + elif query_name == "Median": + metric_data["median"] = result_value + elif query_name == "Sd": + metric_data["sd"] = result_value + elif query_name == "Min": + metric_data["min"] = result_value + elif query_name == "Max": + metric_data["max"] = result_value + elif query_name == "P90": + metric_data["p90"] = result_value + elif query_name == "P99": + metric_data["p99"] = result_value + else: + error_message = response.get("error", "HTTP Error") + print(f"Error fetching {query_name} for {metric}: {error_message}") + + 
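To make the query templating above concrete, this is how one of the gauge queries renders for a vLLM metric; the template and metric name are taken from this file, while the 120-second duration is only an example value:

    metric, backend, duration = "vllm:gpu_cache_usage_perc", "vllm", 120.0
    query = "avg_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration)
    print(query)  # avg_over_time(vllm:gpu_cache_usage_perc{job='vllm-podmonitoring'}[120s])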
metrics_list.append(metric_data) + return metrics_list + +async def generate_next_request( input_requests: List[Tuple[str, int, int]], request_rate_expr: str, start_time: float, ) -> AsyncGenerator[Tuple[str, int, int], None]: """Gets request async.""" - for request in input_requests: + request = random.choice(input_requests) + while True: yield request if request_rate_expr == "oo": @@ -120,17 +584,6 @@ async def get_request( # The next request will be sent after the interval. await asyncio.sleep(interval) -def init_errors_map() -> Dict[str, int]: - errors = { - "ClientConnectorError": 0, - "TimeoutError": 0, - "ContentTypeError": 0, - "ClientOSError": 0, - "ServerDisconnectedError": 0, - "unknown_error": 0, - } - return errors - async def send_request( backend: str, api_url: str, @@ -163,7 +616,6 @@ async def send_request( "stream": False, } elif backend == "tgi": - assert not use_beam_search params = { "best_of": best_of, "max_new_tokens": output_len, @@ -221,7 +673,7 @@ async def send_request( while True: try: async with session.post(api_url, headers=headers, json=pload, ssl=False) as response: - output = await response.json() + output = await response.json() # Re-send the request if it failed. if "error" not in output: @@ -285,316 +737,178 @@ async def send_request( return request_latency, None +def get_filtered_dataset( + dataset_path: str, + max_input_len: int, + max_output_len: int, + tokenizer: PreTrainedTokenizerBase, + use_dummy_text: bool, +) -> List[Tuple[str, int, int]]: + """Gets a subset of the dataset where all elements adhere to the specified constraints""" + if use_dummy_text: + dummy_prompt_token_ids = [0] * max_input_len + dummy_prompt = tokenizer.decode(dummy_prompt_token_ids) + return [( + dummy_prompt, + max_input_len, + max_output_len, + )] + + # Load the dataset. + with open(dataset_path) as f: + dataset = json.load(f) + # Filter out the conversations with less than 2 turns. + dataset = [data for data in dataset if len(data["conversations"]) >= 2] + # Only keep the first two turns of each conversation. + dataset = [ + (data["conversations"][0]["value"], data["conversations"][1]["value"]) + for data in dataset + ] + + # Tokenize the prompts and completions. + prompts = [prompt for prompt, _ in dataset] + prompt_token_ids = tokenizer(prompts).input_ids + completions = [completion for _, completion in dataset] + completion_token_ids = tokenizer(completions).input_ids + tokenized_dataset = [] + for i in range(len(dataset)): + output_len = len(completion_token_ids[i]) + tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len)) + + # Filter out too long sequences. + filtered_dataset: List[Tuple[str, int, int]] = [] + for prompt, prompt_token_ids, output_len in tokenized_dataset: + prompt_len = len(prompt_token_ids) + if prompt_len < MIN_SEQ_LEN or output_len < MIN_SEQ_LEN: + # Prune too short sequences. + # This is because TGI causes errors when the input or output length + # is too short. + continue + if prompt_len > max_input_len or output_len > max_output_len: + # Prune too long sequences. 
+ continue + filtered_dataset.append((prompt, prompt_len, output_len)) + + return filtered_dataset + async def benchmark( args: argparse.Namespace, api_url: str, tokenizer: PreTrainedTokenizerBase, model: str, -) -> Tuple[List[Tuple[int, int, float]], Dict[str, int]]: +) -> BenchmarkingReport: """Runs benchmark with asynchronous requests.""" - input_requests = sample_requests( + input_requests = get_filtered_dataset( args.dataset, - args.num_prompts, args.max_input_length, args.max_output_length, tokenizer, args.use_dummy_text, ) - benchmark_start_time = time.time_ns() - tasks: List[asyncio.Task] = [] - async for request in get_request(input_requests, args.request_rate, benchmark_start_time): - prompt, prompt_len, output_len = request - task = asyncio.create_task( - send_request( - args.backend, - api_url, - prompt, - prompt_len, - output_len, - args.best_of, - args.use_beam_search, - args.top_k, - tokenizer, - args.sax_model, - model, - ) - ) - tasks.append(task) - results = await asyncio.gather(*tasks) - combined_latencies = [] - combined_errors = init_errors_map() - for latency, errors in results: - if latency: - combined_latencies.append(latency) - if errors: - for err, count in errors.items(): - combined_errors[err] = combined_errors[err] + count + benchmark_results = BenchmarkingReport(args, model, time.time_ns()) + + all_steps = {} + if args.job is not None: + all_steps = args.job + elif args.num_prompts is not None: + all_steps = { + "steps": [{ + "rate": args.request_rate, + "max_num_prompts": args.num_prompts, + }] + } - benchmark_duration = (time.time_ns() - benchmark_start_time) / NS_IN_SEC - print_and_save_result(args, benchmark_duration, len(input_requests), model, combined_latencies, combined_errors) - return combined_latencies, combined_errors + for index, step in enumerate(all_steps["steps"]): + + # No need to sleep before running the first step + if 'time_between_steps' in args.job and index != 0: + print(f"Sleeping for {args.job['time_between_steps']} sec...") + await asyncio.sleep(args.job["time_between_steps"]) + max_prompts = f" {step['max_num_prompts']} requests" if 'max_num_prompts' in step else " " + duration = f" {step['time']} sec" if 'time' in step else " " + print(f"Starting benchmarking{max_prompts} at {step['rate']} requests/sec for{duration}") + tasks: List[asyncio.Task] = [] + prompts_sent_this_step: int = 0 + step_start_timestamp = time.time_ns() + async for request in generate_next_request(input_requests, str(step["rate"]), step_start_timestamp): + # Stop conditions + if "max_num_prompts" in step and prompts_sent_this_step >= step["max_num_prompts"]: + break + if "time" in step and ((time.time_ns() - step_start_timestamp ) / NS_IN_SEC) > step["time"]: + break -def save_json_results(args: argparse.Namespace, benchmark_result, server_metrics, model, errors): - # Setup - start_dt_proto = Timestamp() - start_dt_proto.FromDatetime(args.start_datetime) + prompt, prompt_len, output_len = request + task = asyncio.create_task( + send_request( + args.backend, + api_url, + prompt, + prompt_len, + output_len, + args.best_of, + args.use_beam_search, + args.top_k, + tokenizer, + args.sax_model, + model, + ) + ) + tasks.append(task) + prompts_sent_this_step += 1 - final_json = { - # metrics values are numerical - "metrics" : { - # Traffic - "num_prompts_attempted": benchmark_result['num_prompts_attempted'], - "num_prompts_succeeded": benchmark_result['num_prompts_succeeded'], - "request_rate": args.request_rate, - 'server_metrics': { - **server_metrics - }, - 
**benchmark_result, - **errors, - }, - # dimensions values are strings - "dimensions": { - "date": args.start_datetime.strftime('%Y%m%d-%H%M%S'), - "backend": args.backend, - "model_id": model, - "tokenizer_id": args.tokenizer, - **(json.loads(args.additional_metadata_metrics_to_save) if args.additional_metadata_metrics_to_save else {}) - }, - "config": { - "model": model, - "num_models": len(args.models.split(',')), - "model_server": args.backend, - "start_time": { - "seconds" : start_dt_proto.seconds, - "nanos" : start_dt_proto.nanos - } - }, - "summary_stats": { - "stats": [{ - "request_rate": args.request_rate, - "request_latency": { - "mean": benchmark_result["avg_latency"], - "median": benchmark_result["median_latency"], - "sd": benchmark_result["sd_latency"], - "min": benchmark_result["min_latency"], - "max": benchmark_result["max_latency"], - "p90": benchmark_result["p90_latency"], - "p99": benchmark_result["p99_latency"], - }, - "throughput": { - "mean": benchmark_result['throughput'] - }, - "input_length": { - "mean": benchmark_result["avg_input_len"], - "median": benchmark_result["median_input_len"], - "sd": benchmark_result["sd_input_len"], - "min": benchmark_result["min_input_len"], - "max": benchmark_result["max_input_len"], - "p90": benchmark_result["p90_input_len"], - "p99": benchmark_result["p99_input_len"], - }, - "output_length": { - "mean": benchmark_result["avg_output_len"], - "median": benchmark_result["median_output_len"], - "sd": benchmark_result["sd_output_len"], - "min": benchmark_result["min_output_len"], - "max": benchmark_result["max_output_len"], - "p90": benchmark_result["p90_output_len"], - "p99": benchmark_result["p99_output_len"], - }, - "tpot": { - "mean": benchmark_result["avg_per_output_token_latency"], - "median": benchmark_result["median_per_output_token_latency"], - "sd": benchmark_result["sd_per_output_token_latency"], - "min": benchmark_result["min_per_output_token_latency"], - "max": benchmark_result["max_per_output_token_latency"], - "p90": benchmark_result["p90_per_output_token_latency"], - "p99": benchmark_result["p99_per_output_token_latency"], - }, - "model_server_metrics" : [{"Name": name, **metrics} for name, metrics in server_metrics.items()] - }] - } - } + print("All requests sent, awaiting responses...") + results = await asyncio.gather(*tasks) + step_end_timestamp = time.time_ns() + print(f"Finished benchmarking step {index + 1}") + + all_latencies = [] + all_errors = init_errors_map() + for latency, errors in results: + if latency: + all_latencies.append(latency) + if errors: + for err, count in errors.items(): + all_errors[err] = all_errors[err] + count + benchmark_results.record_metrics_for_step(step['rate'], step_start_timestamp, step_end_timestamp, prompts_sent_this_step, all_latencies, all_errors) - # Save to file - model_without_slash = model.replace("/","-") - file_name = ( - f"{args.file_prefix}-{args.backend}-{args.start_datetime.strftime('%Y%m%d-%H%M%S')}-{model_without_slash}.json" - ) - with open(file_name, "w", encoding="utf-8") as outfile: - json.dump(final_json, outfile) + print(f"Completed all steps, generating reports...") + return benchmark_results -def metrics_to_scrape(backend: str) -> List[str]: - # Each key in the map is a metric, it has a corresponding 'stats' object - # It must be populated on the outputs 'metrics' field as 'key':'stats' - # If a value is specified for a given key, it will be populated on the outputs `summary_stats.stats` field as 'value':'stats' as well. 
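Since the refactored benchmark() above drives traffic from a list of steps (args.job) rather than a single request rate, a job description along these lines is what it consumes. The field names below are the ones the code reads; the concrete values are illustrative, and how the job is supplied on the command line is not shown in this part of the series:

    job = {
        "time_between_steps": 30,                   # idle seconds between steps
        "steps": [
            {"rate": "2", "time": 120},             # constant 2 req/s for two minutes
            {"rate": "1+1.05*t", "time": 60},       # ramp, re-evaluated per request
            {"rate": "10", "max_num_prompts": 500}, # stop after a fixed request budget
        ],
    }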
- if backend == "vllm": - return ["vllm:gpu_cache_usage_perc", "vllm:num_requests_waiting"] - elif backend == "jetstream": - return [ - "jetstream_slots_used_percentage", - "jetstream_prefill_backlog_size", - ] - else: - return [] - -def print_metrics(metrics: List[str], duration: float, backend: str): - # Creates a credentials object from the default service account file - # Assumes that script has appropriate default credentials set up, ref: - # https://googleapis.dev/python/google-auth/latest/user-guide.html#application-default-credentials - credentials, project_id = google.auth.default() - # Prepare an authentication request - helps format the request auth token - auth_req = google.auth.transport.requests.Request() - - server_metrics = {} - - # Request refresh tokens - credentials.refresh(auth_req) - url='https://monitoring.googleapis.com/v1/projects/%s/location/global/prometheus/api/v1/metadata' % (project_id) - headers_api = {'Authorization': 'Bearer ' + credentials.token} - request_post = requests.get(url=url, headers=headers_api) - all_metrics_metadata = request_post.json() - if request_post.ok is not True: - print("HTTP Error: %s" % (all_metrics_metadata)) - if all_metrics_metadata["status"] != "success": - print("Metadata error response: %s" % all_metrics_metadata["error"]) - - for metric in metrics: - print("Metric Name: %s" % (metric)) - - # Find metric type - metric_type = all_metrics_metadata['data'][metric] - if all_metrics_metadata['data'][metric] is None: - print("No metric found for: %s" % metric) - return - metric_type = metric_type[0]['type'] - - metric_results = {} - # Queries scrape all metrics collected from the last $DURATION seconds from the backend's related - # podmonitoring spec assumed to be named "$BACKEND-podmonitoring" - queries = { - "gauge": { - "Mean": "avg_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), - "Median": "quantile_over_time(0.5, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), - "Sd": "stddev_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), - "Min": "min_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), - "Max": "max_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), - "P90": "quantile_over_time(0.9, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), - "P99": "quantile_over_time(0.99, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), - }, - "histogram": { - "Mean": "sum(rate(%s_sum{job='%s-podmonitoring'}[%.0fs])) / sum(rate(%s_count{job='%s-podmonitoring'}[%.0fs]))" % (metric, backend, duration, metric, backend, duration), - "Median": "histogram_quantile(0.5, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration), - "Min": "histogram_quantile(0, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration), - "Max": "histogram_quantile(1, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration), - "P90": "histogram_quantile(0.9, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration), - "P99": "histogram_quantile(0.99, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration), - } - } - for query_name, query in queries[metric_type].items(): - # Configure respective query - url='https://monitoring.googleapis.com/v1/projects/%s/location/global/prometheus/api/v1/query' % (project_id) - headers_api = {'Authorization': 
'Bearer ' + credentials.token} - params = {'query': query} - request_post = requests.get(url=url, headers=headers_api, params=params) - response = request_post.json() - - # handle response - if request_post.ok: - if response["status"] == "success": - metric_results[query_name] = float(response["data"]["result"][0]["value"][1]) - print("%s: %s" % (query_name, response["data"]["result"][0]["value"][1])) - else: - print("Cloud Monitoring PromQL Error: %s" % (response["error"])) - else: - print("HTTP Error: %s" % (response)) - server_metrics[metric] = metric_results - return server_metrics - -def get_stats_for_set(name, description, points): - avg = np.mean(points) if points else 0 - median = np.median(points) if points else 0 - sd = np.std(points) if points else 0 - min = np.min(points) if points else 0 - max = np.max(points) if points else 0 - p90 = np.percentile(points, 90) if points else 0 - p99 = np.percentile(points, 99) if points else 0 - - print(f"Average {description}:" f" {avg:.2f}") - - return { - f'avg_{name}': avg, - f'median_{name}': median, - f'sd_{name}': sd, - f'min_{name}': min, - f'max_{name}': max, - f'p90_{name}': p90, - f'p99_{name}': p99, +def aggregate_benchmark_reports(reports: List[BenchmarkingReport]) -> BenchmarkingReport: + """When benchmarking multiple models we will generate a BenchmarkingReport for each.""" + """If `save_aggregated_result` is set, we aggregate these into a single report.""" + + aggregated_step_report = { + "request_rate": reports[0].steps[0]["request_rate"], + "timestamp_start": 0.0, + "timestamp_end": 0.0, + "num_prompts_attempted": 0, + "latencies": [], + "server_metrics": [], + "errors": {}, } -def print_and_save_result(args: argparse.Namespace, benchmark_duration, total_requests, model, request_latencies, errors): - benchmark_result = {} - - print(f"====Result for Model: {model}====") - print(f"Errors: {errors}") - print(f"Total time: {benchmark_duration:.2f} s") - print(f"Successful/total requests: {len(request_latencies)}/{total_requests}") - print(f"Requests/min: {60 * total_requests / benchmark_duration:.2f}") - benchmark_result["num_prompts_attempted"] = total_requests - benchmark_result["num_prompts_succeeded"] = len(request_latencies) - benchmark_result['benchmark_time'] = benchmark_duration - benchmark_result['throughput_rps'] = (args.num_prompts / benchmark_duration) - - total_output_tokens = np.sum([output_len for _, output_len, _ in - request_latencies]) - output_tokens_per_second = total_output_tokens / benchmark_duration - benchmark_result['throughput'] = output_tokens_per_second - - output_tokens_per_min = 60 * output_tokens_per_second - print(f"Output_tokens/min: {output_tokens_per_min:.2f}") - benchmark_result['total_output_token'] = int(total_output_tokens) - benchmark_result['output_tokens_per_min'] = output_tokens_per_min - - total_input_tokens = np.sum([prompt_len for prompt_len, _, _ in - request_latencies]) - input_tokens_per_min = 60 * total_input_tokens / benchmark_duration - print(f"Input_tokens/min: {input_tokens_per_min:.2f}") - benchmark_result['total_input_tokens'] = int(total_input_tokens) - benchmark_result['input_tokens_per_min'] = input_tokens_per_min - - total_tokens = total_input_tokens + total_output_tokens - tokens_per_min = 60 * total_tokens / benchmark_duration - print(f"Tokens/min: {tokens_per_min:.2f}") - benchmark_result['total_tokens'] = int(total_tokens) - benchmark_result['tokens_per_min'] = tokens_per_min - - if args.machine_cost: - print( - "Cost $/1k tokens:" - f" {args.machine_cost * 1000 / 
(60 * output_tokens_per_min)}" - ) + def accumulate_errors(errors_list: List[Dict[str, int]]) -> Dict[str, int]: + accumulated_errors = init_errors_map() + for errors in errors_list: + for error_type, count in errors.items(): + accumulated_errors[error_type] += count + return accumulated_errors - benchmark_result = { - **benchmark_result, - **(get_stats_for_set("per_token_latency", "seconds/token (includes waiting time on server)", [ - latency / (prompt_len + output_len) - for prompt_len, output_len, latency in request_latencies - ])), - - # NOTE: The latency below includes requests awaiting time on server side. - # It's not comparable with the model inference latency for batch size 1. - **(get_stats_for_set("latency", "milliseconds/request (includes waiting time on server)" ,[1000 * latency for _, _, latency in request_latencies])), - **(get_stats_for_set("per_output_token_latency", "milliseconds/output_token (includes waiting time on server)", [1000 * latency / output_len for _, output_len, latency in request_latencies])), - **(get_stats_for_set("input_len", "input length", [float(prompt_len) for prompt_len, _, _ in request_latencies])), - **(get_stats_for_set("output_len", "output length", [float(output_len) for _, output_len, _ in request_latencies])) - } + for report in reports: + # Input metavalidation asserts this report only has one step report + report = report.steps[0] + aggregated_step_report["timestamp_start"] = min(aggregated_step_report["timestamp_start"], report["timestamp_start"]) + aggregated_step_report["timestamp_end"] = max(aggregated_step_report["timestamp_end"], report["timestamp_end"]) + aggregated_step_report["num_prompts_attempted"] += report["num_prompts_attempted"] + aggregated_step_report["latencies"].extend(report["latencies"]) + aggregated_step_report["errors"] = accumulate_errors([aggregated_step_report["errors"], report["errors"]]) - server_metrics = {} - if args.scrape_server_metrics: - server_metrics = print_metrics(metrics_to_scrape(args.backend), benchmark_duration, args.backend) - if args.save_json_results: - save_json_results(args, benchmark_result, server_metrics, model, errors) + aggregated_report = BenchmarkingReport(reports[0].args, f"ALL-{len(reports)}-MODELS", aggregated_step_report["timestamp_start"]) + aggregated_report.record_metrics_for_step(**aggregated_step_report) + + return aggregated_report async def main(args: argparse.Namespace): print(args) @@ -606,24 +920,7 @@ async def main(args: argparse.Namespace): "v1/completions" if args.backend == "vllm" else args.endpoint -) - - # Input assertions - def is_expression_of_t(expression): - # Check if expression uses variables other than 't' by attempting to evaluate with only 't' defined - try: - t = symbols('t') - expr_parsed = parse_expr(expression, transformations="all", local_dict={"t": t}) - expr_parsed.subs(t, 1) - return True - except KeyError as e: - # If another variable is required, throw a KeyError - return False - - if args.request_rate == "inf": - args.request_rate = "oo" - if not is_expression_of_t(args.request_rate): - raise ValueError(f"Request rate {args.request_rate}, must be an expression of `t`") + ) print(f"Starting Prometheus Server on port {PROMETHEUS_PORT}") start_http_server(PROMETHEUS_PORT) @@ -632,33 +929,33 @@ def is_expression_of_t(expression): tokenizer = AutoTokenizer.from_pretrained( args.tokenizer, trust_remote_code=args.trust_remote_code ) - - benchmark_start_time = time.time_ns() - args.start_datetime = datetime.fromtimestamp(benchmark_start_time / NS_IN_SEC) + 
args.start_datetime = datetime.fromtimestamp(time.time_ns() / NS_IN_SEC) - results = await asyncio.gather( - *[benchmark(args, api_url, tokenizer, model) for model in models] - ) - - # Summarize results - combined_latencies = [] - combined_errors = { - "ClientConnectorError": 0, - "TimeoutError": 0, - "ContentTypeError": 0, - "ClientOSError": 0, - "unknown_error": 0, - "ServerDisconnectedError": 0, - } - for latencies, errors in results: - combined_latencies.extend(latencies) - for k, v in errors.items(): - combined_errors[k] = combined_errors[k] + v - - benchmark_duration_all_models = (time.time_ns() - benchmark_start_time) / NS_IN_SEC + reports : List[BenchmarkingReport] = await asyncio.gather( + *[benchmark(args, api_url, tokenizer, model) for model in models] + ) + if args.save_aggregated_result: - print_and_save_result(args, benchmark_duration_all_models, len(models)*args.num_prompts, f"ALL-{len(models)}-MODELS", combined_latencies, combined_errors) + aggregated_benchmark = aggregate_benchmark_reports(reports) + aggregated_benchmark.to_text_reports(write_to_files=True) + aggregated_benchmark.to_json_report(write_to_file=args.save_json_results) + else: + for report in reports: + report.to_text_reports(write_to_files=True) + report.to_json_report(write_to_file=args.save_json_results) + +def input_metavalidation(args: argparse.Namespace): + """Validate a correct combination of arguments is set""" + + if sum([bool(args.request_rate is not None and args.num_prompts is not None), bool(args.job is not None)]) != 1: + raise ValueError("All args must be set for one and only one of the following sets of arguments: {--request-rate, --num-prompts} or {--job}") + if args.save_aggregated_result and args.benchmark is not None and len(args.benchmark) != 1 and args.models is not None and len(args.models) > 1: + raise ValueError("Multi model benchmarking with multi step benchmarking is not supported yet") + + if args.use_beam_search and args.backend == "tgi": + raise ValueError("Beam search is not supported by TGI") + if __name__ == "__main__": parser = argparse.ArgumentParser( description="Benchmark the online serving throughput." @@ -683,7 +980,6 @@ def is_expression_of_t(expression): help="Model name to send request to at API server for SAX model server.", ) parser.add_argument("--file-prefix", type=str, default="benchmark") - parser.add_argument("--endpoint", type=str, default="generate") parser.add_argument("--host", type=str, default="localhost") parser.add_argument("--port", type=int, default=7080) parser.add_argument("--dataset", type=str, help="Path to the dataset.") @@ -708,7 +1004,7 @@ def is_expression_of_t(expression): parser.add_argument( "--num-prompts", type=int, - default=1000, + default=None, help="Number of prompts to process.", ) parser.add_argument( @@ -737,10 +1033,24 @@ def is_expression_of_t(expression): " LLaMA2 models." ), ) + + # Input assertions + def is_expression_of_t(input_str): + if input_str == "inf": + return "oo" + # Check if expression uses variables other than 't' by attempting to evaluate with only 't' defined + try: + t = symbols('t') + expr_parsed = parse_expr(input_str, transformations="all", local_dict={"t": t}) + expr_parsed.subs(t, 1) + return input_str + except Exception: + raise ValueError(f"Request rate {input_str}, must be an expression of `t`") + parser.add_argument( "--request-rate", - type=str, - default="inf", + type=is_expression_of_t, + default=None, help=( "Specifies the request rate as a function of time, f(t)." 
" Example format: '1+1.05*t', where 't' represents seconds from" @@ -750,6 +1060,80 @@ def is_expression_of_t(expression): " expression." ), ) + + def parse_request_rates(input_str): + if input_str is None: + return None + # Check if input is a filename and load its contents + if os.path.isfile(input_str): + with open(input_str, 'r') as file: + input_str = file.read() + print(input_str) + try: + # Parse the input string as JSON + request_data = json.loads(input_str) + # Validate that the JSON has the correct structure + if not isinstance(request_data, dict): + raise argparse.ArgumentTypeError("Input JSON must be an object containing 'time_between_steps' and 'steps'.") + # Check 'time_between_steps' field + if "time_between_steps" not in request_data or (not isinstance(request_data["time_between_steps"], float) and not isinstance(request_data["time_between_steps"], int)): + raise argparse.ArgumentTypeError("'time_between_steps' must be a float or int.") + # Check 'steps' field + if "steps" not in request_data or not isinstance(request_data["steps"], list): + raise argparse.ArgumentTypeError("'steps' must be a list of objects with 'rate' and 'time'.") + + # Validate each entry in the 'steps' list + for i, rate_entry in enumerate(request_data["steps"]): + if not isinstance(rate_entry, dict): + raise argparse.ArgumentTypeError(f"Entry {i} in 'steps' must be a JSON object.") + + if "rate" not in rate_entry: + raise argparse.ArgumentTypeError(f"Entry {i} in 'steps' must have a 'rate' key.") + if "time" not in rate_entry and "max_num_prompts" not in rate_entry: + raise argparse.ArgumentTypeError(f"Entry {i} in 'steps' must have a 'time' and/or 'max_num_prompts' key.") + + # Validate the 'rate' field to allow for string expressions or floats + if isinstance(rate_entry["rate"], str): + try: + is_expression_of_t(rate_entry["rate"]) # Validate the expression + except Exception as e: + raise argparse.ArgumentTypeError(f"Entry {i} in 'steps': {e}") + # Validate the 'time' field + if not isinstance(rate_entry["time"], (float, int)): + raise argparse.ArgumentTypeError(f"Entry {i} in 'steps': 'time' must be a positive float.") + return request_data + except json.JSONDecodeError as e: + raise argparse.ArgumentTypeError("Invalid JSON format") + + parser.add_argument( + "--job", + type=parse_request_rates, + default=None, + required=False, + help=( + "Specify the benchmark procedure in JSON format, either as raw JSON" + " or as a filename. \n" + " The JSON should have the following structure:\n\n" + " {\n" + " \"time_between_steps\": float (seconds to rest between rates),\n" + " \"rates\": [\n" + " {\n" + " \"rate\": float | str (as would be passed to request-rate),\n" + " \"time\": float (number of seconds for this step)\n" + " \"max_num_prompts\": int (maximum number of prompts for this step)" + " },\n" + " ...\n" + " ]\n" + " }\n\n" + " Example JSON:\n" + " '{\"time_between_steps\": 1.0, \"rates\": [{\"rate\": 2.0, \"time\": 0.0}, {\"rate\": \"1+0.5*t\", \"time\": 5.0}]}'\n\n" + " Each entry should have a 'rate' and/or 'num_prompts' and 'time' value." 
+ " Each rate is finished when \"num_prompts\" prompts are sent" + " (if specified) and \"time\" seconds have passed (if specified)," + " whichever comes last" + ), + ) + parser.add_argument("--seed", type=int, default=int(time.time())) parser.add_argument( "--trust-remote-code", @@ -794,4 +1178,5 @@ def is_expression_of_t(expression): help="Whether to scrape server metrics.", ) cmd_args = parser.parse_args() + input_metavalidation(cmd_args) asyncio.run(main(cmd_args)) \ No newline at end of file From 82425125dcde6de3fe7f5f59f42b7c11f41ce86e Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Mon, 11 Nov 2024 23:57:51 +0000 Subject: [PATCH 14/27] intermediate change --- .../container/benchmark_serving.py | 267 ++++++++++++++---- 1 file changed, 206 insertions(+), 61 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index f7c08cc26..8cd98e956 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -5,7 +5,7 @@ It currently supports TGI, vLLM, Triton TensorRT-LLM and Saxml. """ -from abc import ABC, abstractmethod +from abc import ABC, abstractmethod, abstractproperty import argparse import asyncio from datetime import datetime @@ -263,12 +263,91 @@ class Backend(ABC): for new model server backends. """ - def request(self): - print() + async def send_request( + self, + api_url: str, + prompt: str, + prompt_len: int, + output_len: int, + best_of: int, + use_beam_search: bool, + top_k: int, + tokenizer: PreTrainedTokenizerBase, + sax_model: str, + model: str, + ) -> Tuple[Optional[Tuple[int, int, float]], Optional[Dict[str, int]]]: + """Sends request to server.""" + request_start_time = time.time() + errors = init_errors_map() + + headers = {"User-Agent": "Benchmark Client"} + pload = self.create_request_payload( + prompt=prompt, + prompt_len=prompt_len, + output_len=output_len, + best_of=best_of, + use_beam_search=use_beam_search, + top_k=top_k, + tokenizer=tokenizer, + sax_model=sax_model, + model=model, + ) + + # Set client timeout to be 3 hrs. + timeout = aiohttp.ClientTimeout(total=CLIENT_TIMEOUT_SEC) + async with aiohttp.ClientSession(timeout=timeout,trust_env=True) as session: + while True: + try: + async with session.post(f"{api_url}/{self.get_endpoint()}", headers=headers, json=pload, ssl=False) as response: + output = await response.json() + + # Re-send the request if it failed. 
+ if "error" not in output: + break + except aiohttp.client_exceptions.ClientConnectorError as client_err: + errors["ClientConnectorError"] += 1 + print(f"ClientConnectorError: {client_err}") + return None, errors + except asyncio.TimeoutError as timeout_err: + errors["TimeoutError"] += 1 + print(f"TimeoutError: {timeout_err}") + return None, errors + except aiohttp.client_exceptions.ClientOSError as e: + errors["ClientOSError"] += 1 + print(f"ClientOSError: {e}") + return None, errors + except aiohttp.client_exceptions.ContentTypeError as e: + print(f"ContentTypeError: {e}, response: {response}") + errors["ContentTypeError"] += 1 + return None, errors + except aiohttp.client_exceptions.ServerDisconnectedError as e: + errors["ServerDisconnectedError"] += 1 + print(f"ServerDisconnectedError: {e}") + return None, errors + except Exception as e: + print(f"Unknown error {e}") + errors["unknown_error"] += 1 + return None, errors + request_end_time = time.time() + # Naive HF transformers generation and TensorRT-LLM generation stops at EOS + # tokens and the generation may be shorter than the ground-truth output + # sequence length. + output_len = self.get_response_length( + response=output, + request_len=prompt_len, + tokenizer=tokenizer + ) + + # (prompt len, output len, latency, success) + request_latency = (prompt_len, output_len, (request_end_time - request_start_time)) + tpot_metric.observe((request_end_time - request_start_time) / output_len) + prompt_length_metric.observe(prompt_len) + response_length_metric.observe(output_len) + + return request_latency, None @abstractmethod def create_request_payload(self, - api_url: str, prompt: str, prompt_len: int, output_len: int, @@ -280,29 +359,36 @@ def create_request_payload(self, model: str) -> Dict: pass - def tokens_from_response(self, response: Dict): - return "" + @abstractmethod + def get_response_length( + self, + request_len: int, + response: Dict, + tokenizer: PreTrainedTokenizerBase) -> int: + pass - @property @abstractmethod - def server_metrics(self) -> List[str]: + def get_server_metrics(self) -> List[str]: pass - @property @abstractmethod - def api_url(self) -> str: + def get_endpoint(self) -> str: pass class vLLMBackend(Backend): - def server_metrics(self) -> List[str]: + def get_server_metrics(self) -> List[str]: return ["vllm:gpu_cache_usage_perc", "vllm:num_requests_waiting"] - def api_url(self) -> str: + def get_endpoint(self) -> str: return "v1/completions" def create_request_payload(self, prompt: str, + prompt_len: int, output_len: int, best_of: int, use_beam_search: bool, + top_k: int, + tokenizer: PreTrainedTokenizerBase, + sax_model: str, model: str): return { "model": model, @@ -316,36 +402,60 @@ def create_request_payload(self, "ignore_eos": False, "stream": False, } - def tokens_from_response(self, response : Dict): - return response["choices"][0]["text"] + def get_response_length( + self, + request_len: int, + response: Dict, + tokenizer: PreTrainedTokenizerBase): + print(response) + output_token_ids = tokenizer(response["choices"][0]["text"]).input_ids + return len(output_token_ids) class JetstreamBackend(Backend): - def server_metrics(self) -> List[str]: + def get_server_metrics(self) -> List[str]: return [ "jetstream_slots_used_percentage", "jetstream_prefill_backlog_size", ] - def api_url(self) -> str: + def get_endpoint(self) -> str: return "" def create_request_payload(self, - prompt: str, - output_len: int): + prompt: str, + prompt_len: int, + output_len: int, + best_of: int, + use_beam_search: bool, + top_k: 
int, + tokenizer: PreTrainedTokenizerBase, + sax_model: str, + model: str): return { "prompt": prompt, "max_tokens": output_len, } - def tokens_from_response(self, response: Dict): - return response["response"] + def get_response_length( + self, + request_len: int, + response: Dict, + tokenizer: PreTrainedTokenizerBase): + output_token_ids = tokenizer(response["response"]).input_ids + return len(output_token_ids) class TgiBackend(Backend): - def server_metrics(self) -> List[str]: + def get_server_metrics(self) -> List[str]: return [""] - def api_url(self) -> str: + def get_endpoint(self) -> str: return "" - def create_request_payload(self, - prompt: str, - output_len: int, - best_of: int): + def create_request_payload(self, + prompt: str, + prompt_len: int, + output_len: int, + best_of: int, + use_beam_search: bool, + top_k: int, + tokenizer: PreTrainedTokenizerBase, + sax_model: str, + model: str): return { "inputs": prompt, "parameters": { @@ -354,18 +464,29 @@ def create_request_payload(self, "do_sample": True, }, } - def tokens_from_response(self, response: Dict): - return response["generated_text"] + def get_response_length( + self, + request_len: int, + response: Dict, + tokenizer: PreTrainedTokenizerBase): + output_token_ids = tokenizer(response["generated_text"]).input_ids + return len(output_token_ids) class NaiveTransformersBackend(Backend): - def server_metrics(self) -> List[str]: + def get_server_metrics(self) -> List[str]: return [""] - def api_url(self) -> str: + def get_endpoint(self) -> str: return "" - def create_request_payload(self, - prompt: str, - output_len: int, - top_k: int,): + def create_request_payload(self, + prompt: str, + prompt_len: int, + output_len: int, + best_of: int, + use_beam_search: bool, + top_k: int, + tokenizer: PreTrainedTokenizerBase, + sax_model: str, + model: str): return { "instances": [{ "prompt": prompt, @@ -373,21 +494,32 @@ def create_request_payload(self, "top_k": top_k, }] } - def tokens_from_response(self, response: Dict): + def get_response_length( + self, + request_len: int, + response: Dict, + tokenizer: PreTrainedTokenizerBase): complete_pred = response["predictions"][0][0]["generated_text"] new_text_start_index = complete_pred.find(NEW_TEXT_KEY) + len(NEW_TEXT_KEY) - return complete_pred[new_text_start_index:] + pred = complete_pred[new_text_start_index:] + output_token_ids = tokenizer(pred).input_ids + return len(output_token_ids) - request_len class TensorrtLlmTritonBackend(Backend): - def server_metrics(self) -> List[str]: + def get_server_metrics(self) -> List[str]: return [""] - def api_url(self) -> str: + def get_endpoint(self) -> str: return "" def create_request_payload(self, - prompt: str, - output_len: int, - best_of: int, - use_beam_search: bool): + prompt: str, + prompt_len: int, + output_len: int, + best_of: int, + use_beam_search: bool, + top_k: int, + tokenizer: PreTrainedTokenizerBase, + sax_model: str, + model: str): return { "text_input": prompt, "max_tokens": output_len, @@ -398,20 +530,29 @@ def create_request_payload(self, "stop_words": "", "stream": False, } - def tokens_from_response(self, response: Dict): - return response["text_output"] + def get_response_length( + self, + request_len: int, + response: Dict, + tokenizer: PreTrainedTokenizerBase): + output_token_ids = tokenizer(response["text_output"]).input_ids + return len(output_token_ids) class SaxBackend(Backend): - def server_metrics(self) -> List[str]: + def get_server_metrics(self) -> List[str]: return [""] - def api_url(self) -> str: + def 
get_endpoint(self) -> str: return "" def create_request_payload(self, - prompt: str, - output_len: int, - best_of: int, - use_beam_search: bool, - sax_model: str): + prompt: str, + prompt_len: int, + output_len: int, + best_of: int, + use_beam_search: bool, + top_k: int, + tokenizer: PreTrainedTokenizerBase, + sax_model: str, + model: str): return { "model": sax_model, "prompt": prompt, @@ -424,8 +565,13 @@ def create_request_payload(self, "max_tokens": output_len, "stream": False, } - def tokens_from_response(self, response: Dict): - return response["choices"][0]["text"] + def get_response_length( + self, + request_len: int, + response: Dict, + tokenizer: PreTrainedTokenizerBase): + output_token_ids = tokenizer(response["choices"][0]["text"]).input_ids + return len(output_token_ids) def init_errors_map() -> Dict[str, int]: errors = { @@ -792,8 +938,8 @@ def get_filtered_dataset( return filtered_dataset async def benchmark( - args: argparse.Namespace, - api_url: str, + args: argparse.Namespace, + backend: Backend, tokenizer: PreTrainedTokenizerBase, model: str, ) -> BenchmarkingReport: @@ -817,14 +963,13 @@ async def benchmark( "max_num_prompts": args.num_prompts, }] } - for index, step in enumerate(all_steps["steps"]): # No need to sleep before running the first step if 'time_between_steps' in args.job and index != 0: print(f"Sleeping for {args.job['time_between_steps']} sec...") await asyncio.sleep(args.job["time_between_steps"]) - max_prompts = f" {step['max_num_prompts']} requests" if 'max_num_prompts' in step else " " + max_prompts = f" {step['max_num_prompts']} requests" if 'max_num_prompts' in step else "" duration = f" {step['time']} sec" if 'time' in step else " " print(f"Starting benchmarking{max_prompts} at {step['rate']} requests/sec for{duration}") @@ -840,9 +985,8 @@ async def benchmark( prompt, prompt_len, output_len = request task = asyncio.create_task( - send_request( - args.backend, - api_url, + backend.send_request( + f"http://{args.host}:{args.port}", prompt, prompt_len, output_len, @@ -931,8 +1075,9 @@ async def main(args: argparse.Namespace): ) args.start_datetime = datetime.fromtimestamp(time.time_ns() / NS_IN_SEC) + backend: Backend = getBackend(args.backend) reports : List[BenchmarkingReport] = await asyncio.gather( - *[benchmark(args, api_url, tokenizer, model) for model in models] + *[benchmark(args, backend, tokenizer, model) for model in models] ) if args.save_aggregated_result: From 75f9acd7d4b406f6dcb8cc5fdf0b6779d3529b4d Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 12 Nov 2024 00:00:16 +0000 Subject: [PATCH 15/27] remove print --- .../tools/profile-generator/container/benchmark_serving.py | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index 8cd98e956..910cf03d8 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -407,7 +407,6 @@ def get_response_length( request_len: int, response: Dict, tokenizer: PreTrainedTokenizerBase): - print(response) output_token_ids = tokenizer(response["choices"][0]["text"]).input_ids return len(output_token_ids) From 7208868f998089e24b319cc1d1e652b83ec68eca Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 12 Nov 2024 18:34:16 +0000 Subject: [PATCH 16/27] remove duplicate methods --- .../container/benchmark_serving.py | 803 +++++++----------- 
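For orientation before the consolidation below: once every model server is wrapped in a Backend subclass, main() only needs the factory plus one shared instance. A minimal sketch assembled from calls the patch itself introduces (getBackend, benchmark, and the BenchmarkingReport writers):

    # One Backend object per run, shared by the per-model benchmark tasks.
    backend = getBackend(args.backend)        # e.g. "vllm" -> vLLMBackend()
    reports = await asyncio.gather(
        *[benchmark(args, backend, tokenizer, model) for model in models]
    )
    for report in reports:
        report.to_text_reports(write_to_files=True)
        report.to_json_report(write_to_file=args.save_json_results)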
1 file changed, 317 insertions(+), 486 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index 910cf03d8..c503889e1 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -5,7 +5,7 @@ It currently supports TGI, vLLM, Triton TensorRT-LLM and Saxml. """ -from abc import ABC, abstractmethod, abstractproperty +from abc import ABC, abstractmethod import argparse import asyncio from datetime import datetime @@ -38,225 +38,6 @@ response_length_metric = Histogram("LatencyProfileGenerator:response_length", "Response length", buckets=[2**i for i in range(1, 16)]) tpot_metric = Histogram('LatencyProfileGenerator:time_per_output_token', 'Time per output token per request') -class BenchmarkConfig(TypedDict): - model: str - model_server: str - start_time: float - -class MetricSummary(TypedDict, total=False): - short_name: Optional[str] - name: str - description: str - mean: float - median: Optional[float] - sd: Optional[float] - min: Optional[float] - max: Optional[float] - p90: Optional[float] - p99: Optional[float] - -class BenchmarkingStepReport(TypedDict): - """Result for one step""" - request_rate: float - timestamp_start: float - timestamp_end: float - num_prompts_attempted: int - latencies: List - local_metrics: List[MetricSummary] - server_metrics: Optional[List[MetricSummary]] - errors: Dict[str, int] - -class BenchmarkingReport(): - """Results for all steps for a single model""" - args: argparse.Namespace - config: BenchmarkConfig - steps: List[BenchmarkingStepReport] - - def __init__(self, args : argparse.Namespace, model: str, start_time: float): - self.args = args - self.config = BenchmarkConfig( - model = model, - model_server = args.backend, - start_time = start_time - ) - self.steps = [] - - def record_metrics_for_step( - self, - request_rate: float, - timestamp_start: float, - timestamp_end: float, - num_prompts_attempted : int, - latencies: List, - errors: Dict[str, int], - ): - def get_metrics_to_scrape(backend: str) -> List[str]: - if backend == "vllm": - return ["vllm:gpu_cache_usage_perc", "vllm:num_requests_waiting"] - elif backend == "jetstream": - return [ - "jetstream_slots_used_percentage", - "jetstream_prefill_backlog_size", - ] - else: - return [] - - def metric_sumamry_from_points(name: str, description: str, points : List[float], short_name: Optional[str] = None) -> MetricSummary: - mean = np.mean(points) if points else 0 - median = np.median(points) if points else 0 - sd = np.std(points) if points else 0 - min = np.min(points) if points else 0 - max = np.max(points) if points else 0 - p90 = np.percentile(points, 90) if points else 0 - p99 = np.percentile(points, 99) if points else 0 - - return MetricSummary( - short_name = short_name if short_name is not None else name, - name = name, - description = description, - mean = float(mean), - median = float(median), - sd = float(sd), - min = float(min), - max = float(max), - p90 = float(p90), - p99 = float(p99) - ) - - total_time = (timestamp_end - timestamp_start)/ NS_IN_SEC - if self.args.scrape_server_metrics: - server_metrics = fetch_metrics_from_gmp(get_metrics_to_scrape(self.args.backend), total_time, self.args.backend) - - self.steps.append(BenchmarkingStepReport( - request_rate = request_rate, - timestamp_start = timestamp_start, - timestamp_end = timestamp_end, - 
num_prompts_attempted = num_prompts_attempted, - latencies = latencies, - errors = errors, - local_metrics = [ - metric_sumamry_from_points( - name="per_token_latency", - description="seconds/token (includes waiting time on server)", - points=[latency / (prompt_len + output_len) for prompt_len, output_len, latency in latencies]), - metric_sumamry_from_points( - name="latency", - description="milliseconds/request (includes waiting time on server)" , - points=[1000 * latency for _, _, latency in latencies]), - metric_sumamry_from_points( - short_name="tpot", - name="per_output_token_latency", - description="milliseconds/output_token (includes waiting time on server)", - points=[1000 * latency / output_len for _, output_len, latency in latencies]), - metric_sumamry_from_points( - name="input_length", - description="input length", - points=[float(prompt_len) for prompt_len, _, _ in latencies]), - metric_sumamry_from_points( - name="output_length", - description="output length", - points=[float(output_len) for _, output_len, _ in latencies]), - MetricSummary( - name = "throughput", - description = "throughput", - mean = (len(latencies) / ((timestamp_end - timestamp_start) / NS_IN_SEC)), - ), - ], - server_metrics = server_metrics - )) - - # Each element in the output list is a report for each step - def to_text_reports(self, write_to_files: bool = False) -> List[str]: - output : Dict[str, str] = {} - required_stats = ["latency", "throughput", "input_length", "output_length", "per_output_token_latency"] - for step in self.steps: - if not all(required_stat in [metric['name'] for metric in step['local_metrics']] for required_stat in required_stats): - raise Exception(f"All of the following stats must be recorded: {required_stats}") - - for step in self.steps: - step_output : List[str] = [] - total_time = (step['timestamp_end'] - step['timestamp_start']) / NS_IN_SEC - total_output_tokens = np.sum([output_len for _, output_len, _ in step['latencies']]) - output_tokens_per_second = total_output_tokens / total_time - output_tokens_per_min = 60 * output_tokens_per_second - - total_input_tokens = np.sum([prompt_len for prompt_len, _, _ in step['latencies']]) - input_tokens_per_min = 60 * total_input_tokens / total_time - - total_tokens = total_input_tokens + total_output_tokens - tokens_per_min = 60 * total_tokens / total_time - step_output.append(f"====Result for Model: {self.config['model']}====") - step_output.append(f"Errors: {step['errors']}") - step_output.append(f"Total time: {total_time:.2f} s") - step_output.append(f"Successful/total requests: {len(step['latencies'])}/{step['num_prompts_attempted']}") - step_output.append(f"Requests/min: {60 * step['num_prompts_attempted'] / total_time:.2f}") - step_output.append(f"Output_tokens/min: {output_tokens_per_min:.2f}") - step_output.append(f"Input_tokens/min: {input_tokens_per_min:.2f}") - step_output.append(f"Tokens/min: {tokens_per_min:.2f}") - - if self.args.machine_cost: - step_output.append( - f"Cost $/1k tokens: {self.args.machine_cost * 1000 / (60 * output_tokens_per_min)}" - ) - for metric in step['local_metrics']: - step_output.append(f"Average {metric['description']}:" f" {metric['mean']:.2f}") - output_filename = f"latency-profile-{datetime.fromtimestamp(step['timestamp_start'] / NS_IN_SEC).strftime('%Y-%m-%d_%H-%M-%S')}.txt" - output[output_filename] = '\n'.join(step_output) - if write_to_files: - with open(output_filename, 'w') as file: - file.write(output[output_filename]) - return list(output.values()) - - # The output is a a single 
json summary of all steps - def to_json_report(self, write_to_file: bool = False) -> Dict: - output = { - "config": { - "num_models": len(self.args.models) if self.args.save_aggregated_result else 1, - "start_time": { - "seconds" : self.steps[0]["timestamp_start"] // NS_IN_SEC, - "nanos" : self.steps[0]["timestamp_start"] % NS_IN_SEC, - }, - **self.config, - }, - "summary_stats": { - "stats": [ - { - "request_rate": step["request_rate"], - **{metric["short_name"]: metric for metric in step["local_metrics"] if "short_name" in metric}, - "model_server_metrics": [ - {"name": server_metric["name"], **server_metric} - for server_metric in step["server_metrics"] - ] if step["server_metrics"] is not None else [] - } - for step in self.steps - ] - }, - - # Legacy use case, use config if possible - "dimensions": { - "date": self.args.start_datetime.strftime('%Y%m%d-%H%M%S'), - "backend": self.args.backend, - "model_id": self.config['model'], - "tokenizer_id": self.args.tokenizer, - } if len(self.steps) == 1 else None, - # Legacy use case, use summary_stats if possible - "metrics" : { - # Traffic - "num_prompts_attempted": 0, - "num_prompts_succeeded": 0, - "request_rate": self.steps[0]['request_rate'], - } if len(self.steps) == 1 else None, - } - - if write_to_file: - model_without_slash = self.config['model'].replace("/","-") - file_name = ( - f"{self.args.file_prefix}-{self.args.backend}-{self.args.start_datetime.strftime('%Y%m%d-%H%M%S')}-{model_without_slash}.json" - ) - with open(file_name, "w", encoding="utf-8") as outfile: - json.dump(output, outfile) - return output - class Backend(ABC): """ An abstract base class for Backend that defines the interface @@ -572,6 +353,321 @@ def get_response_length( output_token_ids = tokenizer(response["choices"][0]["text"]).input_ids return len(output_token_ids) +class BenchmarkConfig(TypedDict): + model: str + model_server: str + start_time: float + +class MetricSummary(TypedDict, total=False): + short_name: Optional[str] + name: str + description: str + mean: float + median: Optional[float] + sd: Optional[float] + min: Optional[float] + max: Optional[float] + p90: Optional[float] + p99: Optional[float] + +class BenchmarkingStepReport(TypedDict): + """Result for one step""" + request_rate: float + timestamp_start: float + timestamp_end: float + num_prompts_attempted: int + latencies: List + local_metrics: List[MetricSummary] + server_metrics: Optional[List[MetricSummary]] + errors: Dict[str, int] + +class BenchmarkingReport(): + """Results for all steps for a single model""" + args: argparse.Namespace + config: BenchmarkConfig + steps: List[BenchmarkingStepReport] + + def __init__(self, args : argparse.Namespace, model: str, start_time: float): + self.args = args + self.config = BenchmarkConfig( + model = model, + model_server = args.backend, + start_time = start_time + ) + self.steps = [] + + def record_metrics_for_step( + self, + request_rate: float, + timestamp_start: float, + timestamp_end: float, + num_prompts_attempted : int, + latencies: List, + errors: Dict[str, int], + backend: Backend, + ): + + def fetch_metrics_from_gmp(backend: Backend, duration: float) -> List[MetricSummary]: + """Gets summaries for metrics queried from GMP, queries vary per model server""" + + # Creates a credentials object from the default service account file + # Assumes that script has appropriate default credentials set up, ref: + # https://googleapis.dev/python/google-auth/latest/user-guide.html#application-default-credentials + credentials, project_id = 
google.auth.default() + # Prepare an authentication request - helps format the request auth token + auth_req = google.auth.transport.requests.Request() + + # Request refresh tokens + credentials.refresh(auth_req) + url='https://monitoring.googleapis.com/v1/projects/%s/location/global/prometheus/api/v1/metadata' % (project_id) + headers_api = {'Authorization': 'Bearer ' + credentials.token} + request_post = requests.get(url=url, headers=headers_api) + all_metrics_metadata = request_post.json() + if request_post.ok is not True: + print("HTTP Error: %s" % (all_metrics_metadata)) + return [] + if all_metrics_metadata["status"] != "success": + print("Metadata error response: %s" % all_metrics_metadata["error"]) + return [] + + metrics_list : List[MetricSummary] = [] + for metric in backend.get_server_metrics(): + print("Metric Name: %s" % (metric)) + + # Find metric type + metric_type = all_metrics_metadata['data'][metric] + if all_metrics_metadata['data'][metric] is None: + print("No metric found for: %s" % metric) + return [] + metric_type = metric_type[0]['type'] + + metric_results = {} + # Queries scrape all metrics collected from the last $DURATION seconds from the backend's related + # podmonitoring spec assumed to be named "$BACKEND-podmonitoring" + queries = { + "gauge": { + "Mean": "avg_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, self.args.backend, duration), + "Median": "quantile_over_time(0.5, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, self.args.backend, duration), + "Sd": "stddev_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, self.args.backend, duration), + "Min": "min_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, self.args.backend, duration), + "Max": "max_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, self.args.backend, duration), + "P90": "quantile_over_time(0.9, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, self.args.backend, duration), + "P99": "quantile_over_time(0.99, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, self.args.backend, duration), + }, + "histogram": { + "Mean": "sum(rate(%s_sum{job='%s-podmonitoring'}[%.0fs])) / sum(rate(%s_count{job='%s-podmonitoring'}[%.0fs]))" % (metric, self.args.backend, duration, metric, self.args.backend, duration), + "Median": "histogram_quantile(0.5, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, self.args.backend, duration), + "Min": "histogram_quantile(0, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, self.args.backend, duration), + "Max": "histogram_quantile(1, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, self.args.backend, duration), + "P90": "histogram_quantile(0.9, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, self.args.backend, duration), + "P99": "histogram_quantile(0.99, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, self.args.backend, duration), + } + } + + metric_data : MetricSummary = { + "name": metric, + "description": f"Metrics for {metric} from {self.args.backend} backend", + } + for query_name, query in queries[metric_type].items(): + + # Configure respective query + url = f'https://monitoring.googleapis.com/v1/projects/{project_id}/location/global/prometheus/api/v1/query' + headers_api = {'Authorization': f'Bearer {credentials.token}'} + params = {'query': query} + + request_post = requests.get(url=url, headers=headers_api, params=params) + response = request_post.json() + + # handle response + if request_post.ok: + if response["status"] == 
"success": + metric_results[query_name] = float(response["data"]["result"][0]["value"][1]) + print("%s: %s" % (query_name, response["data"]["result"][0]["value"][1])) + else: + print("Cloud Monitoring PromQL Error: %s" % (response["error"])) + else: + print("HTTP Error: %s" % (response)) + + # Handle response + if request_post.ok and response["status"] == "success": + result_value = float(response["data"]["result"][0]["value"][1]) + if query_name == "Mean": + metric_data["mean"] = result_value + elif query_name == "Median": + metric_data["median"] = result_value + elif query_name == "Sd": + metric_data["sd"] = result_value + elif query_name == "Min": + metric_data["min"] = result_value + elif query_name == "Max": + metric_data["max"] = result_value + elif query_name == "P90": + metric_data["p90"] = result_value + elif query_name == "P99": + metric_data["p99"] = result_value + else: + error_message = response.get("error", "HTTP Error") + print(f"Error fetching {query_name} for {metric}: {error_message}") + + metrics_list.append(metric_data) + return metrics_list + + def metric_sumamry_from_points(name: str, description: str, points : List[float], short_name: Optional[str] = None) -> MetricSummary: + mean = np.mean(points) if points else 0 + median = np.median(points) if points else 0 + sd = np.std(points) if points else 0 + min = np.min(points) if points else 0 + max = np.max(points) if points else 0 + p90 = np.percentile(points, 90) if points else 0 + p99 = np.percentile(points, 99) if points else 0 + + return MetricSummary( + short_name = short_name if short_name is not None else name, + name = name, + description = description, + mean = float(mean), + median = float(median), + sd = float(sd), + min = float(min), + max = float(max), + p90 = float(p90), + p99 = float(p99) + ) + + total_time = (timestamp_end - timestamp_start)/ NS_IN_SEC + if self.args.scrape_server_metrics: + server_metrics = fetch_metrics_from_gmp(backend, total_time) + + self.steps.append(BenchmarkingStepReport( + request_rate = request_rate, + timestamp_start = timestamp_start, + timestamp_end = timestamp_end, + num_prompts_attempted = num_prompts_attempted, + latencies = latencies, + errors = errors, + local_metrics = [ + metric_sumamry_from_points( + name="per_token_latency", + description="seconds/token (includes waiting time on server)", + points=[latency / (prompt_len + output_len) for prompt_len, output_len, latency in latencies]), + metric_sumamry_from_points( + name="latency", + description="milliseconds/request (includes waiting time on server)" , + points=[1000 * latency for _, _, latency in latencies]), + metric_sumamry_from_points( + short_name="tpot", + name="per_output_token_latency", + description="milliseconds/output_token (includes waiting time on server)", + points=[1000 * latency / output_len for _, output_len, latency in latencies]), + metric_sumamry_from_points( + name="input_length", + description="input length", + points=[float(prompt_len) for prompt_len, _, _ in latencies]), + metric_sumamry_from_points( + name="output_length", + description="output length", + points=[float(output_len) for _, output_len, _ in latencies]), + MetricSummary( + name = "throughput", + description = "throughput", + mean = (len(latencies) / ((timestamp_end - timestamp_start) / NS_IN_SEC)), + ), + ], + server_metrics = server_metrics + )) + + # Each element in the output list is a report for each step + def to_text_reports(self, write_to_files: bool = False) -> List[str]: + output : Dict[str, str] = {} + required_stats = 
["latency", "throughput", "input_length", "output_length", "per_output_token_latency"] + for step in self.steps: + if not all(required_stat in [metric['name'] for metric in step['local_metrics']] for required_stat in required_stats): + raise Exception(f"All of the following stats must be recorded: {required_stats}") + + for step in self.steps: + step_output : List[str] = [] + total_time = (step['timestamp_end'] - step['timestamp_start']) / NS_IN_SEC + total_output_tokens = np.sum([output_len for _, output_len, _ in step['latencies']]) + output_tokens_per_second = total_output_tokens / total_time + output_tokens_per_min = 60 * output_tokens_per_second + + total_input_tokens = np.sum([prompt_len for prompt_len, _, _ in step['latencies']]) + input_tokens_per_min = 60 * total_input_tokens / total_time + + total_tokens = total_input_tokens + total_output_tokens + tokens_per_min = 60 * total_tokens / total_time + step_output.append(f"====Result for Model: {self.config['model']}====") + step_output.append(f"Errors: {step['errors']}") + step_output.append(f"Total time: {total_time:.2f} s") + step_output.append(f"Successful/total requests: {len(step['latencies'])}/{step['num_prompts_attempted']}") + step_output.append(f"Requests/min: {60 * step['num_prompts_attempted'] / total_time:.2f}") + step_output.append(f"Output_tokens/min: {output_tokens_per_min:.2f}") + step_output.append(f"Input_tokens/min: {input_tokens_per_min:.2f}") + step_output.append(f"Tokens/min: {tokens_per_min:.2f}") + + if self.args.machine_cost: + step_output.append( + f"Cost $/1k tokens: {self.args.machine_cost * 1000 / (60 * output_tokens_per_min)}" + ) + for metric in step['local_metrics']: + step_output.append(f"Average {metric['description']}:" f" {metric['mean']:.2f}") + output_filename = f"latency-profile-{datetime.fromtimestamp(step['timestamp_start'] / NS_IN_SEC).strftime('%Y-%m-%d_%H-%M-%S')}.txt" + output[output_filename] = '\n'.join(step_output) + if write_to_files: + with open(output_filename, 'w') as file: + file.write(output[output_filename]) + return list(output.values()) + + # The output is a a single json summary of all steps + def to_json_report(self, write_to_file: bool = False) -> Dict: + output = { + "config": { + "num_models": len(self.args.models) if self.args.save_aggregated_result else 1, + "start_time": { + "seconds" : self.steps[0]["timestamp_start"] // NS_IN_SEC, + "nanos" : self.steps[0]["timestamp_start"] % NS_IN_SEC, + }, + **self.config, + }, + "summary_stats": { + "stats": [ + { + "request_rate": step["request_rate"], + **{metric["short_name"]: metric for metric in step["local_metrics"] if "short_name" in metric}, + "model_server_metrics": [ + {"name": server_metric["name"], **server_metric} + for server_metric in step["server_metrics"] + ] if step["server_metrics"] is not None else [] + } + for step in self.steps + ] + }, + + # Legacy use case, use config if possible + "dimensions": { + "date": self.args.start_datetime.strftime('%Y%m%d-%H%M%S'), + "backend": self.args.backend, + "model_id": self.config['model'], + "tokenizer_id": self.args.tokenizer, + } if len(self.steps) == 1 else None, + # Legacy use case, use summary_stats if possible + "metrics" : { + # Traffic + "num_prompts_attempted": 0, + "num_prompts_succeeded": 0, + "request_rate": self.steps[0]['request_rate'], + } if len(self.steps) == 1 else None, + } + + if write_to_file: + model_without_slash = self.config['model'].replace("/","-") + file_name = ( + 
f"{self.args.file_prefix}-{self.args.backend}-{self.args.start_datetime.strftime('%Y%m%d-%H%M%S')}-{model_without_slash}.json" + ) + with open(file_name, "w", encoding="utf-8") as outfile: + json.dump(output, outfile) + return output + def init_errors_map() -> Dict[str, int]: errors = { "ClientConnectorError": 0, @@ -599,112 +695,6 @@ def getBackend(backend: str) -> Backend: else: raise ValueError("Unsupported backend") - -def fetch_metrics_from_gmp(metrics: List[str], duration: float, backend: str) -> List[MetricSummary]: - """Gets summaries for metrics queried from GMP, queries vary per model server""" - - # Creates a credentials object from the default service account file - # Assumes that script has appropriate default credentials set up, ref: - # https://googleapis.dev/python/google-auth/latest/user-guide.html#application-default-credentials - credentials, project_id = google.auth.default() - # Prepare an authentication request - helps format the request auth token - auth_req = google.auth.transport.requests.Request() - - # Request refresh tokens - credentials.refresh(auth_req) - url='https://monitoring.googleapis.com/v1/projects/%s/location/global/prometheus/api/v1/metadata' % (project_id) - headers_api = {'Authorization': 'Bearer ' + credentials.token} - request_post = requests.get(url=url, headers=headers_api) - all_metrics_metadata = request_post.json() - if request_post.ok is not True: - print("HTTP Error: %s" % (all_metrics_metadata)) - return [] - if all_metrics_metadata["status"] != "success": - print("Metadata error response: %s" % all_metrics_metadata["error"]) - return [] - - metrics_list : List[MetricSummary] = [] - for metric in metrics: - print("Metric Name: %s" % (metric)) - - # Find metric type - metric_type = all_metrics_metadata['data'][metric] - if all_metrics_metadata['data'][metric] is None: - print("No metric found for: %s" % metric) - return [] - metric_type = metric_type[0]['type'] - - metric_results = {} - # Queries scrape all metrics collected from the last $DURATION seconds from the backend's related - # podmonitoring spec assumed to be named "$BACKEND-podmonitoring" - queries = { - "gauge": { - "Mean": "avg_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), - "Median": "quantile_over_time(0.5, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), - "Sd": "stddev_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), - "Min": "min_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), - "Max": "max_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), - "P90": "quantile_over_time(0.9, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), - "P99": "quantile_over_time(0.99, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration), - }, - "histogram": { - "Mean": "sum(rate(%s_sum{job='%s-podmonitoring'}[%.0fs])) / sum(rate(%s_count{job='%s-podmonitoring'}[%.0fs]))" % (metric, backend, duration, metric, backend, duration), - "Median": "histogram_quantile(0.5, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration), - "Min": "histogram_quantile(0, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration), - "Max": "histogram_quantile(1, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration), - "P90": "histogram_quantile(0.9, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration), - "P99": 
"histogram_quantile(0.99, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration), - } - } - - metric_data : MetricSummary = { - "name": metric, - "description": f"Metrics for {metric} from {backend} backend", - } - for query_name, query in queries[metric_type].items(): - - # Configure respective query - url = f'https://monitoring.googleapis.com/v1/projects/{project_id}/location/global/prometheus/api/v1/query' - headers_api = {'Authorization': f'Bearer {credentials.token}'} - params = {'query': query} - - request_post = requests.get(url=url, headers=headers_api, params=params) - response = request_post.json() - - # handle response - if request_post.ok: - if response["status"] == "success": - metric_results[query_name] = float(response["data"]["result"][0]["value"][1]) - print("%s: %s" % (query_name, response["data"]["result"][0]["value"][1])) - else: - print("Cloud Monitoring PromQL Error: %s" % (response["error"])) - else: - print("HTTP Error: %s" % (response)) - - # Handle response - if request_post.ok and response["status"] == "success": - result_value = float(response["data"]["result"][0]["value"][1]) - if query_name == "Mean": - metric_data["mean"] = result_value - elif query_name == "Median": - metric_data["median"] = result_value - elif query_name == "Sd": - metric_data["sd"] = result_value - elif query_name == "Min": - metric_data["min"] = result_value - elif query_name == "Max": - metric_data["max"] = result_value - elif query_name == "P90": - metric_data["p90"] = result_value - elif query_name == "P99": - metric_data["p99"] = result_value - else: - error_message = response.get("error", "HTTP Error") - print(f"Error fetching {query_name} for {metric}: {error_message}") - - metrics_list.append(metric_data) - return metrics_list - async def generate_next_request( input_requests: List[Tuple[str, int, int]], request_rate_expr: str, @@ -729,159 +719,6 @@ async def generate_next_request( # The next request will be sent after the interval. await asyncio.sleep(interval) -async def send_request( - backend: str, - api_url: str, - prompt: str, - prompt_len: int, - output_len: int, - best_of: int, - use_beam_search: bool, - top_k: int, - tokenizer: PreTrainedTokenizerBase, - sax_model: str, - model: str, -) -> Tuple[Optional[Tuple[int, int, float]], Optional[Dict[str, int]]]: - """Sends request to server.""" - request_start_time = time.time() - errors = init_errors_map() - - headers = {"User-Agent": "Benchmark Client"} - if backend == "vllm": - pload = { - "model": model, - "prompt": prompt, - "n": 1, - "best_of": best_of, - "use_beam_search": use_beam_search, - "temperature": 0.0 if use_beam_search else 1.0, - "top_p": 1.0, - "max_tokens": output_len, - "ignore_eos": False, - "stream": False, - } - elif backend == "tgi": - params = { - "best_of": best_of, - "max_new_tokens": output_len, - "do_sample": True, - } - pload = { - "inputs": prompt, - "parameters": params, - } - elif backend == "naive_transformers": - # If max_length or top_k is not specified _MAX_LENGTH_DEFAULT = 200 and - # _TOP_K_DEFAULT = 10 in peft/handler.py will be used. 
- pload = { - "instances": [{ - "prompt": prompt, - "max_length": output_len, - "top_k": top_k, - }] - } - elif backend == "tensorrt_llm_triton": - pload = { - "text_input": prompt, - "max_tokens": output_len, - "beam_width": 1 if not use_beam_search else best_of, - "temperature": 0.0 if use_beam_search else 1.0, - "top_p": 1.0, - "bad_words": "", - "stop_words": "", - "stream": False, - } - elif backend == "sax": - pload = { - "model": sax_model, - "prompt": prompt, - "n": 1, - "best_of": best_of, - "use_beam_search": use_beam_search, - "temperature": 0.0 if use_beam_search else 1.0, - "top_p": 1.0, - "top_k": 50, - "max_tokens": output_len, - "stream": False, - } - elif backend == "jetstream": - pload = { - "prompt": prompt, - "max_tokens": output_len, - } - else: - raise ValueError(f"Unknown backend: {backend}") - - # Set client timeout to be 3 hrs. - timeout = aiohttp.ClientTimeout(total=CLIENT_TIMEOUT_SEC) - async with aiohttp.ClientSession(timeout=timeout,trust_env=True) as session: - while True: - try: - async with session.post(api_url, headers=headers, json=pload, ssl=False) as response: - output = await response.json() - - # Re-send the request if it failed. - if "error" not in output: - break - except aiohttp.client_exceptions.ClientConnectorError as client_err: - errors["ClientConnectorError"] += 1 - print(f"ClientConnectorError: {client_err}") - return None, errors - except asyncio.TimeoutError as timeout_err: - errors["TimeoutError"] += 1 - print(f"TimeoutError: {timeout_err}") - return None, errors - except aiohttp.client_exceptions.ClientOSError as e: - errors["ClientOSError"] += 1 - print(f"ClientOSError: {e}") - return None, errors - except aiohttp.client_exceptions.ContentTypeError as e: - print(f"ContentTypeError: {e}, response: {response}") - errors["ContentTypeError"] += 1 - return None, errors - except aiohttp.client_exceptions.ServerDisconnectedError as e: - errors["ServerDisconnectedError"] += 1 - print(f"ServerDisconnectedError: {e}") - return None, errors - except Exception as e: - print(f"Unknown error {e}") - errors["unknown_error"] += 1 - return None, errors - - request_end_time = time.time() - # Naive HF transformers generation and TensorRT-LLM generation stops at EOS - # tokens and the generation may be shorter than the ground-truth output - # sequence length. 
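Because generation can stop at EOS before reaching the requested length, the deleted branches below (and the new get_response_length implementations) re-measure the output with the tokenizer before any per-token math. A minimal sketch for the vLLM-style response shape, using only fields and metrics already present in the patch:

    # Re-measure the realized output length, then derive time per output token.
    output_token_ids = tokenizer(output["choices"][0]["text"]).input_ids
    output_len = len(output_token_ids)
    latency = request_end_time - request_start_time
    tpot_metric.observe(latency / output_len)    # seconds per generated token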
- if backend == "naive_transformers": - complete_pred = output["predictions"][0][0]["generated_text"] - new_text_start_index = complete_pred.find(NEW_TEXT_KEY) + len(NEW_TEXT_KEY) - pred = complete_pred[new_text_start_index:] - output_token_ids = tokenizer(pred).input_ids - output_len = len(output_token_ids) - prompt_len - elif backend == "tensorrt_llm_triton": - output_token_ids = tokenizer(output["text_output"]).input_ids - output_len = len(output_token_ids) - elif backend == "sax": - output_token_ids = tokenizer(output["choices"][0]["text"]).input_ids - output_len = len(output_token_ids) - elif backend == "tgi": - output_token_ids = tokenizer(output["generated_text"]).input_ids - output_len = len(output_token_ids) - elif backend == "vllm": - output_token_ids = tokenizer(output["choices"][0]["text"]).input_ids - output_len = len(output_token_ids) - elif backend == "jetstream": - output_token_ids = tokenizer(output["response"]).input_ids - output_len = len(output_token_ids) - - # (prompt len, output len, latency, success) - request_latency = (prompt_len, output_len, (request_end_time - request_start_time)) - tpot_metric.observe((request_end_time - request_start_time) / output_len) - prompt_length_metric.observe(prompt_len) - response_length_metric.observe(output_len) - - return request_latency, None - def get_filtered_dataset( dataset_path: str, max_input_len: int, @@ -1013,7 +850,7 @@ async def benchmark( if errors: for err, count in errors.items(): all_errors[err] = all_errors[err] + count - benchmark_results.record_metrics_for_step(step['rate'], step_start_timestamp, step_end_timestamp, prompts_sent_this_step, all_latencies, all_errors) + benchmark_results.record_metrics_for_step(step['rate'], step_start_timestamp, step_end_timestamp, prompts_sent_this_step, all_latencies, all_errors, backend) print(f"Completed all steps, generating reports...") return benchmark_results @@ -1059,16 +896,10 @@ async def main(args: argparse.Namespace): print(f"Models to benchmark: {models}") random.seed(args.seed) np.random.seed(args.seed) - endpoint = ( - "v1/completions" - if args.backend == "vllm" - else args.endpoint - ) print(f"Starting Prometheus Server on port {PROMETHEUS_PORT}") start_http_server(PROMETHEUS_PORT) - api_url = f"http://{args.host}:{args.port}/{endpoint}" tokenizer = AutoTokenizer.from_pretrained( args.tokenizer, trust_remote_code=args.trust_remote_code ) From 8517fd9a0a8a94673c3d8897a7cebf82c3f3cfde Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 12 Nov 2024 23:20:17 +0000 Subject: [PATCH 17/27] changes to json report --- .../container/benchmark_serving.py | 45 +++++++++++-------- 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index c503889e1..ce6c44ecd 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -359,7 +359,7 @@ class BenchmarkConfig(TypedDict): start_time: float class MetricSummary(TypedDict, total=False): - short_name: Optional[str] + json_field_name: Optional[str] name: str description: str mean: float @@ -512,7 +512,7 @@ def fetch_metrics_from_gmp(backend: Backend, duration: float) -> List[MetricSumm metrics_list.append(metric_data) return metrics_list - def metric_sumamry_from_points(name: str, description: str, points : List[float], short_name: Optional[str] = None) -> 
MetricSummary: + def metric_sumamry_from_points(name: str, description: str, points : List[float], json_field_name: Optional[str] = None) -> MetricSummary: mean = np.mean(points) if points else 0 median = np.median(points) if points else 0 sd = np.std(points) if points else 0 @@ -522,7 +522,7 @@ def metric_sumamry_from_points(name: str, description: str, points : List[float] p99 = np.percentile(points, 99) if points else 0 return MetricSummary( - short_name = short_name if short_name is not None else name, + json_field_name = json_field_name if json_field_name is not None else name, name = name, description = description, mean = float(mean), @@ -551,21 +551,22 @@ def metric_sumamry_from_points(name: str, description: str, points : List[float] description="seconds/token (includes waiting time on server)", points=[latency / (prompt_len + output_len) for prompt_len, output_len, latency in latencies]), metric_sumamry_from_points( - name="latency", + json_field_name="request_latency", + name="latency", description="milliseconds/request (includes waiting time on server)" , points=[1000 * latency for _, _, latency in latencies]), metric_sumamry_from_points( - short_name="tpot", + json_field_name="tpot", name="per_output_token_latency", description="milliseconds/output_token (includes waiting time on server)", points=[1000 * latency / output_len for _, output_len, latency in latencies]), metric_sumamry_from_points( name="input_length", - description="input length", + description="length of prompt", points=[float(prompt_len) for prompt_len, _, _ in latencies]), metric_sumamry_from_points( name="output_length", - description="output length", + description="length of response", points=[float(output_len) for _, output_len, _ in latencies]), MetricSummary( name = "throughput", @@ -622,18 +623,18 @@ def to_text_reports(self, write_to_files: bool = False) -> List[str]: def to_json_report(self, write_to_file: bool = False) -> Dict: output = { "config": { + **self.config, "num_models": len(self.args.models) if self.args.save_aggregated_result else 1, "start_time": { "seconds" : self.steps[0]["timestamp_start"] // NS_IN_SEC, "nanos" : self.steps[0]["timestamp_start"] % NS_IN_SEC, }, - **self.config, }, "summary_stats": { "stats": [ { "request_rate": step["request_rate"], - **{metric["short_name"]: metric for metric in step["local_metrics"] if "short_name" in metric}, + **{metric["json_field_name"]: metric for metric in step["local_metrics"] if "json_field_name" in metric}, "model_server_metrics": [ {"name": server_metric["name"], **server_metric} for server_metric in step["server_metrics"] @@ -649,14 +650,22 @@ def to_json_report(self, write_to_file: bool = False) -> Dict: "backend": self.args.backend, "model_id": self.config['model'], "tokenizer_id": self.args.tokenizer, - } if len(self.steps) == 1 else None, + } if len(self.args.models.split(',')) == 1 else None, # Legacy use case, use summary_stats if possible - "metrics" : { - # Traffic - "num_prompts_attempted": 0, - "num_prompts_succeeded": 0, - "request_rate": self.steps[0]['request_rate'], - } if len(self.steps) == 1 else None, + "metrics": { + # Traffic metrics + "num_prompts_attempted": self.steps[0]['num_prompts_attempted'], + "num_prompts_succeeded": self.steps[0]['latencies'], + "request_rate": self.steps[0]['request_rate'], + + **{ + f"{stat}_{metric['name']}": value + for metric in self.steps[0]["local_metrics"] + if "json_field_name" in metric + for stat, value in metric.items() + if stat not in ["name", "description", "json_field_name"] and 
value is not None + } + } if len(self.steps) == 1 else None } if write_to_file: @@ -679,7 +688,7 @@ def init_errors_map() -> Dict[str, int]: } return errors -def getBackend(backend: str) -> Backend: +def get_backend(backend: str) -> Backend: if backend == "vllm": return vLLMBackend() elif backend == "tgi": @@ -905,7 +914,7 @@ async def main(args: argparse.Namespace): ) args.start_datetime = datetime.fromtimestamp(time.time_ns() / NS_IN_SEC) - backend: Backend = getBackend(args.backend) + backend: Backend = get_backend(args.backend) reports : List[BenchmarkingReport] = await asyncio.gather( *[benchmark(args, backend, tokenizer, model) for model in models] ) From 2cb77b72c27fad8451b5da7156c0e10c63becfe5 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 12 Nov 2024 23:30:58 +0000 Subject: [PATCH 18/27] nit --- .../tools/profile-generator/container/benchmark_serving.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index ce6c44ecd..6e796dae8 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -570,7 +570,7 @@ def metric_sumamry_from_points(name: str, description: str, points : List[float] points=[float(output_len) for _, output_len, _ in latencies]), MetricSummary( name = "throughput", - description = "throughput", + description = "throughput in requests per second", mean = (len(latencies) / ((timestamp_end - timestamp_start) / NS_IN_SEC)), ), ], @@ -655,7 +655,7 @@ def to_json_report(self, write_to_file: bool = False) -> Dict: "metrics": { # Traffic metrics "num_prompts_attempted": self.steps[0]['num_prompts_attempted'], - "num_prompts_succeeded": self.steps[0]['latencies'], + "num_prompts_succeeded": len(self.steps[0]['latencies']), "request_rate": self.steps[0]['request_rate'], **{ @@ -663,7 +663,6 @@ def to_json_report(self, write_to_file: bool = False) -> Dict: for metric in self.steps[0]["local_metrics"] if "json_field_name" in metric for stat, value in metric.items() - if stat not in ["name", "description", "json_field_name"] and value is not None } } if len(self.steps) == 1 else None } From 2aa34bf9bc9ae92f2c900beae6e172abdedbe629 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 12 Nov 2024 23:33:37 +0000 Subject: [PATCH 19/27] revert --- .../tools/profile-generator/container/benchmark_serving.py | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index 6e796dae8..80daeb2b4 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -663,6 +663,7 @@ def to_json_report(self, write_to_file: bool = False) -> Dict: for metric in self.steps[0]["local_metrics"] if "json_field_name" in metric for stat, value in metric.items() + if stat not in ["name", "description", "json_field_name"] and value is not None } } if len(self.steps) == 1 else None } From b02e1090fade5540282439d108cdf966272a6ddb Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 12 Nov 2024 23:46:32 +0000 Subject: [PATCH 20/27] missing server_metrics in metrics --- .../container/benchmark_serving.py | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 
12 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index 80daeb2b4..b843e7001 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -360,15 +360,15 @@ class BenchmarkConfig(TypedDict): class MetricSummary(TypedDict, total=False): json_field_name: Optional[str] - name: str - description: str - mean: float - median: Optional[float] - sd: Optional[float] - min: Optional[float] - max: Optional[float] - p90: Optional[float] - p99: Optional[float] + name: str + description: str + mean: float + median: Optional[float] + sd: Optional[float] + min: Optional[float] + max: Optional[float] + p90: Optional[float] + p99: Optional[float] class BenchmarkingStepReport(TypedDict): """Result for one step""" @@ -621,6 +621,7 @@ def to_text_reports(self, write_to_files: bool = False) -> List[str]: # The output is a a single json summary of all steps def to_json_report(self, write_to_file: bool = False) -> Dict: + print(self.steps[0]["local_metrics"]) output = { "config": { **self.config, @@ -634,7 +635,7 @@ def to_json_report(self, write_to_file: bool = False) -> Dict: "stats": [ { "request_rate": step["request_rate"], - **{metric["json_field_name"]: metric for metric in step["local_metrics"] if "json_field_name" in metric}, + **{(metric["json_field_name"] if "json_field_name" in metric else metric["name"]): metric for metric in step["local_metrics"]}, "model_server_metrics": [ {"name": server_metric["name"], **server_metric} for server_metric in step["server_metrics"] @@ -657,14 +658,17 @@ def to_json_report(self, write_to_file: bool = False) -> Dict: "num_prompts_attempted": self.steps[0]['num_prompts_attempted'], "num_prompts_succeeded": len(self.steps[0]['latencies']), "request_rate": self.steps[0]['request_rate'], - **{ f"{stat}_{metric['name']}": value for metric in self.steps[0]["local_metrics"] if "json_field_name" in metric for stat, value in metric.items() if stat not in ["name", "description", "json_field_name"] and value is not None - } + }, + "server_metrics": [ + {"name": server_metric["name"], **server_metric} + for server_metric in step["server_metrics"] + ] if self.steps[0]["server_metrics"] is not None else [] } if len(self.steps) == 1 else None } From 4f7af86bd230ed322db8f0cdfe18885c8b6e7b68 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 12 Nov 2024 23:47:55 +0000 Subject: [PATCH 21/27] nit --- .../tools/profile-generator/container/benchmark_serving.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index b843e7001..261ddb102 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -667,7 +667,7 @@ def to_json_report(self, write_to_file: bool = False) -> Dict: }, "server_metrics": [ {"name": server_metric["name"], **server_metric} - for server_metric in step["server_metrics"] + for server_metric in self.steps[0]["server_metrics"] ] if self.steps[0]["server_metrics"] is not None else [] } if len(self.steps) == 1 else None } From 1838f44d7f388b6adbb4cf2524c8dcdc2da7ce77 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 12 Nov 2024 23:50:26 +0000 
Subject: [PATCH 22/27] remove prints --- .../tools/profile-generator/container/benchmark_serving.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index 261ddb102..4a4e9bdf7 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -621,7 +621,6 @@ def to_text_reports(self, write_to_files: bool = False) -> List[str]: # The output is a a single json summary of all steps def to_json_report(self, write_to_file: bool = False) -> Dict: - print(self.steps[0]["local_metrics"]) output = { "config": { **self.config, @@ -1056,7 +1055,6 @@ def parse_request_rates(input_str): if os.path.isfile(input_str): with open(input_str, 'r') as file: input_str = file.read() - print(input_str) try: # Parse the input string as JSON request_data = json.loads(input_str) From 3ba738b59bbafa25b4d07038aa49e4d464941be7 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 12 Nov 2024 23:52:44 +0000 Subject: [PATCH 23/27] tweak fields --- .../tools/profile-generator/container/benchmark_serving.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index 4a4e9bdf7..161c303ad 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -654,8 +654,7 @@ def to_json_report(self, write_to_file: bool = False) -> Dict: # Legacy use case, use summary_stats if possible "metrics": { # Traffic metrics - "num_prompts_attempted": self.steps[0]['num_prompts_attempted'], - "num_prompts_succeeded": len(self.steps[0]['latencies']), + "num_prompts": self.steps[0]['num_prompts_attempted'], "request_rate": self.steps[0]['request_rate'], **{ f"{stat}_{metric['name']}": value From 98785c0bd739c74fc0181a9592b664a24d4cdc17 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Wed, 13 Nov 2024 17:55:11 +0000 Subject: [PATCH 24/27] correct json output --- .../container/benchmark_serving.py | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index 161c303ad..c5ec11718 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -634,7 +634,13 @@ def to_json_report(self, write_to_file: bool = False) -> Dict: "stats": [ { "request_rate": step["request_rate"], - **{(metric["json_field_name"] if "json_field_name" in metric else metric["name"]): metric for metric in step["local_metrics"]}, + **{(metric["json_field_name"] if "json_field_name" in metric else metric["name"]): { + stat: value + for stat, value in metric.items() + if stat not in ["name", "description", "json_field_name"] and value is not None + } + for metric in step["local_metrics"] + }, "model_server_metrics": [ {"name": server_metric["name"], **server_metric} for server_metric in step["server_metrics"] @@ -656,17 +662,20 @@ def to_json_report(self, write_to_file: bool = False) -> Dict: # Traffic metrics "num_prompts": 
self.steps[0]['num_prompts_attempted'], "request_rate": self.steps[0]['request_rate'], + "benchmark_time": (self.steps[0]['timestamp_end'] - self.steps[0]['timestamp_start']) / NS_IN_SEC, + "throughput_rps": (len(self.steps[0]['latencies']) / ((self.steps[0]['timestamp_end'] - self.steps[0]['timestamp_start']) / NS_IN_SEC)), + "throughput": np.sum([output_len for _, output_len, _ in self.steps[0]['latencies']]) / ((self.steps[0]['timestamp_end'] - self.steps[0]['timestamp_start']) / NS_IN_SEC), **{ - f"{stat}_{metric['name']}": value + f"{'avg' if stat == 'mean' else stat}_{metric['name']}": value for metric in self.steps[0]["local_metrics"] if "json_field_name" in metric for stat, value in metric.items() if stat not in ["name", "description", "json_field_name"] and value is not None }, - "server_metrics": [ - {"name": server_metric["name"], **server_metric} - for server_metric in self.steps[0]["server_metrics"] - ] if self.steps[0]["server_metrics"] is not None else [] + "server_metrics": { + server_metric["name"]: {k.capitalize(): v for k, v in server_metric.items() if k != "name"} + for server_metric in self.steps[0]["server_metrics"] + } if self.steps[0]["server_metrics"] is not None else {} } if len(self.steps) == 1 else None } From be0a89e91abea67f7cd0e98fc846da89689f6120 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Wed, 13 Nov 2024 19:01:14 +0000 Subject: [PATCH 25/27] to_dict --- .../container/benchmark_serving.py | 116 ++++++++++-------- 1 file changed, 62 insertions(+), 54 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index c5ec11718..e0989b097 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -37,7 +37,55 @@ prompt_length_metric = Histogram("LatencyProfileGenerator:prompt_length", "Input prompt length", buckets=[2**i for i in range(1, 16)]) response_length_metric = Histogram("LatencyProfileGenerator:response_length", "Response length", buckets=[2**i for i in range(1, 16)]) tpot_metric = Histogram('LatencyProfileGenerator:time_per_output_token', 'Time per output token per request') + +class ErrorsReport(): + ClientConnectorErrors: int + TimeoutErrors: int + ContentTypeErrors: int + ClientOSErrors: int + ServerDisconnectedErrors: int + unknown_errors: int + + def __init__(self): + self.ClientConnectorErrors = 0 + self.TimeoutErrors = 0 + self.ContentTypeErrors = 0 + self.ClientOSErrors = 0 + self.ServerDisconnectedErrors = 0 + self.unknown_errors = 0 + + def to_dict(self) -> dict: + return {k: v for k, v in self.__dict__.items() if isinstance(v, int)} + + def record_error(self, error: Exception): + if isinstance(error, aiohttp.client_exceptions.ClientConnectorError): + self.ClientConnectorErrors += 1 + print(f"ClientConnectorError: {error}") + elif isinstance(error, asyncio.TimeoutError): + self.TimeoutErrors += 1 + print(f"TimeoutError: {error}") + elif isinstance(error, aiohttp.client_exceptions.ContentTypeError): + self.ContentTypeErrors += 1 + print(f"ContentTypeError: {error}") + elif isinstance(error, aiohttp.client_exceptions.ClientOSError): + self.ClientOSErrors += 1 + print(f"ClientOSError: {error}") + elif isinstance(error, aiohttp.client_exceptions.ServerDisconnectedError): + self.ServerDisconnectedErrors += 1 + print(f"ServerDisconnectedError: {error}") + else: + self.unknown_errors += 1 + print(f"Unknown 
error: {error}") + + def append_report(self, report: "ErrorsReport"): + self.ClientConnectorErrors += report.ClientConnectorErrors + self.TimeoutErrors += report.TimeoutErrors + self.ContentTypeErrors += report.ContentTypeErrors + self.ClientOSErrors += report.ClientOSErrors + self.ServerDisconnectedErrors += report.ServerDisconnectedErrors + self.unknown_errors += report.unknown_errors + class Backend(ABC): """ An abstract base class for Backend that defines the interface @@ -56,10 +104,10 @@ async def send_request( tokenizer: PreTrainedTokenizerBase, sax_model: str, model: str, - ) -> Tuple[Optional[Tuple[int, int, float]], Optional[Dict[str, int]]]: + ) -> Tuple[Optional[Tuple[int, int, float]], Optional[ErrorsReport]]: """Sends request to server.""" request_start_time = time.time() - errors = init_errors_map() + errors = ErrorsReport() headers = {"User-Agent": "Benchmark Client"} pload = self.create_request_payload( @@ -85,29 +133,8 @@ async def send_request( # Re-send the request if it failed. if "error" not in output: break - except aiohttp.client_exceptions.ClientConnectorError as client_err: - errors["ClientConnectorError"] += 1 - print(f"ClientConnectorError: {client_err}") - return None, errors - except asyncio.TimeoutError as timeout_err: - errors["TimeoutError"] += 1 - print(f"TimeoutError: {timeout_err}") - return None, errors - except aiohttp.client_exceptions.ClientOSError as e: - errors["ClientOSError"] += 1 - print(f"ClientOSError: {e}") - return None, errors - except aiohttp.client_exceptions.ContentTypeError as e: - print(f"ContentTypeError: {e}, response: {response}") - errors["ContentTypeError"] += 1 - return None, errors - except aiohttp.client_exceptions.ServerDisconnectedError as e: - errors["ServerDisconnectedError"] += 1 - print(f"ServerDisconnectedError: {e}") - return None, errors - except Exception as e: - print(f"Unknown error {e}") - errors["unknown_error"] += 1 + except Exception as e: + errors.record_error(e) return None, errors request_end_time = time.time() # Naive HF transformers generation and TensorRT-LLM generation stops at EOS @@ -379,7 +406,7 @@ class BenchmarkingStepReport(TypedDict): latencies: List local_metrics: List[MetricSummary] server_metrics: Optional[List[MetricSummary]] - errors: Dict[str, int] + errors: ErrorsReport class BenchmarkingReport(): """Results for all steps for a single model""" @@ -403,7 +430,7 @@ def record_metrics_for_step( timestamp_end: float, num_prompts_attempted : int, latencies: List, - errors: Dict[str, int], + errors: ErrorsReport, backend: Backend, ): @@ -432,7 +459,6 @@ def fetch_metrics_from_gmp(backend: Backend, duration: float) -> List[MetricSumm metrics_list : List[MetricSummary] = [] for metric in backend.get_server_metrics(): - print("Metric Name: %s" % (metric)) # Find metric type metric_type = all_metrics_metadata['data'][metric] @@ -482,7 +508,6 @@ def fetch_metrics_from_gmp(backend: Backend, duration: float) -> List[MetricSumm if request_post.ok: if response["status"] == "success": metric_results[query_name] = float(response["data"]["result"][0]["value"][1]) - print("%s: %s" % (query_name, response["data"]["result"][0]["value"][1])) else: print("Cloud Monitoring PromQL Error: %s" % (response["error"])) else: @@ -577,8 +602,9 @@ def metric_sumamry_from_points(name: str, description: str, points : List[float] server_metrics = server_metrics )) - # Each element in the output list is a report for each step def to_text_reports(self, write_to_files: bool = False) -> List[str]: + """Each element in the output 
list is a report for each step""" + output : Dict[str, str] = {} required_stats = ["latency", "throughput", "input_length", "output_length", "per_output_token_latency"] for step in self.steps: @@ -598,7 +624,7 @@ def to_text_reports(self, write_to_files: bool = False) -> List[str]: total_tokens = total_input_tokens + total_output_tokens tokens_per_min = 60 * total_tokens / total_time step_output.append(f"====Result for Model: {self.config['model']}====") - step_output.append(f"Errors: {step['errors']}") + step_output.append(f"Errors: {step['errors'].to_dict()}") step_output.append(f"Total time: {total_time:.2f} s") step_output.append(f"Successful/total requests: {len(step['latencies'])}/{step['num_prompts_attempted']}") step_output.append(f"Requests/min: {60 * step['num_prompts_attempted'] / total_time:.2f}") @@ -688,17 +714,6 @@ def to_json_report(self, write_to_file: bool = False) -> Dict: json.dump(output, outfile) return output -def init_errors_map() -> Dict[str, int]: - errors = { - "ClientConnectorError": 0, - "TimeoutError": 0, - "ContentTypeError": 0, - "ClientOSError": 0, - "ServerDisconnectedError": 0, - "unknown_error": 0, - } - return errors - def get_backend(backend: str) -> Backend: if backend == "vllm": return vLLMBackend() @@ -822,7 +837,7 @@ async def benchmark( for index, step in enumerate(all_steps["steps"]): # No need to sleep before running the first step - if 'time_between_steps' in args.job and index != 0: + if args.job is not None and 'time_between_steps' in args.job and index != 0: print(f"Sleeping for {args.job['time_between_steps']} sec...") await asyncio.sleep(args.job["time_between_steps"]) max_prompts = f" {step['max_num_prompts']} requests" if 'max_num_prompts' in step else "" @@ -863,13 +878,13 @@ async def benchmark( print(f"Finished benchmarking step {index + 1}") all_latencies = [] - all_errors = init_errors_map() + all_errors = ErrorsReport() for latency, errors in results: if latency: all_latencies.append(latency) if errors: for err, count in errors.items(): - all_errors[err] = all_errors[err] + count + all_errors.record_error(err) benchmark_results.record_metrics_for_step(step['rate'], step_start_timestamp, step_end_timestamp, prompts_sent_this_step, all_latencies, all_errors, backend) print(f"Completed all steps, generating reports...") @@ -886,16 +901,9 @@ def aggregate_benchmark_reports(reports: List[BenchmarkingReport]) -> Benchmarki "num_prompts_attempted": 0, "latencies": [], "server_metrics": [], - "errors": {}, + "errors": ErrorsReport(), } - def accumulate_errors(errors_list: List[Dict[str, int]]) -> Dict[str, int]: - accumulated_errors = init_errors_map() - for errors in errors_list: - for error_type, count in errors.items(): - accumulated_errors[error_type] += count - return accumulated_errors - for report in reports: # Input metavalidation asserts this report only has one step report report = report.steps[0] @@ -903,7 +911,7 @@ def accumulate_errors(errors_list: List[Dict[str, int]]) -> Dict[str, int]: aggregated_step_report["timestamp_end"] = max(aggregated_step_report["timestamp_end"], report["timestamp_end"]) aggregated_step_report["num_prompts_attempted"] += report["num_prompts_attempted"] aggregated_step_report["latencies"].extend(report["latencies"]) - aggregated_step_report["errors"] = accumulate_errors([aggregated_step_report["errors"], report["errors"]]) + aggregated_step_report["errors"] = aggregated_step_report["errors"].append_report(report["errors"]) aggregated_report = BenchmarkingReport(reports[0].args, 
f"ALL-{len(reports)}-MODELS", aggregated_step_report["timestamp_start"]) aggregated_report.record_metrics_for_step(**aggregated_step_report) From 6d9b0611cc7a5b9e83c3782f2a7f5f28092fb246 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Wed, 13 Nov 2024 22:48:57 +0000 Subject: [PATCH 26/27] streaming changes --- .../container/benchmark_serving.py | 236 ++++++++---------- 1 file changed, 104 insertions(+), 132 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index afa56834c..10fcbf1c1 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -38,6 +38,7 @@ prompt_length_metric = Histogram("LatencyProfileGenerator:prompt_length", "Input prompt length", buckets=[2**i for i in range(1, 16)]) response_length_metric = Histogram("LatencyProfileGenerator:response_length", "Response length", buckets=[2**i for i in range(1, 16)]) tpot_metric = Histogram('LatencyProfileGenerator:time_per_output_token', 'Time per output token per request') +ttft_metric = Histogram('LatencyProfileGenerator:time_to_first_token', 'Time to first token per request') active_requests_metric = Gauge('LatencyProfileGenerator:active_requests', 'How many requests actively being processed') # Add trace config for monitoring in flight requests @@ -108,90 +109,6 @@ class Backend(ABC): An abstract base class for Backend that defines the interface for new model server backends. """ - - async def send_stream_request( - self, - backend: str, - api_url: str, - prompt: str, - prompt_len: int, - output_len: int, - best_of: int, - use_beam_search: bool, - top_k: int, - tokenizer: PreTrainedTokenizerBase, - sax_model: str, - model: str, - ) -> Tuple[Optional[Tuple[int, int, float]], Optional[float], Optional[ErrorsReport]]: - """Sends stream request to server""" - request_start_time = time.time() - errors = init_errors_map() - - headers = {"User-Agent": "Benchmark Client"} - if backend == "vllm": - pload = { - "model": model, - "prompt": prompt, - "n": 1, - "best_of": best_of, - "use_beam_search": use_beam_search, - "temperature": 0.0 if use_beam_search else 1.0, - "top_p": 1.0, - "max_tokens": output_len, - "ignore_eos": True, - "stream": True, - } - else: - raise ValueError(f"Unknown backend: {backend}") - - ttft = 0.0 - st = time.perf_counter() - output = "" - timeout = aiohttp.ClientTimeout(total=CLIENT_TIMEOUT_SEC) - async with aiohttp.ClientSession(timeout=timeout,trust_env=True) as session: - try: - async with session.post(api_url, headers=headers, json=pload, ssl=False) as response: - async for chunk_bytes in response.content.iter_chunks(): - chunk_bytes = chunk_bytes[0].strip() - if not chunk_bytes: - continue - timestamp = time.perf_counter() - # First token - if ttft == 0.0: - ttft = timestamp - st - - if chunk_bytes.decode("utf-8")[6:] != "[DONE]": - if backend == "vllm": - output += json.loads(chunk_bytes.decode("utf-8")[6:])["choices"][0]["text"] - except aiohttp.client_exceptions.ClientConnectorError as client_err: - errors["ClientConnectorError"] += 1 - print(f"ClientConnectorError: {client_err}") - return None, None, errors - except asyncio.TimeoutError as timeout_err: - errors["TimeoutError"] += 1 - print(f"TimeoutError: {timeout_err}") - return None, None, errors - except aiohttp.client_exceptions.ClientOSError as e: - errors["ClientOSError"] += 1 - print(f"ClientOSError: {e}") 
- return None, None, errors - except aiohttp.client_exceptions.ContentTypeError as e: - print(f"ContentTypeError: {e}, response: {response}") - errors["ContentTypeError"] += 1 - return None, None, errors - except aiohttp.client_exceptions.ServerDisconnectedError as e: - errors["ServerDisconnectedError"] += 1 - print(f"ServerDisconnectedError: {e}") - return None, None, errors - except Exception as e: - print(f"Unknown error {e}") - errors["unknown_error"] += 1 - return None, None, errors - request_end_time = time.time() - output_token_ids = tokenizer(output).input_ids - output_len = len(output_token_ids) - request_latency = (prompt_len, output_len, (request_end_time - request_start_time)) - return request_latency, ttft, None async def send_request( self, @@ -205,7 +122,8 @@ async def send_request( tokenizer: PreTrainedTokenizerBase, sax_model: str, model: str, - ) -> Tuple[Optional[Tuple[int, int, float]], Optional[float], Optional[ErrorsReport]] + streaming: bool, + ) -> Tuple[Optional[Tuple[int, int, float]], Optional[float], Optional[ErrorsReport]]: """Sends request to server.""" request_start_time = time.time() errors = ErrorsReport() @@ -220,22 +138,25 @@ async def send_request( tokenizer=tokenizer, sax_model=sax_model, model=model, + streaming=streaming ) # Set client timeout to be 3 hrs. timeout = aiohttp.ClientTimeout(total=CLIENT_TIMEOUT_SEC) + start_time = time.perf_counter() + output = "" + ttft = 0.0 async with aiohttp.ClientSession(timeout=timeout,trust_env=True) as session: while True: try: async with session.post(f"{api_url}/{self.get_endpoint()}", headers=headers, json=pload, ssl=False) as response: - output = await response.json() - + output, ttft = await self.results_from_response(response, streaming, start_time) # Re-send the request if it failed. 
if "error" not in output: break except Exception as e: errors.record_error(e) - return None, errors + return None, None, errors request_end_time = time.time() # Naive HF transformers generation and TensorRT-LLM generation stops at EOS # tokens and the generation may be shorter than the ground-truth output @@ -251,8 +172,10 @@ async def send_request( tpot_metric.observe((request_end_time - request_start_time) / output_len) prompt_length_metric.observe(prompt_len) response_length_metric.observe(output_len) + if ttft is not None: + ttft_metric.observe(ttft) - return request_latency, None + return request_latency, ttft, None @abstractmethod def create_request_payload(self, @@ -264,7 +187,8 @@ def create_request_payload(self, top_k: int, tokenizer: PreTrainedTokenizerBase, sax_model: str, - model: str) -> Dict: + model: str, + streaming: bool) -> Dict: pass @abstractmethod @@ -283,6 +207,12 @@ def get_server_metrics(self) -> List[str]: def get_endpoint(self) -> str: pass + async def results_from_response(self, response: aiohttp.ClientResponse, streaming: bool, start_time: float) -> Tuple[Dict, Optional[float]]: + if streaming: + raise Exception("This backend does not support parsing streaming responses") + else: + return await response.json() + class vLLMBackend(Backend): def get_server_metrics(self) -> List[str]: return ["vllm:gpu_cache_usage_perc", "vllm:num_requests_waiting"] @@ -297,7 +227,8 @@ def create_request_payload(self, top_k: int, tokenizer: PreTrainedTokenizerBase, sax_model: str, - model: str): + model: str, + streaming: bool): return { "model": model, "prompt": prompt, @@ -308,7 +239,7 @@ def create_request_payload(self, "top_p": 1.0, "max_tokens": output_len, "ignore_eos": False, - "stream": False, + "stream": streaming, } def get_response_length( self, @@ -317,6 +248,34 @@ def get_response_length( tokenizer: PreTrainedTokenizerBase): output_token_ids = tokenizer(response["choices"][0]["text"]).input_ids return len(output_token_ids) + async def results_from_response(self, response: aiohttp.ClientResponse, streaming: bool, start_time: float) -> Tuple[Dict, Optional[float]]: + ttft = 0.0 + + # Make a streaming response look like a non streaming response for detokenizing later + output = { + 'choices': [{ + 'text' : "" + }] + } + if streaming: + async for chunk_bytes in response.content.iter_chunks(): + chunk_bytes = chunk_bytes[0].strip() + if not chunk_bytes: + continue + + timestamp = time.perf_counter() + + # Calculate Time-to-First-Token (TTFT) + if ttft == 0.0: + ttft = timestamp - start_time + + # Process the chunk if it's not the "[DONE]" message + if chunk_bytes.decode("utf-8")[6:] != "[DONE]": + output["choices"][0]["text"] += json.loads(chunk_bytes.decode("utf-8")[6:])["choices"][0]["text"] + return output, ttft + else: + res = await response.json() + return res, None class JetstreamBackend(Backend): def get_server_metrics(self) -> List[str]: @@ -335,7 +294,8 @@ def create_request_payload(self, top_k: int, tokenizer: PreTrainedTokenizerBase, sax_model: str, - model: str): + model: str, + streaming: bool): return { "prompt": prompt, "max_tokens": output_len, @@ -362,7 +322,8 @@ def create_request_payload(self, top_k: int, tokenizer: PreTrainedTokenizerBase, sax_model: str, - model: str): + model: str, + streaming: bool): return { "inputs": prompt, "parameters": { @@ -393,7 +354,8 @@ def create_request_payload(self, top_k: int, tokenizer: PreTrainedTokenizerBase, sax_model: str, - model: str): + model: str, + streaming: bool): return { "instances": [{ "prompt": prompt, @@ 
-426,7 +388,8 @@ def create_request_payload(self, top_k: int, tokenizer: PreTrainedTokenizerBase, sax_model: str, - model: str): + model: str, + streaming: bool): return { "text_input": prompt, "max_tokens": output_len, @@ -435,7 +398,7 @@ def create_request_payload(self, "top_p": 1.0, "bad_words": "", "stop_words": "", - "stream": False, + "stream": streaming, } def get_response_length( self, @@ -459,7 +422,8 @@ def create_request_payload(self, top_k: int, tokenizer: PreTrainedTokenizerBase, sax_model: str, - model: str): + model: str, + streaming: bool): return { "model": sax_model, "prompt": prompt, @@ -470,7 +434,7 @@ def create_request_payload(self, "top_p": 1.0, "top_k": 50, "max_tokens": output_len, - "stream": False, + "stream": streaming, } def get_response_length( self, @@ -531,6 +495,7 @@ def record_metrics_for_step( timestamp_end: float, num_prompts_attempted : int, latencies: List, + ttfts: List[float], errors: ErrorsReport, backend: Backend, ): @@ -664,15 +629,7 @@ def metric_sumamry_from_points(name: str, description: str, points : List[float] if self.args.scrape_server_metrics: server_metrics = fetch_metrics_from_gmp(backend, total_time) - self.steps.append(BenchmarkingStepReport( - request_rate = request_rate, - timestamp_start = timestamp_start, - timestamp_end = timestamp_end, - num_prompts_attempted = num_prompts_attempted, - latencies = latencies, - ttfts = ttfts, - errors = errors, - local_metrics = [ + local_metrics = [ metric_sumamry_from_points( name="per_token_latency", description="seconds/token (includes waiting time on server)", @@ -687,11 +644,7 @@ def metric_sumamry_from_points(name: str, description: str, points : List[float] name="per_output_token_latency", description="milliseconds/output_token (includes waiting time on server)", points=[1000 * latency / output_len for _, output_len, latency in latencies]), - metric_sumamry_from_points( - json_field_name="ttft", - name="time_to_first_token", - description="time to first token in seconds (includes waiting time on server)", - points=[1000 * latency / output_len for _, output_len, latency in latencies]), + metric_sumamry_from_points( name="input_length", description="length of prompt", @@ -705,7 +658,24 @@ def metric_sumamry_from_points(name: str, description: str, points : List[float] description = "throughput in requests per second", mean = (len(latencies) / ((timestamp_end - timestamp_start) / NS_IN_SEC)), ), - ], + ] + if self.args.stream_request: + local_metrics.append(metric_sumamry_from_points( + json_field_name="ttft", + name="time_to_first_token", + description="Time to First Token (s)", + points=ttfts) + ) + + self.steps.append(BenchmarkingStepReport( + request_rate = request_rate, + timestamp_start = timestamp_start, + timestamp_end = timestamp_end, + num_prompts_attempted = num_prompts_attempted, + latencies = latencies, + ttfts = ttfts, + errors = errors, + local_metrics=local_metrics, server_metrics = server_metrics )) @@ -823,8 +793,8 @@ def to_json_report(self, write_to_file: bool = False) -> Dict: with open(file_name, "w", encoding="utf-8") as outfile: json.dump(output, outfile) if gcs_bucket is not None: - gcs_bucket.blob(f"{args.output_bucket_filepath}/{file_name}").upload_from_filename(file_name) - print(f"File {file_name} uploaded to gs://{args.output_bucket}/{args.output_bucket_filepath}") + gcs_bucket.blob(f"{self.args.output_bucket_filepath}/{file_name}").upload_from_filename(file_name) + print(f"File {file_name} uploaded to 
gs://{self.args.output_bucket}/{self.args.output_bucket_filepath}") return output @@ -947,7 +917,7 @@ async def benchmark( "max_num_prompts": args.num_prompts, }] } - benchmark_results = BenchmarkingReport(args, model, time.time_ns()) + benchmark_results = BenchmarkingReport(args, model, time.time_ns()) for index, step in enumerate(all_steps["steps"]): # No need to sleep before running the first step if args.job is not None and 'time_between_steps' in args.job and index != 0: @@ -969,18 +939,19 @@ async def benchmark( prompt, prompt_len, output_len = request task = asyncio.create_task( - backend.send_request( - f"http://{args.host}:{args.port}", - prompt, - prompt_len, - output_len, - args.best_of, - args.use_beam_search, - args.top_k, - tokenizer, - args.sax_model, - model, - ) + backend.send_request( + f"http://{args.host}:{args.port}", + prompt, + prompt_len, + output_len, + args.best_of, + args.use_beam_search, + args.top_k, + tokenizer, + args.sax_model, + model, + args.stream_request, + ) ) tasks.append(task) prompts_sent_this_step += 1 @@ -993,12 +964,11 @@ async def benchmark( all_latencies = [] all_ttfts = [] all_errors = ErrorsReport() - for latency, errors in results: + for latency, ttft, errors in results: if latency: all_latencies.append(latency) if errors: - for err, count in errors.items(): - all_errors.record_error(err) + all_errors.append_report(errors) if ttft: all_ttfts.append(ttft) benchmark_results.record_metrics_for_step(step['rate'], step_start_timestamp, step_end_timestamp, prompts_sent_this_step, all_latencies, all_ttfts, all_errors, backend) @@ -1016,6 +986,7 @@ def aggregate_benchmark_reports(reports: List[BenchmarkingReport]) -> Benchmarki "timestamp_end": 0.0, "num_prompts_attempted": 0, "latencies": [], + "ttfts": [], "server_metrics": [], "errors": ErrorsReport(), } @@ -1027,6 +998,7 @@ def aggregate_benchmark_reports(reports: List[BenchmarkingReport]) -> Benchmarki aggregated_step_report["timestamp_end"] = max(aggregated_step_report["timestamp_end"], report["timestamp_end"]) aggregated_step_report["num_prompts_attempted"] += report["num_prompts_attempted"] aggregated_step_report["latencies"].extend(report["latencies"]) + aggregated_step_report["ttfts"].extend(report["ttfts"]) aggregated_step_report["errors"] = aggregated_step_report["errors"].append_report(report["errors"]) aggregated_report = BenchmarkingReport(reports[0].args, f"ALL-{len(reports)}-MODELS", aggregated_step_report["timestamp_start"]) From a9f620ffb135535a69832249e5357217ae0c2af4 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Wed, 13 Nov 2024 22:57:27 +0000 Subject: [PATCH 27/27] step -> stage --- .../container/benchmark_serving.py | 198 +++++++++--------- 1 file changed, 99 insertions(+), 99 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index 10fcbf1c1..3bad2c9f9 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -461,8 +461,8 @@ class MetricSummary(TypedDict, total=False): p90: Optional[float] p99: Optional[float] -class BenchmarkingStepReport(TypedDict): - """Result for one step""" +class BenchmarkingStageReport(TypedDict): + """Result for one stage""" request_rate: float timestamp_start: float timestamp_end: float @@ -474,10 +474,10 @@ class BenchmarkingStepReport(TypedDict): errors: ErrorsReport class BenchmarkingReport(): - 
"""Results for all steps for a single model""" + """Results for all stages for a single model""" args: argparse.Namespace config: BenchmarkConfig - steps: List[BenchmarkingStepReport] + stages: List[BenchmarkingStageReport] def __init__(self, args : argparse.Namespace, model: str, start_time: float): self.args = args @@ -486,9 +486,9 @@ def __init__(self, args : argparse.Namespace, model: str, start_time: float): model_server = args.backend, start_time = start_time ) - self.steps = [] + self.stages = [] - def record_metrics_for_step( + def record_metrics_for_stage( self, request_rate: float, timestamp_start: float, @@ -667,7 +667,7 @@ def metric_sumamry_from_points(name: str, description: str, points : List[float] points=ttfts) ) - self.steps.append(BenchmarkingStepReport( + self.stages.append(BenchmarkingStageReport( request_rate = request_rate, timestamp_start = timestamp_start, timestamp_end = timestamp_end, @@ -680,43 +680,43 @@ def metric_sumamry_from_points(name: str, description: str, points : List[float] )) def to_text_reports(self, write_to_files: bool = False) -> List[str]: - """Each element in the output list is a report for each step""" + """Each element in the output list is a report for each stage""" output : Dict[str, str] = {} required_stats = ["latency", "throughput", "input_length", "output_length", "per_output_token_latency"] - for step in self.steps: - if not all(required_stat in [metric['name'] for metric in step['local_metrics']] for required_stat in required_stats): + for stage in self.stages: + if not all(required_stat in [metric['name'] for metric in stage['local_metrics']] for required_stat in required_stats): raise Exception(f"All of the following stats must be recorded: {required_stats}") - for step in self.steps: - step_output : List[str] = [] - total_time = (step['timestamp_end'] - step['timestamp_start']) / NS_IN_SEC - total_output_tokens = np.sum([output_len for _, output_len, _ in step['latencies']]) + for stage in self.stages: + stage_output : List[str] = [] + total_time = (stage['timestamp_end'] - stage['timestamp_start']) / NS_IN_SEC + total_output_tokens = np.sum([output_len for _, output_len, _ in stage['latencies']]) output_tokens_per_second = total_output_tokens / total_time output_tokens_per_min = 60 * output_tokens_per_second - total_input_tokens = np.sum([prompt_len for prompt_len, _, _ in step['latencies']]) + total_input_tokens = np.sum([prompt_len for prompt_len, _, _ in stage['latencies']]) input_tokens_per_min = 60 * total_input_tokens / total_time total_tokens = total_input_tokens + total_output_tokens tokens_per_min = 60 * total_tokens / total_time - step_output.append(f"====Result for Model: {self.config['model']}====") - step_output.append(f"Errors: {step['errors'].to_dict()}") - step_output.append(f"Total time: {total_time:.2f} s") - step_output.append(f"Successful/total requests: {len(step['latencies'])}/{step['num_prompts_attempted']}") - step_output.append(f"Requests/min: {60 * step['num_prompts_attempted'] / total_time:.2f}") - step_output.append(f"Output_tokens/min: {output_tokens_per_min:.2f}") - step_output.append(f"Input_tokens/min: {input_tokens_per_min:.2f}") - step_output.append(f"Tokens/min: {tokens_per_min:.2f}") + stage_output.append(f"====Result for Model: {self.config['model']}====") + stage_output.append(f"Errors: {stage['errors'].to_dict()}") + stage_output.append(f"Total time: {total_time:.2f} s") + stage_output.append(f"Successful/total requests: {len(stage['latencies'])}/{stage['num_prompts_attempted']}") + 
stage_output.append(f"Requests/min: {60 * stage['num_prompts_attempted'] / total_time:.2f}") + stage_output.append(f"Output_tokens/min: {output_tokens_per_min:.2f}") + stage_output.append(f"Input_tokens/min: {input_tokens_per_min:.2f}") + stage_output.append(f"Tokens/min: {tokens_per_min:.2f}") if self.args.machine_cost: - step_output.append( + stage_output.append( f"Cost $/1k tokens: {self.args.machine_cost * 1000 / (60 * output_tokens_per_min)}" ) - for metric in step['local_metrics']: - step_output.append(f"Average {metric['description']}:" f" {metric['mean']:.2f}") - output_filename = f"latency-profile-{datetime.fromtimestamp(step['timestamp_start'] / NS_IN_SEC).strftime('%Y-%m-%d_%H-%M-%S')}.txt" - output[output_filename] = '\n'.join(step_output) + for metric in stage['local_metrics']: + stage_output.append(f"Average {metric['description']}:" f" {metric['mean']:.2f}") + output_filename = f"latency-profile-{datetime.fromtimestamp(stage['timestamp_start'] / NS_IN_SEC).strftime('%Y-%m-%d_%H-%M-%S')}.txt" + output[output_filename] = '\n'.join(stage_output) if write_to_files: with open(output_filename, 'w') as file: file.write(output[output_filename]) @@ -725,34 +725,34 @@ def to_text_reports(self, write_to_files: bool = False) -> List[str]: print(f"File {output_filename} uploaded to gs://{args.output_bucket}/{args.output_bucket_filepath}") return list(output.values()) - # The output is a a single json summary of all steps + # The output is a a single json summary of all stages def to_json_report(self, write_to_file: bool = False) -> Dict: output = { "config": { **self.config, "num_models": len(self.args.models) if self.args.save_aggregated_result else 1, "start_time": { - "seconds" : self.steps[0]["timestamp_start"] // NS_IN_SEC, - "nanos" : self.steps[0]["timestamp_start"] % NS_IN_SEC, + "seconds" : self.stages[0]["timestamp_start"] // NS_IN_SEC, + "nanos" : self.stages[0]["timestamp_start"] % NS_IN_SEC, }, }, "summary_stats": { "stats": [ { - "request_rate": step["request_rate"], + "request_rate": stage["request_rate"], **{(metric["json_field_name"] if "json_field_name" in metric else metric["name"]): { stat: value for stat, value in metric.items() if stat not in ["name", "description", "json_field_name"] and value is not None } - for metric in step["local_metrics"] + for metric in stage["local_metrics"] }, "model_server_metrics": [ {"name": server_metric["name"], **server_metric} - for server_metric in step["server_metrics"] - ] if step["server_metrics"] is not None else [] + for server_metric in stage["server_metrics"] + ] if stage["server_metrics"] is not None else [] } - for step in self.steps + for stage in self.stages ] }, @@ -766,23 +766,23 @@ def to_json_report(self, write_to_file: bool = False) -> Dict: # Legacy use case, use summary_stats if possible "metrics": { # Traffic metrics - "num_prompts": self.steps[0]['num_prompts_attempted'], - "request_rate": self.steps[0]['request_rate'], - "benchmark_time": (self.steps[0]['timestamp_end'] - self.steps[0]['timestamp_start']) / NS_IN_SEC, - "throughput_rps": (len(self.steps[0]['latencies']) / ((self.steps[0]['timestamp_end'] - self.steps[0]['timestamp_start']) / NS_IN_SEC)), - "throughput": np.sum([output_len for _, output_len, _ in self.steps[0]['latencies']]) / ((self.steps[0]['timestamp_end'] - self.steps[0]['timestamp_start']) / NS_IN_SEC), + "num_prompts": self.stages[0]['num_prompts_attempted'], + "request_rate": self.stages[0]['request_rate'], + "benchmark_time": (self.stages[0]['timestamp_end'] - 
self.stages[0]['timestamp_start']) / NS_IN_SEC, + "throughput_rps": (len(self.stages[0]['latencies']) / ((self.stages[0]['timestamp_end'] - self.stages[0]['timestamp_start']) / NS_IN_SEC)), + "throughput": np.sum([output_len for _, output_len, _ in self.stages[0]['latencies']]) / ((self.stages[0]['timestamp_end'] - self.stages[0]['timestamp_start']) / NS_IN_SEC), **{ f"{'avg' if stat == 'mean' else stat}_{metric['name']}": value - for metric in self.steps[0]["local_metrics"] + for metric in self.stages[0]["local_metrics"] if "json_field_name" in metric for stat, value in metric.items() if stat not in ["name", "description", "json_field_name"] and value is not None }, "server_metrics": { server_metric["name"]: {k.capitalize(): v for k, v in server_metric.items() if k != "name"} - for server_metric in self.steps[0]["server_metrics"] - } if self.steps[0]["server_metrics"] is not None else {} - } if len(self.steps) == 1 else None + for server_metric in self.stages[0]["server_metrics"] + } if self.stages[0]["server_metrics"] is not None else {} + } if len(self.stages) == 1 else None } if write_to_file: @@ -907,34 +907,34 @@ async def benchmark( args.use_dummy_text, ) - all_steps = {} + all_stages = {} if args.job is not None: - all_steps = args.job + all_stages = args.job elif args.num_prompts is not None: - all_steps = { - "steps": [{ + all_stages = { + "stages": [{ "rate": args.request_rate, "max_num_prompts": args.num_prompts, }] } benchmark_results = BenchmarkingReport(args, model, time.time_ns()) - for index, step in enumerate(all_steps["steps"]): - # No need to sleep before running the first step - if args.job is not None and 'time_between_steps' in args.job and index != 0: - print(f"Sleeping for {args.job['time_between_steps']} sec...") - await asyncio.sleep(args.job["time_between_steps"]) - max_prompts = f" {step['max_num_prompts']} requests" if 'max_num_prompts' in step else "" - duration = f" {step['time']} sec" if 'time' in step else " " - print(f"Starting benchmarking{max_prompts} at {step['rate']} requests/sec for{duration}") + for index, stage in enumerate(all_stages["stages"]): + # No need to sleep before running the first stage + if args.job is not None and 'time_between_stages' in args.job and index != 0: + print(f"Sleeping for {args.job['time_between_stages']} sec...") + await asyncio.sleep(args.job["time_between_stages"]) + max_prompts = f" {stage['max_num_prompts']} requests" if 'max_num_prompts' in stage else "" + duration = f" {stage['time']} sec" if 'time' in stage else " " + print(f"Starting benchmarking{max_prompts} at {stage['rate']} requests/sec for{duration}") tasks: List[asyncio.Task] = [] - prompts_sent_this_step: int = 0 - step_start_timestamp = time.time_ns() - async for request in generate_next_request(input_requests, str(step["rate"]), step_start_timestamp): + prompts_sent_this_stage: int = 0 + stage_start_timestamp = time.time_ns() + async for request in generate_next_request(input_requests, str(stage["rate"]), stage_start_timestamp): # Stop conditions - if "max_num_prompts" in step and prompts_sent_this_step >= step["max_num_prompts"]: + if "max_num_prompts" in stage and prompts_sent_this_stage >= stage["max_num_prompts"]: break - if "time" in step and ((time.time_ns() - step_start_timestamp ) / NS_IN_SEC) > step["time"]: + if "time" in stage and ((time.time_ns() - stage_start_timestamp ) / NS_IN_SEC) > stage["time"]: break prompt, prompt_len, output_len = request @@ -954,12 +954,12 @@ async def benchmark( ) ) tasks.append(task) - prompts_sent_this_step += 1 
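# Editor's note: an illustrative sketch (not the benchmark's own code) of how a multi-stage
# job description is consumed after the step -> stage rename: the driver sleeps
# `time_between_stages` between stages and ends each stage once `max_num_prompts` requests
# have been sent or `time` seconds have elapsed, whichever comes first. `run_stages`,
# `send_one_request`, and `example_job` are illustrative names; the job structure matches
# what parse_request_rates validates.
import asyncio
import time

NS_IN_SEC = 1_000_000_000

example_job = {
    "time_between_stages": 1.0,
    "stages": [
        {"rate": 2.0, "max_num_prompts": 100},
        {"rate": "1+0.5*t", "time": 30.0},
    ],
}


async def run_stages(job, send_one_request):
    for index, stage in enumerate(job["stages"]):
        if index != 0:
            # No need to sleep before the first stage.
            await asyncio.sleep(job["time_between_stages"])
        sent = 0
        stage_start = time.time_ns()
        while True:
            # Stop conditions: request budget exhausted or stage duration elapsed.
            if "max_num_prompts" in stage and sent >= stage["max_num_prompts"]:
                break
            if "time" in stage and (time.time_ns() - stage_start) / NS_IN_SEC > stage["time"]:
                break
            await send_one_request(stage["rate"])  # rate-expression handling elided in this sketch
            sent += 1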
+ prompts_sent_this_stage += 1 print("All requests sent, awaiting responses...") results = await asyncio.gather(*tasks) - step_end_timestamp = time.time_ns() - print(f"Finished benchmarking step {index + 1}") + stage_end_timestamp = time.time_ns() + print(f"Finished benchmarking stage {index + 1}") all_latencies = [] all_ttfts = [] @@ -971,17 +971,17 @@ async def benchmark( all_errors.append_report(errors) if ttft: all_ttfts.append(ttft) - benchmark_results.record_metrics_for_step(step['rate'], step_start_timestamp, step_end_timestamp, prompts_sent_this_step, all_latencies, all_ttfts, all_errors, backend) + benchmark_results.record_metrics_for_stage(stage['rate'], stage_start_timestamp, stage_end_timestamp, prompts_sent_this_stage, all_latencies, all_ttfts, all_errors, backend) - print(f"Completed all steps, generating reports...") + print(f"Completed all stages, generating reports...") return benchmark_results def aggregate_benchmark_reports(reports: List[BenchmarkingReport]) -> BenchmarkingReport: """When benchmarking multiple models we will generate a BenchmarkingReport for each.""" """If `save_aggregated_result` is set, we aggregate these into a single report.""" - aggregated_step_report = { - "request_rate": reports[0].steps[0]["request_rate"], + aggregated_stage_report = { + "request_rate": reports[0].stages[0]["request_rate"], "timestamp_start": 0.0, "timestamp_end": 0.0, "num_prompts_attempted": 0, @@ -992,17 +992,17 @@ def aggregate_benchmark_reports(reports: List[BenchmarkingReport]) -> Benchmarki } for report in reports: - # Input metavalidation asserts this report only has one step report - report = report.steps[0] - aggregated_step_report["timestamp_start"] = min(aggregated_step_report["timestamp_start"], report["timestamp_start"]) - aggregated_step_report["timestamp_end"] = max(aggregated_step_report["timestamp_end"], report["timestamp_end"]) - aggregated_step_report["num_prompts_attempted"] += report["num_prompts_attempted"] - aggregated_step_report["latencies"].extend(report["latencies"]) - aggregated_step_report["ttfts"].extend(report["ttfts"]) - aggregated_step_report["errors"] = aggregated_step_report["errors"].append_report(report["errors"]) - - aggregated_report = BenchmarkingReport(reports[0].args, f"ALL-{len(reports)}-MODELS", aggregated_step_report["timestamp_start"]) - aggregated_report.record_metrics_for_step(**aggregated_step_report) + # Input metavalidation asserts this report only has one stage report + report = report.stages[0] + aggregated_stage_report["timestamp_start"] = min(aggregated_stage_report["timestamp_start"], report["timestamp_start"]) + aggregated_stage_report["timestamp_end"] = max(aggregated_stage_report["timestamp_end"], report["timestamp_end"]) + aggregated_stage_report["num_prompts_attempted"] += report["num_prompts_attempted"] + aggregated_stage_report["latencies"].extend(report["latencies"]) + aggregated_stage_report["ttfts"].extend(report["ttfts"]) + aggregated_stage_report["errors"] = aggregated_stage_report["errors"].append_report(report["errors"]) + + aggregated_report = BenchmarkingReport(reports[0].args, f"ALL-{len(reports)}-MODELS", aggregated_stage_report["timestamp_start"]) + aggregated_report.record_metrics_for_stage(**aggregated_stage_report) return aggregated_report @@ -1055,7 +1055,7 @@ def input_metavalidation(args: argparse.Namespace): raise ValueError("All args must be set for one and only one of the following sets of arguments: {--request-rate, --num-prompts} or {--job}") if args.save_aggregated_result and args.benchmark is 
not None and len(args.benchmark) != 1 and args.models is not None and len(args.models) > 1: - raise ValueError("Multi model benchmarking with multi step benchmarking is not supported yet") + raise ValueError("Multi model benchmarking with multi stage benchmarking is not supported yet") if args.use_beam_search and args.backend == "tgi": raise ValueError("Beam search is not supported by TGI") @@ -1182,33 +1182,33 @@ def parse_request_rates(input_str): request_data = json.loads(input_str) # Validate that the JSON has the correct structure if not isinstance(request_data, dict): - raise argparse.ArgumentTypeError("Input JSON must be an object containing 'time_between_steps' and 'steps'.") - # Check 'time_between_steps' field - if "time_between_steps" not in request_data or (not isinstance(request_data["time_between_steps"], float) and not isinstance(request_data["time_between_steps"], int)): - raise argparse.ArgumentTypeError("'time_between_steps' must be a float or int.") - # Check 'steps' field - if "steps" not in request_data or not isinstance(request_data["steps"], list): - raise argparse.ArgumentTypeError("'steps' must be a list of objects with 'rate' and 'time'.") + raise argparse.ArgumentTypeError("Input JSON must be an object containing 'time_between_stages' and 'stages'.") + # Check 'time_between_stages' field + if "time_between_stages" not in request_data or (not isinstance(request_data["time_between_stages"], float) and not isinstance(request_data["time_between_stages"], int)): + raise argparse.ArgumentTypeError("'time_between_stages' must be a float or int.") + # Check 'stages' field + if "stages" not in request_data or not isinstance(request_data["stages"], list): + raise argparse.ArgumentTypeError("'stages' must be a list of objects with 'rate' and 'time'.") - # Validate each entry in the 'steps' list - for i, rate_entry in enumerate(request_data["steps"]): + # Validate each entry in the 'stages' list + for i, rate_entry in enumerate(request_data["stages"]): if not isinstance(rate_entry, dict): - raise argparse.ArgumentTypeError(f"Entry {i} in 'steps' must be a JSON object.") + raise argparse.ArgumentTypeError(f"Entry {i} in 'stages' must be a JSON object.") if "rate" not in rate_entry: - raise argparse.ArgumentTypeError(f"Entry {i} in 'steps' must have a 'rate' key.") + raise argparse.ArgumentTypeError(f"Entry {i} in 'stages' must have a 'rate' key.") if "time" not in rate_entry and "max_num_prompts" not in rate_entry: - raise argparse.ArgumentTypeError(f"Entry {i} in 'steps' must have a 'time' and/or 'max_num_prompts' key.") + raise argparse.ArgumentTypeError(f"Entry {i} in 'stages' must have a 'time' and/or 'max_num_prompts' key.") # Validate the 'rate' field to allow for string expressions or floats if isinstance(rate_entry["rate"], str): try: is_expression_of_t(rate_entry["rate"]) # Validate the expression except Exception as e: - raise argparse.ArgumentTypeError(f"Entry {i} in 'steps': {e}") + raise argparse.ArgumentTypeError(f"Entry {i} in 'stages': {e}") # Validate the 'time' field if not isinstance(rate_entry["time"], (float, int)): - raise argparse.ArgumentTypeError(f"Entry {i} in 'steps': 'time' must be a positive float.") + raise argparse.ArgumentTypeError(f"Entry {i} in 'stages': 'time' must be a positive float.") return request_data except json.JSONDecodeError as e: raise argparse.ArgumentTypeError("Invalid JSON format") @@ -1223,18 +1223,18 @@ def parse_request_rates(input_str): " or as a filename. 
\n" " The JSON should have the following structure:\n\n" " {\n" - " \"time_between_steps\": float (seconds to rest between rates),\n" + " \"time_between_stages\": float (seconds to rest between rates),\n" " \"rates\": [\n" " {\n" " \"rate\": float | str (as would be passed to request-rate),\n" - " \"time\": float (number of seconds for this step)\n" - " \"max_num_prompts\": int (maximum number of prompts for this step)" + " \"time\": float (number of seconds for this stage)\n" + " \"max_num_prompts\": int (maximum number of prompts for this stage)" " },\n" " ...\n" " ]\n" " }\n\n" " Example JSON:\n" - " '{\"time_between_steps\": 1.0, \"rates\": [{\"rate\": 2.0, \"time\": 0.0}, {\"rate\": \"1+0.5*t\", \"time\": 5.0}]}'\n\n" + " '{\"time_between_stages\": 1.0, \"rates\": [{\"rate\": 2.0, \"time\": 0.0}, {\"rate\": \"1+0.5*t\", \"time\": 5.0}]}'\n\n" " Each entry should have a 'rate' and/or 'num_prompts' and 'time' value." " Each rate is finished when \"num_prompts\" prompts are sent" " (if specified) and \"time\" seconds have passed (if specified),"