Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Functionary Solver #37

Open
wants to merge 26 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
129 changes: 129 additions & 0 deletions evals/cli/oaieval-remote.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
"""
Modal-based remote execution of evals with GPU support.
"""
import argparse
import os
import sys
from typing import Optional, cast

import modal

# Create a Modal app
app = modal.App("oaieval-remote")

# Assumes you've set up this secret in Modal
# (both secrets must already exist in the Modal workspace; they provide the
# OpenAI and Hugging Face credentials to the remote container).
secrets = [
    modal.Secret.from_name("openai-secret"),
    modal.Secret.from_name("huggingface-secret"),
]

# Create a base image with evals and all its dependencies:
# Debian slim + git, plus the fixie-ai evals fork (tip of default branch) and
# accelerate. create_image() below can layer a commit-pinned install on top.
evals_base_image = (
    modal.Image.debian_slim(python_version="3.12.2")
    .apt_install("git")
    .pip_install("git+https://github.com/fixie-ai/evals.git", "accelerate")
)

def get_parser() -> argparse.ArgumentParser:
    """Build the CLI argument parser for remote (Modal) eval runs."""
    p = argparse.ArgumentParser(description="Run evals remotely through Modal")

    # Positional arguments.
    p.add_argument(
        "completion_fn",
        type=str,
        help="One or more CompletionFn URLs, separated by commas (,)",
    )
    p.add_argument("eval", type=str, help="Name of an eval. See registry.")

    # Modal-specific option: pin the evals code to a git commit.
    p.add_argument("--commit_hash", type=str, help="Git commit hash to use for the eval code")

    # Options forwarded to oaieval.
    p.add_argument(
        "--completion_args",
        type=str,
        default="",
        help="Specify additional parameters for completion_fn (e.g., 'key1=value1,key2=value2')",
    )
    p.add_argument("--extra_eval_params", type=str, default="")
    p.add_argument("--max_samples", type=int, default=None)
    p.add_argument("--cache", action=argparse.BooleanOptionalAction, default=True)
    p.add_argument("--seed", type=int, default=20220722)
    p.add_argument("--registry_path", type=str, default=None, action="append")
    p.add_argument("--debug", action=argparse.BooleanOptionalAction, default=False)
    p.add_argument(
        "--visible",
        action=argparse.BooleanOptionalAction,
        default=None,
        help="Whether to show samples during evaluation",
    )
    p.add_argument(
        "--user",
        type=str,
        default=os.getenv("USER", "unknown"),
        help="User running the eval",
    )
    p.add_argument(
        "--log_to_file",
        type=str,
        default=None,
        help="Log to a file instead of stdout",
    )
    return p

def create_image(commit_hash: Optional[str] = None) -> modal.Image:
    """Return the container image to run the eval in.

    With no commit hash the prebuilt base image is reused as-is; otherwise
    the evals package is reinstalled pinned to that commit, which also lets
    a commit bring in newly added dependencies.
    """
    if commit_hash:
        pinned = f"git+https://github.com/fixie-ai/evals.git@{commit_hash}"
        return evals_base_image.pip_install(pinned, "accelerate")
    return evals_base_image

# Define the function with base image and environment variables
@app.function(
    timeout=21600,
    container_idle_timeout=60,
    image=create_image(),
    secrets=secrets,
    gpu=modal.gpu.H100(count=4),
    volumes={
        "/root/.cache/huggingface": modal.Volume.from_name("hf-cache", create_if_missing=True)
    },
)
def run_eval(args_dict: dict) -> None:
    """Run one eval inside the Modal container by delegating to oaieval.

    `args_dict` is the vars() of the local CLI namespace; Modal-only keys are
    stripped and any OaiEvalArguments fields the remote CLI does not expose
    are filled with oaieval's defaults before calling `run`.
    """
    import os
    import sys

    print("Python path:", sys.path)
    print("HF cache contents:", os.listdir("/root/.cache/huggingface"))

    from evals.cli.oaieval import run, OaiEvalArguments

    # Strip Modal-specific keys that oaieval does not understand.
    for modal_only_key in ("commit_hash", "gpu"):
        args_dict.pop(modal_only_key, None)

    # Required OaiEvalArguments fields not exposed by this CLI, with the
    # same defaults oaieval.py itself uses.
    defaults = {
        "user": "",  # Default empty string as per oaieval.py
        "record_path": None,
        "log_to_file": None,
        "local_run": True,
        "http_run": False,
        "http_run_url": None,
        "http_batch_size": 100,
        "http_fail_percent_threshold": 5,
        "dry_run": False,
        "dry_run_logging": True,
    }
    for field, fallback in defaults.items():
        args_dict.setdefault(field, fallback)

    # Rebuild a namespace and hand off to the standard oaieval entry point.
    run(cast(OaiEvalArguments, argparse.Namespace(**args_dict)))

def main() -> None:
    """Parse CLI args locally and launch the eval remotely on Modal."""
    parser = get_parser()
    args = parser.parse_args()

    args_dict = vars(args)

    # Create image with specific commit
    # NOTE(review): reassigning `run_eval.image` after @app.function has
    # already decorated the function may have no effect — Modal normally
    # fixes the image at decoration time. Verify that a --commit_hash run
    # actually executes in the commit-pinned image.
    if args.commit_hash:
        run_eval.image = create_image(args.commit_hash)

    # Run the function with output enabled
    with modal.enable_output():
        with app.run():
            run_eval.remote(args_dict)

if __name__ == "__main__":
    main()

# Example usage:
#   oaieval-remote.py generation/gpu/ultravox-functionary-70b audio-translate-covost-en_de --commit_hash 755cb93b6e98abe4abef849b942fa34ac83a52af
7 changes: 6 additions & 1 deletion evals/cli/oaieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ def to_number(x: str) -> Union[int, float, str]:
**extra_eval_params,
)
result = eval.run(recorder)
add_token_usage_to_result(result, recorder)
# add_token_usage_to_result(result, recorder)
recorder.record_final_report(result)

if not (args.dry_run or args.local_run):
Expand All @@ -233,6 +233,11 @@ def to_number(x: str) -> Union[int, float, str]:
logger.info("Final report:")
for key, value in result.items():
logger.info(f"{key}: {value}")

print("Final Report:")
for key, value in result.items():
print(f"{key}: {value}")

return run_spec.run_id


Expand Down
9 changes: 9 additions & 0 deletions evals/registry/solvers/defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,15 @@ human_cli:

# generation tasks

generation/direct/gpt-4o:
class: evals.solvers.providers.openai.openai_solver:OpenAISolver
args:
completion_fn_options:
model: gpt-4o
extra_options:
temperature: 0
max_tokens: 512

generation/direct/gpt-3.5-turbo:
class: evals.solvers.providers.openai.openai_solver:OpenAISolver
args:
Expand Down
53 changes: 50 additions & 3 deletions evals/registry/solvers/fixie.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ generation/direct/ultravox-70b:
completion_fn_options:
model: fixie-ai/ultravox-70B
extra_options:
temperature: 1.0
temperature: 0
# empirically, setting frequency_penalty to 1.0 seems to get us closer to the HF results
# but it's not clear why this is necessary
# TODO: investigate and try to match HF/VLLM results without this
Expand All @@ -16,16 +16,63 @@ generation/direct/ultravox-8b:
completion_fn_options:
model: fixie-ai/ultravox-8B
extra_options:
temperature: 1.0
temperature: 0
# empirically, setting frequency_penalty to 1.0 seems to get us closer to the HF results
# but it's not clear why this is necessary
# TODO: investigate and try to match HF/VLLM results without this
frequency_penalty: 1.0

generation/gpu/ultravox-hermes-70b:
class: evals.solvers.providers.fixie.fixie_solver:FixieSolver
args:
completion_fn_options:
model: fixie-ai/ultravox-hermes-70B-dev
extra_options:
temperature: 0
frequency_penalty: 1.0


generation/gpu/ultravox-dev:
class: evals.solvers.providers.fixie.local_gpu_solver:FixieGPUSolver
args:
model: fixie-ai/ultravox-dev
extra_options:
temperature: 0
max_new_tokens: 512
max_new_tokens: 256


generation/gpu/ultravox-v0_4_1-llama-3_1-70b:
class: evals.solvers.providers.fixie.local_gpu_solver:FixieGPUSolver
args:
model: fixie-ai/ultravox-v0_4_1-llama-3_1-70b
extra_options:
temperature: 0
max_new_tokens: 256


generation/gpu/ultravox-v0_4_1-llama-3_1-8b:
class: evals.solvers.providers.fixie.local_gpu_solver:FixieGPUSolver
args:
model: fixie-ai/ultravox-v0_4_1-llama-3_1-8b
extra_options:
temperature: 0
max_new_tokens: 256


generation/gpu/ultravox-hermes-8b:
class: evals.solvers.providers.fixie.local_gpu_solver:FixieGPUSolver
args:
model: fixie-ai/ultravox-v0_4_1-Hermes-3-Llama-3_1-8B
extra_options:
temperature: 0
max_new_tokens: 256



generation/gpu/ultravox-functionary-70b:
class: evals.solvers.providers.fixie.local_gpu_solver:FixieGPUSolver
args:
model: fixie-ai/ultravox-v0_4_1-llama-3_1-70b-functionary-medium-v3_1
extra_options:
temperature: 0
max_new_tokens: 256
7 changes: 7 additions & 0 deletions evals/registry/solvers/functionary.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
generation/gpu/functionary-medium:
  class: evals.solvers.providers.functionary.local_gpu_solver:FunctionaryGPUSolver
  args:
    model: meetkai/functionary-medium-v3.1
    extra_options:
      # Greedy decoding for reproducible results — matches the temperature
      # used by every other generation solver entry in this registry.
      temperature: 0
      max_new_tokens: 256
4 changes: 2 additions & 2 deletions evals/registry/solvers/groq.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ generation/direct/llama-3-groq-8b-chat:
completion_fn_options:
model: llama3-groq-8b-8192-tool-use-preview
extra_options:
temperature: 1
temperature: 0
max_tokens: 512

generation/direct/llama-3-groq-70b-chat:
Expand All @@ -31,5 +31,5 @@ generation/direct/llama-3-groq-70b-chat:
completion_fn_options:
model: llama3-groq-70b-8192-tool-use-preview
extra_options:
temperature: 1
temperature: 0
max_tokens: 512
Loading