Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Functionary Solver #37

Open
wants to merge 26 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
129 changes: 129 additions & 0 deletions evals/cli/oaieval-remote.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
"""
Modal-based remote execution of evals with GPU support.
"""
import argparse
import os
import sys
from typing import Optional, cast

import modal

# Create a Modal app
app = modal.App("oaieval-remote")

# Assumes you've set up this secret in Modal
# (both secrets must already exist in the Modal workspace; they provide the
# OpenAI and Hugging Face credentials to the remote container).
secrets = [
    modal.Secret.from_name("openai-secret"),
    modal.Secret.from_name("huggingface-secret"),
]

# Create a base image with evals and all its dependencies:
# Debian slim + git, plus the fixie-ai evals fork (tip of default branch) and
# accelerate. create_image() below can layer a commit-pinned install on top.
evals_base_image = (
    modal.Image.debian_slim(python_version="3.12.2")
    .apt_install("git")
    .pip_install("git+https://github.com/fixie-ai/evals.git", "accelerate")
)

def get_parser() -> argparse.ArgumentParser:
    """Build the CLI argument parser for remote (Modal) eval runs."""
    p = argparse.ArgumentParser(description="Run evals remotely through Modal")

    # Positional arguments.
    p.add_argument(
        "completion_fn",
        type=str,
        help="One or more CompletionFn URLs, separated by commas (,)",
    )
    p.add_argument("eval", type=str, help="Name of an eval. See registry.")

    # Modal-specific option: pin the evals code to a git commit.
    p.add_argument("--commit_hash", type=str, help="Git commit hash to use for the eval code")

    # Options forwarded to oaieval.
    p.add_argument(
        "--completion_args",
        type=str,
        default="",
        help="Specify additional parameters for completion_fn (e.g., 'key1=value1,key2=value2')",
    )
    p.add_argument("--extra_eval_params", type=str, default="")
    p.add_argument("--max_samples", type=int, default=None)
    p.add_argument("--cache", action=argparse.BooleanOptionalAction, default=True)
    p.add_argument("--seed", type=int, default=20220722)
    p.add_argument("--registry_path", type=str, default=None, action="append")
    p.add_argument("--debug", action=argparse.BooleanOptionalAction, default=False)
    p.add_argument(
        "--visible",
        action=argparse.BooleanOptionalAction,
        default=None,
        help="Whether to show samples during evaluation",
    )
    p.add_argument(
        "--user",
        type=str,
        default=os.getenv("USER", "unknown"),
        help="User running the eval",
    )
    p.add_argument(
        "--log_to_file",
        type=str,
        default=None,
        help="Log to a file instead of stdout",
    )
    return p

def create_image(commit_hash: Optional[str] = None) -> modal.Image:
    """Return the container image to run the eval in.

    With no commit hash the prebuilt base image is reused as-is; otherwise
    the evals package is reinstalled pinned to that commit, which also lets
    a commit bring in newly added dependencies.
    """
    if commit_hash:
        pinned = f"git+https://github.com/fixie-ai/evals.git@{commit_hash}"
        return evals_base_image.pip_install(pinned, "accelerate")
    return evals_base_image

# Define the function with base image and environment variables
@app.function(
    timeout=21600,
    container_idle_timeout=60,
    image=create_image(),
    secrets=secrets,
    gpu=modal.gpu.H100(count=4),
    volumes={
        "/root/.cache/huggingface": modal.Volume.from_name("hf-cache", create_if_missing=True)
    },
)
def run_eval(args_dict: dict) -> None:
    """Run one eval inside the Modal container by delegating to oaieval.

    `args_dict` is the vars() of the local CLI namespace; Modal-only keys are
    stripped and any OaiEvalArguments fields the remote CLI does not expose
    are filled with oaieval's defaults before calling `run`.
    """
    import os
    import sys

    print("Python path:", sys.path)
    print("HF cache contents:", os.listdir("/root/.cache/huggingface"))

    from evals.cli.oaieval import run, OaiEvalArguments

    # Strip Modal-specific keys that oaieval does not understand.
    for modal_only_key in ("commit_hash", "gpu"):
        args_dict.pop(modal_only_key, None)

    # Required OaiEvalArguments fields not exposed by this CLI, with the
    # same defaults oaieval.py itself uses.
    defaults = {
        "user": "",  # Default empty string as per oaieval.py
        "record_path": None,
        "log_to_file": None,
        "local_run": True,
        "http_run": False,
        "http_run_url": None,
        "http_batch_size": 100,
        "http_fail_percent_threshold": 5,
        "dry_run": False,
        "dry_run_logging": True,
    }
    for field, fallback in defaults.items():
        args_dict.setdefault(field, fallback)

    # Rebuild a namespace and hand off to the standard oaieval entry point.
    run(cast(OaiEvalArguments, argparse.Namespace(**args_dict)))

def main() -> None:
    """Parse CLI args locally and launch the eval remotely on Modal."""
    parser = get_parser()
    args = parser.parse_args()

    args_dict = vars(args)

    # Create image with specific commit
    # NOTE(review): reassigning `run_eval.image` after @app.function has
    # already decorated the function may have no effect — Modal normally
    # fixes the image at decoration time. Verify that a --commit_hash run
    # actually executes in the commit-pinned image.
    if args.commit_hash:
        run_eval.image = create_image(args.commit_hash)

    # Run the function with output enabled
    with modal.enable_output():
        with app.run():
            run_eval.remote(args_dict)

if __name__ == "__main__":
    main()

# Example usage:
#   oaieval-remote.py generation/gpu/ultravox-functionary-70b audio-translate-covost-en_de --commit_hash 755cb93b6e98abe4abef849b942fa34ac83a52af
7 changes: 6 additions & 1 deletion evals/cli/oaieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ def to_number(x: str) -> Union[int, float, str]:
**extra_eval_params,
)
result = eval.run(recorder)
add_token_usage_to_result(result, recorder)
# add_token_usage_to_result(result, recorder)
recorder.record_final_report(result)

if not (args.dry_run or args.local_run):
Expand All @@ -233,6 +233,11 @@ def to_number(x: str) -> Union[int, float, str]:
logger.info("Final report:")
for key, value in result.items():
logger.info(f"{key}: {value}")

print("Final Report:")
for key, value in result.items():
print(f"{key}: {value}")

return run_spec.run_id


Expand Down
9 changes: 9 additions & 0 deletions evals/registry/solvers/defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,15 @@ human_cli:

# generation tasks

generation/direct/gpt-4o:
class: evals.solvers.providers.openai.openai_solver:OpenAISolver
args:
completion_fn_options:
model: gpt-4o
extra_options:
temperature: 0
max_tokens: 512

generation/direct/gpt-3.5-turbo:
class: evals.solvers.providers.openai.openai_solver:OpenAISolver
args:
Expand Down
53 changes: 50 additions & 3 deletions evals/registry/solvers/fixie.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ generation/direct/ultravox-70b:
completion_fn_options:
model: fixie-ai/ultravox-70B
extra_options:
temperature: 1.0
temperature: 0
# empirically, setting frequency_penalty to 1.0 seems to get us closer to the HF results
# but it's not clear why this is necessary
# TODO: investigate and try to match HF/VLLM results without this
Expand All @@ -16,16 +16,63 @@ generation/direct/ultravox-8b:
completion_fn_options:
model: fixie-ai/ultravox-8B
extra_options:
temperature: 1.0
temperature: 0
# empirically, setting frequency_penalty to 1.0 seems to get us closer to the HF results
# but it's not clear why this is necessary
# TODO: investigate and try to match HF/VLLM results without this
frequency_penalty: 1.0

generation/gpu/ultravox-hermes-70b:
class: evals.solvers.providers.fixie.fixie_solver:FixieSolver
args:
completion_fn_options:
model: fixie-ai/ultravox-hermes-70B-dev
extra_options:
temperature: 0
frequency_penalty: 1.0


generation/gpu/ultravox-dev:
class: evals.solvers.providers.fixie.local_gpu_solver:FixieGPUSolver
args:
model: fixie-ai/ultravox-dev
extra_options:
temperature: 0
max_new_tokens: 512
max_new_tokens: 256


generation/gpu/ultravox-v0_4_1-llama-3_1-70b:
class: evals.solvers.providers.fixie.local_gpu_solver:FixieGPUSolver
args:
model: fixie-ai/ultravox-v0_4_1-llama-3_1-70b
extra_options:
temperature: 0
max_new_tokens: 256


generation/gpu/ultravox-v0_4_1-llama-3_1-8b:
class: evals.solvers.providers.fixie.local_gpu_solver:FixieGPUSolver
args:
model: fixie-ai/ultravox-v0_4_1-llama-3_1-8b
extra_options:
temperature: 0
max_new_tokens: 256


generation/gpu/ultravox-hermes-8b:
class: evals.solvers.providers.fixie.local_gpu_solver:FixieGPUSolver
args:
model: fixie-ai/ultravox-v0_4_1-Hermes-3-Llama-3_1-8B
extra_options:
temperature: 0
max_new_tokens: 256



generation/gpu/ultravox-functionary-70b:
class: evals.solvers.providers.fixie.local_gpu_solver:FixieGPUSolver
args:
model: fixie-ai/ultravox-v0_4_1-llama-3_1-70b-functionary-medium-v3_1
extra_options:
temperature: 0
max_new_tokens: 256
7 changes: 7 additions & 0 deletions evals/registry/solvers/functionary.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
generation/gpu/functionary-medium:
  class: evals.solvers.providers.functionary.local_gpu_solver:FunctionaryGPUSolver
  args:
    model: meetkai/functionary-medium-v3.1
    extra_options:
      # Greedy decoding for reproducible results — matches the temperature
      # used by every other generation solver entry in this registry.
      temperature: 0
      max_new_tokens: 256
4 changes: 2 additions & 2 deletions evals/registry/solvers/groq.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ generation/direct/llama-3-groq-8b-chat:
completion_fn_options:
model: llama3-groq-8b-8192-tool-use-preview
extra_options:
temperature: 1
temperature: 0
max_tokens: 512

generation/direct/llama-3-groq-70b-chat:
Expand All @@ -31,5 +31,5 @@ generation/direct/llama-3-groq-70b-chat:
completion_fn_options:
model: llama3-groq-70b-8192-tool-use-preview
extra_options:
temperature: 1
temperature: 0
max_tokens: 512
Loading