Remove global OpenAI client initialization
Importing the evals package from Python for use as a library, rather than as a
CLI tool, fails if OPENAI_API_KEY is not set. Normally this wouldn't be a
problem when evaluating an OpenAI model, but there are cases where we are not
evaluating an OpenAI model, and in those cases OPENAI_API_KEY may not be set
in advance.
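
For illustration, a minimal sketch of the pattern this commit moves toward:
the client is constructed inside the class that needs it rather than at module
import time. The class and method names below are illustrative, not the exact
evals code.

import os

from openai import OpenAI


class EmbeddingCompletionFn:
    """Illustrative sketch; the real completion functions live under evals/."""

    def __init__(self) -> None:
        # Building the client here means importing the module no longer
        # requires OPENAI_API_KEY; the key is only needed once the class
        # is actually instantiated and used.
        self.client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

    def embed(self, text: str, model: str = "text-embedding-ada-002") -> list[float]:
        # Same call shape as the retrieval completion function in this diff.
        return self.client.embeddings.create(model=model, input=text).data[0].embedding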

In addition, this PR updates some unit tests so that they either pass or are
skipped when the necessary API keys are not present in the environment.
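
As a sketch of the skipping approach (assuming pytest; the specific test below
is illustrative and not one of the tests changed in this PR):

import os

import pytest

# Skip API-dependent tests when no key is configured, so the suite still
# passes in environments without OPENAI_API_KEY.
requires_openai_key = pytest.mark.skipif(
    "OPENAI_API_KEY" not in os.environ,
    reason="OPENAI_API_KEY not set; skipping tests that call the OpenAI API.",
)


@requires_openai_key
def test_make_completion_fn():
    from evals.registry import Registry

    completion_fn = Registry().make_completion_fn("gpt-4")
    assert completion_fn is not None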
michaelAlvarino committed Jul 21, 2024
1 parent 234bcde commit 0c83411
Showing 14 changed files with 155 additions and 175 deletions.
4 changes: 2 additions & 2 deletions evals/completion_fns/retrieval.py
@@ -14,7 +14,6 @@
from evals.record import record_sampling
from evals.registry import Registry

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))


def load_embeddings(embeddings_and_text_path: str):
@@ -76,6 +75,7 @@ def __init__(
registry_path: The path to a registry file to add to default registry.
_kwargs: Additional arguments to pass to the completion function instantiation.
"""
self.client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
registry = Registry() if not registry else registry
if registry_path:
registry.add_registry_paths(registry_path)
@@ -96,7 +96,7 @@ def __call__(self, prompt: Union[str, list[dict]], **kwargs: Any) -> RetrievalCo
"""
# Embed the prompt
embedded_prompt = (
client.embeddings.create(
self.client.embeddings.create(
model=self.embedding_model, input=CompletionPrompt(prompt).to_formatted_prompt()
)
.data[0]
1 change: 0 additions & 1 deletion evals/elsuite/hr_ml_agent_bench/utils.py
@@ -13,7 +13,6 @@
from evals.solvers.solver import Solver
from evals.task_state import TaskState

client = OpenAI()
logger = logging.getLogger(__name__)


5 changes: 1 addition & 4 deletions evals/elsuite/make_me_pay/utils.py
@@ -4,9 +4,6 @@

from openai import OpenAI

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))


def is_system_msg(m: dict) -> bool:
assert isinstance(m, dict), "Message must be a dict."
assert "role" in m, "Message must have a role."
@@ -72,4 +69,4 @@ def model_output_empty_tags(message: str) -> bool:


def openai_chatcompletion_create(*args, **kwargs):
return client.chat.completions.create(*args, **kwargs)
return OpenAI(api_key=os.environ.get("OPENAI_API_KEY")).chat.completions.create(*args, **kwargs)
4 changes: 2 additions & 2 deletions evals/elsuite/self_prompting/eval.py
@@ -11,7 +11,7 @@
from evals.api import CompletionFn
from evals.elsuite.self_prompting.task_description import sample_in_token, task_description_template
from evals.eval import SolverEval
from evals.registry import registry
from evals.registry import Registry
from evals.solvers.solver import Solver
from evals.task_state import TaskState
from evals.utils.log_utils import extract_final_results, extract_spec
@@ -54,7 +54,7 @@ def __init__(

self.tasker_completion_fns = {}
for tasker_model in self.tasker_models:
self.tasker_completion_fns[tasker_model] = registry.make_completion_fn(tasker_model)
self.tasker_completion_fns[tasker_model] = Registry().make_completion_fn(tasker_model)

def eval_sample(self, solver: Solver, sample: Any, rng: random.Random):
if sample["stage"] == "prompting":
4 changes: 2 additions & 2 deletions evals/elsuite/self_prompting/scripts/dataset/compile_data.py
@@ -4,15 +4,15 @@
from eval_list import eval_list

import evals.data
from evals.registry import registry
from evals.registry import Registry

np.random.seed(42)
min_samples_per_dataset = 50
n_test_samples = 10

seen = set()
datarows = []
for eval in registry.get_evals("*"):
for eval in Registry().get_evals("*"):
if eval.key not in eval_list or eval.key in seen:
continue
seen.add(eval.key)
214 changes: 109 additions & 105 deletions evals/elsuite/skill_acquisition/test_skill_acquisition.py
@@ -11,108 +11,112 @@
)
from evals.registry import Registry

registry = Registry()

dummy_eval_spec = {
"eval_registry_path": Path("evals/registry"),
"completion_fns": [registry.make_completion_fn("gpt-4")],
"samples_jsonl": "skill_acquisition/miskito/miskito_test.jsonl",
"target_language": "miskito",
"n_samples": 5,
"knowledge_base_directory": "skill_acquisition/miskito/knowledge_base/",
"max_replies": 50,
}


def test_answer_detected():
assert answer_detected("[ANSWER foo]") is True
assert answer_detected("[ANSWER: foo]") is True
assert answer_detected("ANSWER foo") is False
assert answer_detected("[ANSWER foo") is False
assert answer_detected("ANSWER foo]") is False
assert answer_detected("[ANSWER foo][ANSWER bar]") is True


def test_view_instruction_detected():
SkillAcquisition(**dummy_eval_spec)
assert view_instruction_detected("[VIEW file1]") is True
assert view_instruction_detected("[VIEW: file1]") is True
assert view_instruction_detected("[VIEW file1 section1]") is True
assert view_instruction_detected("[VIEW: file1 section1]") is True
assert view_instruction_detected("VIEW file1") is False
assert view_instruction_detected("[VIEW file1") is False
assert view_instruction_detected("VIEW file1]") is False
assert view_instruction_detected("[VIEW file1][VIEW file2]") is True
assert view_instruction_detected("[VIEW: file1][VIEW: file2]") is True


def test_process_answer():
SkillAcquisition(**dummy_eval_spec)
assert process_answer("[ANSWER foo]") == "foo"
assert process_answer("[ANSWER: foo]") == "foo"
assert process_answer("[ANSWER foo bar baz]") == "foo bar baz"
assert process_answer("[ANSWER: foo bar baz]") == "foo bar baz"
assert process_answer("[ANSWER foo][ANSWER bar]") == "bar"
assert process_answer("[ANSWER foo][ANSWER bar") == "foo"


def test_process_view_instruction():
SkillAcquisition(**dummy_eval_spec)
assert process_view_instruction("[VIEW file1]") == ("file1", None)
assert process_view_instruction("[VIEW: file1]") == ("file1", None)
assert process_view_instruction("[VIEW file1 section1]") == (
"file1",
"section1",
)
assert process_view_instruction("[VIEW: file1 section1]") == (
"file1",
"section1",
)
assert process_view_instruction("[VIEW file1][VIEW file2]") == (
"file2",
None,
)
assert process_view_instruction("[VIEW: file1][VIEW: file2]") == (
"file2",
None,
)
assert process_view_instruction("[VIEW file1 section1][VIEW file2 section2]") == (
"file2",
"section2",
)


def test_process_view_instruction_spaces_and_quotes():
assert process_view_instruction("[VIEW file1 sectionpart1 sectionpart2]") == (
"file1",
"sectionpart1 sectionpart2",
)
assert process_view_instruction("[VIEW file1 sectionpart1 'sectionpart2']") == (
"file1",
"sectionpart1 'sectionpart2'",
)


def test_view_content():
skill_acquisition_eval = SkillAcquisition(**dummy_eval_spec)

# Create a file to view first.
filepath = skill_acquisition_eval.knowledge_base_directory / "test_file.jsonl"
with open(filepath, "w") as f:
f.write(json.dumps({"title": "foo", "content": "Test file contents."}) + "\n")

content, sections_visible_to_model, sections_viewed = skill_acquisition_eval._view_content(
"test_file.jsonl"
)
assert content == "Table of contents for test_file.jsonl: {'foo'}."
assert sections_visible_to_model == {"test_file.jsonl": {"foo"}}
assert sections_viewed == {"test_file.jsonl": {"Table of Contents"}}

content, sections_visible_to_model, sections_viewed = skill_acquisition_eval._view_content(
"test_file.jsonl", "foo"
)
assert content == "Test file contents."
assert sections_visible_to_model == {"test_file.jsonl": {"foo"}}
assert sections_viewed == {"test_file.jsonl": {"Table of Contents", "foo"}}

os.remove(filepath)



class TestSkillAcquisition():
def setup_class(self):
os.environ["OPENAI_API_KEY"] = "test"
self.registry = Registry()
self.dummy_eval_spec = {
"eval_registry_path": Path("evals/registry"),
"completion_fns": [self.registry.make_completion_fn("gpt-4")],
"samples_jsonl": "skill_acquisition/miskito/miskito_test.jsonl",
"target_language": "miskito",
"n_samples": 5,
"knowledge_base_directory": "skill_acquisition/miskito/knowledge_base/",
"max_replies": 50,
}

def test_answer_detected(self):
assert answer_detected("[ANSWER foo]") is True
assert answer_detected("[ANSWER: foo]") is True
assert answer_detected("ANSWER foo") is False
assert answer_detected("[ANSWER foo") is False
assert answer_detected("ANSWER foo]") is False
assert answer_detected("[ANSWER foo][ANSWER bar]") is True


def test_view_instruction_detected(self):
SkillAcquisition(**self.dummy_eval_spec)
assert view_instruction_detected("[VIEW file1]") is True
assert view_instruction_detected("[VIEW: file1]") is True
assert view_instruction_detected("[VIEW file1 section1]") is True
assert view_instruction_detected("[VIEW: file1 section1]") is True
assert view_instruction_detected("VIEW file1") is False
assert view_instruction_detected("[VIEW file1") is False
assert view_instruction_detected("VIEW file1]") is False
assert view_instruction_detected("[VIEW file1][VIEW file2]") is True
assert view_instruction_detected("[VIEW: file1][VIEW: file2]") is True


def test_process_answer(self):
SkillAcquisition(**self.dummy_eval_spec)
assert process_answer("[ANSWER foo]") == "foo"
assert process_answer("[ANSWER: foo]") == "foo"
assert process_answer("[ANSWER foo bar baz]") == "foo bar baz"
assert process_answer("[ANSWER: foo bar baz]") == "foo bar baz"
assert process_answer("[ANSWER foo][ANSWER bar]") == "bar"
assert process_answer("[ANSWER foo][ANSWER bar") == "foo"


def test_process_view_instruction(self):
SkillAcquisition(**self.dummy_eval_spec)
assert process_view_instruction("[VIEW file1]") == ("file1", None)
assert process_view_instruction("[VIEW: file1]") == ("file1", None)
assert process_view_instruction("[VIEW file1 section1]") == (
"file1",
"section1",
)
assert process_view_instruction("[VIEW: file1 section1]") == (
"file1",
"section1",
)
assert process_view_instruction("[VIEW file1][VIEW file2]") == (
"file2",
None,
)
assert process_view_instruction("[VIEW: file1][VIEW: file2]") == (
"file2",
None,
)
assert process_view_instruction("[VIEW file1 section1][VIEW file2 section2]") == (
"file2",
"section2",
)


def test_process_view_instruction_spaces_and_quotes(self):
assert process_view_instruction("[VIEW file1 sectionpart1 sectionpart2]") == (
"file1",
"sectionpart1 sectionpart2",
)
assert process_view_instruction("[VIEW file1 sectionpart1 'sectionpart2']") == (
"file1",
"sectionpart1 'sectionpart2'",
)


def test_view_content(self):
skill_acquisition_eval = SkillAcquisition(**self.dummy_eval_spec)

# Create a file to view first.
filepath = skill_acquisition_eval.knowledge_base_directory / "test_file.jsonl"
with open(filepath, "w") as f:
f.write(json.dumps({"title": "foo", "content": "Test file contents."}) + "\n")

content, sections_visible_to_model, sections_viewed = skill_acquisition_eval._view_content(
"test_file.jsonl"
)
assert content == "Table of contents for test_file.jsonl: {'foo'}."
assert sections_visible_to_model == {"test_file.jsonl": {"foo"}}
assert sections_viewed == {"test_file.jsonl": {"Table of Contents"}}

content, sections_visible_to_model, sections_viewed = skill_acquisition_eval._view_content(
"test_file.jsonl", "foo"
)
assert content == "Test file contents."
assert sections_visible_to_model == {"test_file.jsonl": {"foo"}}
assert sections_viewed == {"test_file.jsonl": {"Table of Contents", "foo"}}

os.remove(filepath)
26 changes: 0 additions & 26 deletions evals/record.py
@@ -263,14 +263,6 @@ def record_final_report(self, final_report: Any):
logging.info(f"Final report: {final_report}. Not writing anywhere.")


def _green(str):
return f"\033[1;32m{str}\033[0m"


def _red(str):
return f"\033[1;31m{str}\033[0m"


class DummyRecorder(RecorderBase):
"""
A "recorder" which only logs certain events to the console.
@@ -282,33 +274,15 @@ def __init__(self, run_spec: RunSpec, log: bool = True):
self.log = log

def record_event(self, type, data, sample_id=None):
from evals.registry import registry

if self.run_spec is None:
return

base_eval_spec = registry.get_base_eval(self.run_spec.base_eval)
if base_eval_spec and len(base_eval_spec.metrics) >= 1:
primary_metric = base_eval_spec.metrics[0]
else:
primary_metric = "accuracy"

with self._event_lock:
event = self._create_event(type, data)
self._events.append(event)

msg = f"Not recording event: {event}"

if type == "match":
accuracy_good = (
primary_metric == "accuracy" or primary_metric.startswith("pass@")
) and (data.get("correct", False) or data.get("accuracy", 0) > 0.5)
f1_score_good = primary_metric == "f1_score" and data.get("f1_score", 0) > 0.5
if accuracy_good or f1_score_good:
msg = _green(msg)
else:
msg = _red(msg)

if self.log:
logging.info(msg)

