Hello.
After some research, I found that Weave supports tracking evaluations and benchmarks across frameworks/models, but I'm a bit confused because everything seems to go through @weave.op.
As I understand it, @weave.op is a decorator that traces a function's inputs and outputs. But I already use Langfuse to trace the input, output, latency, and cost of my LLM calls. All I want now is to track evaluations and benchmarks (prompts, framework config, …) and get a visualized report, and I couldn't find a pipeline example for that :?
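From the docs, my (possibly wrong) understanding is that the Weave route looks roughly like the sketch below, with the model and the scorers both wrapped in @weave.op. The project name, dataset rows, and function bodies are placeholders I made up, and I'm not sure I have the scorer signature right:

import asyncio

import weave

weave.init("my-eval-project")  # placeholder project name

# placeholder dataset rows, just to show the shape
dataset = [
    {"question": "…", "essay": "…", "human_rate": 6.5},
]

@weave.op()
def score_essay(question: str, essay: str) -> float:
    # placeholder: my real pipeline would generate the score here
    return 6.0

@weave.op()
def absolute_error(human_rate: float, output: float) -> dict:
    # scorer: compare the model output against the gold rating
    return {"abs_error": abs(human_rate - output)}

evaluation = weave.Evaluation(dataset=dataset, scorers=[absolute_error])
asyncio.run(evaluation.evaluate(score_essay))

And here is my current pipeline, which skips Weave and logs straight to W&B: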
import os

import pandas as pd


class Evaluator:
    # load_evaluation_data, score_generator, evaluate_criterion and TaskName are omitted for brevity
    def __init__(self, logger: WandbLogger):
        self.logger = logger

    async def __call__(self):
        # Load the evaluation set and keep only the first 3 samples per task for a quick run
        evaluation_data_path = os.getenv("EVALUATION_DATA_PATH")
        evaluation_data = self.load_evaluation_data(evaluation_data_path)
        evaluation_data = {key: value[:3] for key, value in evaluation_data.items()}

        # Generate predictions for every task in the evaluation set
        prediction_dict = {
            key: await self.score_generator(
                question=[item["question"] for item in evaluation_data[key]],
                essay=[item["essay"] for item in evaluation_data[key]],
                task_name=TaskName.TASK_2,
                do_sample=False,
                temperature=0,
            )
            for key in evaluation_data
        }

        # Score the predictions against the human ratings, per criterion
        evaluation_results = {}
        for key, predictions in prediction_dict.items():
            predictions = pd.DataFrame(predictions)
            golds = pd.DataFrame([item["human_rate"] for item in evaluation_data[key]])
            evaluation_results[key] = {
                criterion: self.evaluate_criterion(
                    predictions=predictions[criterion].tolist(),
                    golds=golds[criterion].tolist(),
                )
                for criterion in golds.columns
            }

        # Log the metrics together with the framework config to W&B
        self.logger.log(evaluation_result=evaluation_results, config=self.framework_config)
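For context, evaluation_results ends up as a nested dict keyed by task and then by criterion, something like this (the criterion names, metric names, and numbers here are made up, just to show the shape):

# illustrative only: the real criteria and metrics come from evaluate_criterion
evaluation_results = {
    "task_2": {
        "coherence": {"mae": 0.42, "correlation": 0.71},
        "grammar": {"mae": 0.38, "correlation": 0.69},
    },
}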
And this is the logger it uses:

import os

import wandb


class WandbLogger:
    def __init__(self):
        wandb.login(key=os.getenv("WANDB_API_KEY"))
        self.api = wandb.Api()
        self.entity = os.getenv("WANDB_ENTITY")
        self.project = os.getenv("WANDB_PROJECT_NAME")
        self.run = None

    def __start_run(self, config):
        self.run = wandb.init(
            entity=self.entity,
            project=self.project,
            config=config,
        )

    def __end_run(self):
        if self.run is not None:
            self.run.finish()

    def get_all_logs(self):
        """Fetch all the experiments in the project."""
        records = []
        runs = self.api.runs(f"{self.entity}/{self.project}")
        for run in runs:
            records.append(run.history())
        return records

    def log(self, evaluation_result, config):
        self.__start_run(config)
        # run.summary is a dict-like object, so update it rather than calling it
        self.run.summary.update(evaluation_result)
        self.__end_run()
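For completeness, this is roughly how I wire the two together (simplified; the real entry point has more setup):

import asyncio

# simplified entry point: run the evaluation once and push the metrics to W&B
logger = WandbLogger()
evaluator = Evaluator(logger)
asyncio.run(evaluator())

With this, the scores end up in each run's summary in the W&B UI, but I still don't see how to get Weave's evaluation/comparison views on top of it without wrapping everything in @weave.op.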