```python
class ReportCase(BaseModel):
    """A single case in an evaluation report."""

    name: str
    """The name of the [case][pydantic_evals.Case]."""
    inputs: Any
    """The inputs to the task, from [`Case.inputs`][pydantic_evals.Case.inputs]."""
    metadata: Any
    """Any metadata associated with the case, from [`Case.metadata`][pydantic_evals.Case.metadata]."""
    expected_output: Any
    """The expected output of the task, from [`Case.expected_output`][pydantic_evals.Case.expected_output]."""
    output: Any
    """The output of the task execution."""

    metrics: dict[str, float | int]
    attributes: dict[str, Any]

    scores: dict[str, EvaluationResult[int | float]] = field(init=False)
    labels: dict[str, EvaluationResult[str]] = field(init=False)
    assertions: dict[str, EvaluationResult[bool]] = field(init=False)

    task_duration: float
    total_duration: float  # includes evaluator execution time

    # TODO(DavidM): Drop these once we can reference child spans in details panel:
    trace_id: str
    span_id: str
```
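Each `ReportCase` bundles what was recorded for one evaluated case: the task's inputs and output, evaluator results split into `scores`, `labels`, and `assertions`, plus timing and tracing identifiers. A minimal sketch of inspecting these fields, assuming `report` is an `EvaluationReport` produced by running an evaluation:

```python
# Sketch: walk the cases of an existing report and summarize the results per case.
# Assumes `report` is an EvaluationReport; the field names below come from the
# ReportCase model shown above.
for case in report.cases:
    passed = sum(1 for assertion in case.assertions.values() if assertion.value)
    print(f'{case.name}: {passed}/{len(case.assertions)} assertions passed')
    for score_name, result in case.scores.items():
        print(f'  score {score_name} = {result.value}')
    print(f'  task took {case.task_duration:.3f}s (total {case.total_duration:.3f}s)')
```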
```python
class ReportCaseAggregate(BaseModel):
    """A synthetic case that summarizes a set of cases."""

    name: str
    scores: dict[str, float | int]
    labels: dict[str, dict[str, float]]
    metrics: dict[str, float | int]
    assertions: float | None
    task_duration: float
    total_duration: float

    @staticmethod
    def average(cases: list[ReportCase]) -> ReportCaseAggregate:
        """Produce a synthetic "summary" case by averaging quantitative attributes."""
        num_cases = len(cases)
        if num_cases == 0:
            return ReportCaseAggregate(
                name='Averages',
                scores={},
                labels={},
                metrics={},
                assertions=None,
                task_duration=0.0,
                total_duration=0.0,
            )

        def _scores_averages(scores_by_name: list[dict[str, int | float | bool]]) -> dict[str, float]:
            counts_by_name: dict[str, int] = defaultdict(int)
            sums_by_name: dict[str, float] = defaultdict(float)
            for sbn in scores_by_name:
                for name, score in sbn.items():
                    counts_by_name[name] += 1
                    sums_by_name[name] += score
            return {name: sums_by_name[name] / counts_by_name[name] for name in sums_by_name}

        def _labels_averages(labels_by_name: list[dict[str, str]]) -> dict[str, dict[str, float]]:
            counts_by_name: dict[str, int] = defaultdict(int)
            sums_by_name: dict[str, dict[str, float]] = defaultdict(lambda: defaultdict(float))
            for lbn in labels_by_name:
                for name, label in lbn.items():
                    counts_by_name[name] += 1
                    sums_by_name[name][label] += 1
            return {
                name: {value: count / counts_by_name[name] for value, count in sums_by_name[name].items()}
                for name in sums_by_name
            }

        average_task_duration = sum(case.task_duration for case in cases) / num_cases
        average_total_duration = sum(case.total_duration for case in cases) / num_cases

        # average_assertions: dict[str, float] = _scores_averages([{k: v.value for k, v in case.scores.items()} for case in cases])
        average_scores: dict[str, float] = _scores_averages(
            [{k: v.value for k, v in case.scores.items()} for case in cases]
        )
        average_labels: dict[str, dict[str, float]] = _labels_averages(
            [{k: v.value for k, v in case.labels.items()} for case in cases]
        )
        average_metrics: dict[str, float] = _scores_averages([case.metrics for case in cases])

        average_assertions: float | None = None
        n_assertions = sum(len(case.assertions) for case in cases)
        if n_assertions > 0:
            n_passing = sum(1 for case in cases for assertion in case.assertions.values() if assertion.value)
            average_assertions = n_passing / n_assertions

        return ReportCaseAggregate(
            name='Averages',
            scores=average_scores,
            labels=average_labels,
            metrics=average_metrics,
            assertions=average_assertions,
            task_duration=average_task_duration,
            total_duration=average_total_duration,
        )
```
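Aggregation behaves differently per column type: scores and metrics are averaged arithmetically per name, assertions collapse into a single pass rate, and each label becomes a distribution of how often each value occurred. A self-contained sketch of the label rule, mirroring `_labels_averages` above with made-up data:

```python
from collections import defaultdict

# Two cases labelled {'tone': 'formal'} and one labelled {'tone': 'casual'} average
# to a frequency distribution per label name.
labels_by_case = [{'tone': 'formal'}, {'tone': 'formal'}, {'tone': 'casual'}]

counts: dict[str, int] = defaultdict(int)
sums: dict[str, dict[str, float]] = defaultdict(lambda: defaultdict(float))
for labels in labels_by_case:
    for name, label in labels.items():
        counts[name] += 1
        sums[name][label] += 1

print({name: {value: n / counts[name] for value, n in sums[name].items()} for name in sums})
#> {'tone': {'formal': 0.6666666666666666, 'casual': 0.3333333333333333}}
```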
```python
class EvaluationReport(BaseModel):
    """A report of the results of evaluating a model on a set of cases."""

    name: str
    """The name of the report."""
    cases: list[ReportCase]
    """The cases in the report."""

    def averages(self) -> ReportCaseAggregate:
        return ReportCaseAggregate.average(self.cases)

    def print(
        self,
        width: int | None = None,
        baseline: EvaluationReport | None = None,
        include_input: bool = False,
        include_metadata: bool = False,
        include_expected_output: bool = False,
        include_output: bool = False,
        include_durations: bool = True,
        include_total_duration: bool = False,
        include_removed_cases: bool = False,
        include_averages: bool = True,
        input_config: RenderValueConfig | None = None,
        metadata_config: RenderValueConfig | None = None,
        output_config: RenderValueConfig | None = None,
        score_configs: dict[str, RenderNumberConfig] | None = None,
        label_configs: dict[str, RenderValueConfig] | None = None,
        metric_configs: dict[str, RenderNumberConfig] | None = None,
        duration_config: RenderNumberConfig | None = None,
    ):  # pragma: no cover
        """Print this report to the console, optionally comparing it to a baseline report.

        If you want more control over the output, use `console_table` instead and pass it to
        `rich.Console.print`.
        """
        table = self.console_table(
            baseline=baseline,
            include_input=include_input,
            include_metadata=include_metadata,
            include_expected_output=include_expected_output,
            include_output=include_output,
            include_durations=include_durations,
            include_total_duration=include_total_duration,
            include_removed_cases=include_removed_cases,
            include_averages=include_averages,
            input_config=input_config,
            metadata_config=metadata_config,
            output_config=output_config,
            score_configs=score_configs,
            label_configs=label_configs,
            metric_configs=metric_configs,
            duration_config=duration_config,
        )
        Console(width=width).print(table)

    def console_table(
        self,
        baseline: EvaluationReport | None = None,
        include_input: bool = False,
        include_metadata: bool = False,
        include_expected_output: bool = False,
        include_output: bool = False,
        include_durations: bool = True,
        include_total_duration: bool = False,
        include_removed_cases: bool = False,
        include_averages: bool = True,
        input_config: RenderValueConfig | None = None,
        metadata_config: RenderValueConfig | None = None,
        output_config: RenderValueConfig | None = None,
        score_configs: dict[str, RenderNumberConfig] | None = None,
        label_configs: dict[str, RenderValueConfig] | None = None,
        metric_configs: dict[str, RenderNumberConfig] | None = None,
        duration_config: RenderNumberConfig | None = None,
    ) -> Table:
        """Return a table containing the data from this report, or the diff between this report and a baseline report.

        Optionally include input and output details.
        """
        renderer = EvaluationRenderer(
            include_input=include_input,
            include_metadata=include_metadata,
            include_expected_output=include_expected_output,
            include_output=include_output,
            include_durations=include_durations,
            include_total_duration=include_total_duration,
            include_removed_cases=include_removed_cases,
            include_averages=include_averages,
            input_config={**_DEFAULT_VALUE_CONFIG, **(input_config or {})},
            metadata_config={**_DEFAULT_VALUE_CONFIG, **(metadata_config or {})},
            output_config=output_config or _DEFAULT_VALUE_CONFIG,
            score_configs=score_configs or {},
            label_configs=label_configs or {},
            metric_configs=metric_configs or {},
            duration_config=duration_config or _DEFAULT_DURATION_CONFIG,
        )
        if baseline is None:
            return renderer.build_table(self)
        else:  # pragma: no cover
            return renderer.build_diff_table(self, baseline)

    def __str__(self) -> str:
        """Return a string representation of the report."""
        table = self.console_table()
        io_file = StringIO()
        Console(file=io_file).print(table)
        return io_file.getvalue()
```
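In typical usage you don't construct an `EvaluationReport` by hand; it is returned by running an evaluation, and `print` / `console_table` control how it is rendered. A minimal sketch, assuming the usual `Dataset` / `Case` workflow from `pydantic_evals` (the task function and case data here are made up):

```python
from pydantic_evals import Case, Dataset
from rich.console import Console


async def answer(question: str) -> str:
    # Hypothetical task under evaluation.
    return 'Paris'


dataset = Dataset(cases=[Case(name='capital', inputs='What is the capital of France?', expected_output='Paris')])
report = dataset.evaluate_sync(answer)

# Quick console output with a couple of extra columns enabled.
report.print(include_input=True, include_output=True)

# For more control, build the rich Table yourself and render it with your own Console.
table = report.console_table(include_durations=False)
Console(width=120).print(table)
```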
A configuration for rendering values in an Evaluation report.
Source code in pydantic_evals/pydantic_evals/reporting/__init__.py
```python
class RenderValueConfig(TypedDict, total=False):
    """A configuration for rendering values in an Evaluation report."""

    value_formatter: str | Callable[[Any], str]
    diff_checker: Callable[[Any, Any], bool] | None
    diff_formatter: Callable[[Any, Any], str | None] | None
    diff_style: str
```
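`RenderValueConfig` is a `TypedDict` with `total=False`, so any subset of its keys may be supplied. These configs are consumed through the `input_config`, `metadata_config`, `output_config`, and `label_configs` parameters of `EvaluationReport.print` / `console_table`. A small sketch, assuming `report` is an existing `EvaluationReport` whose evaluators produce a `'tone'` label:

```python
import json

report.print(
    include_input=True,
    # Render case inputs as compact JSON rather than the default representation.
    input_config={'value_formatter': lambda v: json.dumps(v, default=str)},
    # diff_style only takes effect when rendering a diff against a baseline report.
    label_configs={'tone': {'diff_style': 'bold yellow'}},
)
```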
```python
class RenderNumberConfig(TypedDict, total=False):
    """A configuration for rendering a particular score or metric in an Evaluation report.

    See the implementation of `_RenderNumber` for more clarity on how these parameters affect the rendering.
    """

    value_formatter: str | Callable[[float | int], str]
    """The logic to use for formatting values.

    * If not provided, format as ints if all values are ints, otherwise at least one decimal place and at least
      four significant figures.
    * You can also use a custom string format spec, e.g. '{:.3f}'
    * You can also use a custom function, e.g. lambda x: f'{x:.3f}'
    """
    diff_formatter: str | Callable[[float | int, float | int], str | None] | None
    """The logic to use for formatting details about the diff.

    The strings produced by the value_formatter will always be included in the reports, but the diff_formatter is
    used to produce additional text about the difference between the old and new values, such as the absolute or
    relative difference.

    * If not provided, format as ints if all values are ints, otherwise at least one decimal place and at least
      four significant figures, and will include the percentage change.
    * You can also use a custom string format spec, e.g. '{:+.3f}'
    * You can also use a custom function, e.g. lambda x: f'{x:+.3f}'.
      If this function returns None, no extra diff text will be added.
    * You can also use None to never generate extra diff text.
    """
    diff_atol: float
    """The absolute tolerance for considering a difference "significant".

    A difference is "significant" if `abs(new - old) < self.diff_atol + self.diff_rtol * abs(old)`.

    If a difference is not significant, it will not have the diff styles applied. Note that we still show
    both the rendered before and after values in the diff any time they differ, even if the difference is not
    significant. (If the rendered values are exactly the same, we only show the value once.)

    If not provided, use 1e-6.
    """
    diff_rtol: float
    """The relative tolerance for considering a difference "significant".

    See the description of `diff_atol` for more details about what makes a difference "significant".

    If not provided, use 0.001 if all values are ints, otherwise 0.05.
    """
    diff_increase_style: str
    """The style to apply to diffed values that have a significant increase.

    See the description of `diff_atol` for more details about what makes a difference "significant".

    If not provided, use green for scores and red for metrics. You can also use arbitrary `rich` styles, such as "bold red".
    """
    diff_decrease_style: str
    """The style to apply to diffed values that have a significant decrease.

    See the description of `diff_atol` for more details about what makes a difference "significant".

    If not provided, use red for scores and green for metrics. You can also use arbitrary `rich` styles, such as "bold red".
    """
```
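Numeric columns follow the same pattern through `score_configs` and `metric_configs`. A sketch, assuming a report whose evaluators record an `'accuracy'` score in the 0–1 range and a `'response_tokens'` metric:

```python
report.print(
    score_configs={
        'accuracy': {
            'value_formatter': '{:.2%}',  # e.g. 0.8 renders as '80.00%'
            'diff_atol': 0.01,  # treat changes under one percentage point as insignificant
        },
    },
    metric_configs={
        'response_tokens': {'value_formatter': '{:.0f}'},
    },
)
```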
```python
@dataclass
class EvaluationRenderer:
    """A class for rendering an EvalReport or the diff between two EvalReports."""

    # Columns to include
    include_input: bool
    include_metadata: bool
    include_expected_output: bool
    include_output: bool
    include_durations: bool
    include_total_duration: bool

    # Rows to include
    include_removed_cases: bool
    include_averages: bool

    input_config: RenderValueConfig
    metadata_config: RenderValueConfig
    output_config: RenderValueConfig
    score_configs: dict[str, RenderNumberConfig]
    label_configs: dict[str, RenderValueConfig]
    metric_configs: dict[str, RenderNumberConfig]
    duration_config: RenderNumberConfig

    def include_scores(self, report: EvaluationReport, baseline: EvaluationReport | None = None):
        return any(case.scores for case in self._all_cases(report, baseline))

    def include_labels(self, report: EvaluationReport, baseline: EvaluationReport | None = None):
        return any(case.labels for case in self._all_cases(report, baseline))

    def include_metrics(self, report: EvaluationReport, baseline: EvaluationReport | None = None):
        return any(case.metrics for case in self._all_cases(report, baseline))

    def include_assertions(self, report: EvaluationReport, baseline: EvaluationReport | None = None):
        return any(case.assertions for case in self._all_cases(report, baseline))

    def _all_cases(self, report: EvaluationReport, baseline: EvaluationReport | None) -> list[ReportCase]:
        if not baseline:
            return report.cases
        else:
            return report.cases + self._baseline_cases_to_include(report, baseline)

    def _baseline_cases_to_include(self, report: EvaluationReport, baseline: EvaluationReport) -> list[ReportCase]:
        if self.include_removed_cases:
            return baseline.cases
        report_case_names = {case.name for case in report.cases}
        return [case for case in baseline.cases if case.name in report_case_names]

    def _get_case_renderer(
        self, report: EvaluationReport, baseline: EvaluationReport | None = None
    ) -> ReportCaseRenderer:
        input_renderer = _ValueRenderer.from_config(self.input_config)
        metadata_renderer = _ValueRenderer.from_config(self.metadata_config)
        output_renderer = _ValueRenderer.from_config(self.output_config)
        score_renderers = self._infer_score_renderers(report, baseline)
        label_renderers = self._infer_label_renderers(report, baseline)
        metric_renderers = self._infer_metric_renderers(report, baseline)
        duration_renderer = _NumberRenderer.infer_from_config(
            self.duration_config, 'duration', [x.task_duration for x in self._all_cases(report, baseline)]
        )

        return ReportCaseRenderer(
            include_input=self.include_input,
            include_metadata=self.include_metadata,
            include_expected_output=self.include_expected_output,
            include_output=self.include_output,
            include_scores=self.include_scores(report, baseline),
            include_labels=self.include_labels(report, baseline),
            include_metrics=self.include_metrics(report, baseline),
            include_assertions=self.include_assertions(report, baseline),
            include_durations=self.include_durations,
            include_total_duration=self.include_total_duration,
            input_renderer=input_renderer,
            metadata_renderer=metadata_renderer,
            output_renderer=output_renderer,
            score_renderers=score_renderers,
            label_renderers=label_renderers,
            metric_renderers=metric_renderers,
            duration_renderer=duration_renderer,
        )

    def build_table(self, report: EvaluationReport) -> Table:
        case_renderer = self._get_case_renderer(report)
        table = case_renderer.build_base_table(f'Evaluation Summary: {report.name}')
        for case in report.cases:
            table.add_row(*case_renderer.build_row(case))

        if self.include_averages:
            average = report.averages()
            table.add_row(*case_renderer.build_aggregate_row(average))
        return table

    def build_diff_table(self, report: EvaluationReport, baseline: EvaluationReport) -> Table:
        report_cases = report.cases
        baseline_cases = self._baseline_cases_to_include(report, baseline)

        report_cases_by_id = {case.name: case for case in report_cases}
        baseline_cases_by_id = {case.name: case for case in baseline_cases}

        diff_cases: list[tuple[ReportCase, ReportCase]] = []
        removed_cases: list[ReportCase] = []
        added_cases: list[ReportCase] = []

        for case_id in sorted(set(baseline_cases_by_id.keys()) | set(report_cases_by_id.keys())):
            maybe_baseline_case = baseline_cases_by_id.get(case_id)
            maybe_report_case = report_cases_by_id.get(case_id)
            if maybe_baseline_case and maybe_report_case:
                diff_cases.append((maybe_baseline_case, maybe_report_case))
            elif maybe_baseline_case:
                removed_cases.append(maybe_baseline_case)
            elif maybe_report_case:
                added_cases.append(maybe_report_case)
            else:  # pragma: no cover
                assert False, 'This should be unreachable'

        case_renderer = self._get_case_renderer(report, baseline)
        diff_name = baseline.name if baseline.name == report.name else f'{baseline.name} → {report.name}'
        table = case_renderer.build_base_table(f'Evaluation Diff: {diff_name}')

        for baseline_case, new_case in diff_cases:
            table.add_row(*case_renderer.build_diff_row(new_case, baseline_case))
        for case in added_cases:
            row = case_renderer.build_row(case)
            row[0] = f'[green]+ Added Case[/]\n{row[0]}'
            table.add_row(*row)
        for case in removed_cases:
            row = case_renderer.build_row(case)
            row[0] = f'[red]- Removed Case[/]\n{row[0]}'
            table.add_row(*row)

        if self.include_averages:
            report_average = ReportCaseAggregate.average(report_cases)
            baseline_average = ReportCaseAggregate.average(baseline_cases)
            table.add_row(*case_renderer.build_diff_aggregate_row(report_average, baseline_average))
        return table

    def _infer_score_renderers(
        self, report: EvaluationReport, baseline: EvaluationReport | None
    ) -> dict[str, _NumberRenderer]:
        all_cases = self._all_cases(report, baseline)

        values_by_name: dict[str, list[float | int]] = {}
        for case in all_cases:
            for k, score in case.scores.items():
                values_by_name.setdefault(k, []).append(score.value)

        all_renderers: dict[str, _NumberRenderer] = {}
        for name, values in values_by_name.items():
            merged_config = _DEFAULT_NUMBER_CONFIG.copy()
            merged_config.update(self.score_configs.get(name, {}))
            all_renderers[name] = _NumberRenderer.infer_from_config(merged_config, 'score', values)
        return all_renderers

    def _infer_label_renderers(
        self, report: EvaluationReport, baseline: EvaluationReport | None
    ) -> dict[str, _ValueRenderer]:
        all_cases = self._all_cases(report, baseline)

        all_names: set[str] = set()
        for case in all_cases:
            for k in case.labels:
                all_names.add(k)

        all_renderers: dict[str, _ValueRenderer] = {}
        for name in all_names:
            merged_config = _DEFAULT_VALUE_CONFIG.copy()
            merged_config.update(self.label_configs.get(name, {}))
            all_renderers[name] = _ValueRenderer.from_config(merged_config)
        return all_renderers

    def _infer_metric_renderers(
        self, report: EvaluationReport, baseline: EvaluationReport | None
    ) -> dict[str, _NumberRenderer]:
        all_cases = self._all_cases(report, baseline)

        values_by_name: dict[str, list[float | int]] = {}
        for case in all_cases:
            for k, v in case.metrics.items():
                values_by_name.setdefault(k, []).append(v)

        all_renderers: dict[str, _NumberRenderer] = {}
        for name, values in values_by_name.items():
            merged_config = _DEFAULT_NUMBER_CONFIG.copy()
            merged_config.update(self.metric_configs.get(name, {}))
            all_renderers[name] = _NumberRenderer.infer_from_config(merged_config, 'metric', values)
        return all_renderers

    def _infer_duration_renderer(
        self, report: EvaluationReport, baseline: EvaluationReport | None
    ) -> _NumberRenderer:  # pragma: no cover
        all_cases = self._all_cases(report, baseline)
        all_durations = [x.task_duration for x in all_cases]
        if self.include_total_duration:
            all_durations += [x.total_duration for x in all_cases]
        return _NumberRenderer.infer_from_config(self.duration_config, 'duration', all_durations)
```
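`EvaluationRenderer` is normally used indirectly: passing `baseline=` to `EvaluationReport.print` or `console_table` routes through `build_diff_table`, which matches cases by name, flags added and removed cases, and diffs the rest. A sketch comparing two runs of the same dataset (the `old_task` and `new_task` functions are hypothetical):

```python
baseline_report = dataset.evaluate_sync(old_task)
new_report = dataset.evaluate_sync(new_task)

# Render a per-case diff, including cases that only exist in the baseline.
new_report.print(baseline=baseline_report, include_removed_cases=True)
```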