evals#
- class AnthropicModel(default_concurrency: int = 20, _verbose: bool = False, _rate_limiter: phoenix.evals.models.rate_limiters.RateLimiter = <factory>, model: str = 'claude-2.1', temperature: float = 0.0, max_tokens: int = 256, top_p: float = 1, top_k: int = 256, stop_sequences: List[str] = <factory>, extra_parameters: Dict[str, Any] = <factory>, max_content_size: Optional[int] = None)#
Bases:
BaseModel
- extra_parameters: Dict[str, Any]#
Any extra parameters to add to the request body (e.g., countPenalty for AI21 models).
- invocation_parameters() Dict[str, Any] #
- max_content_size: int | None = None#
If you’re using a fine-tuned model, set this to the maximum content size
- max_tokens: int = 256#
The maximum number of tokens to generate in the completion.
- model: str = 'claude-2.1'#
The model name to use.
- stop_sequences: List[str]#
If the model encounters a stop sequence, it stops generating further tokens.
- temperature: float = 0.0#
What sampling temperature to use.
- top_k: int = 256#
The cutoff where the model no longer selects tokens: only the top_k most probable tokens are considered at each step.
- top_p: float = 1#
Total probability mass of tokens to consider at each step.
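Example (a minimal construction sketch using only the parameters documented above; assumes an Anthropic API key is configured in the environment):

```python
from phoenix.evals import AnthropicModel

# A minimal sketch: construct the model with the defaults documented above.
model = AnthropicModel(
    model="claude-2.1",
    temperature=0.0,
    max_tokens=256,
)
```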
- class BedrockModel(default_concurrency: int = 20, _verbose: bool = False, _rate_limiter: phoenix.evals.models.rate_limiters.RateLimiter = <factory>, model_id: str = 'anthropic.claude-v2', temperature: float = 0.0, max_tokens: int = 256, top_p: float = 1, top_k: int = 256, stop_sequences: List[str] = <factory>, session: Any = None, client: Any = None, max_content_size: Optional[int] = None, extra_parameters: Dict[str, Any] = <factory>)#
Bases:
BaseModel
- client: Any = None#
The bedrock session client. If unset, a new one is created with boto3.
- extra_parameters: Dict[str, Any]#
Any extra parameters to add to the request body (e.g., countPenalty for AI21 models).
- max_content_size: int | None = None#
If you’re using a fine-tuned model, set this to the maximum content size
- max_tokens: int = 256#
The maximum number of tokens to generate in the completion.
- model_id: str = 'anthropic.claude-v2'#
The model name to use.
- session: Any = None#
A bedrock session. If provided, a new bedrock client will be created using this session.
- stop_sequences: List[str]#
If the model encounters a stop sequence, it stops generating further tokens.
- temperature: float = 0.0#
What sampling temperature to use.
- top_k: int = 256#
The cutoff where the model no longer selects tokens: only the top_k most probable tokens are considered at each step.
- top_p: float = 1#
Total probability mass of tokens to consider at each step.
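Example (a sketch assuming AWS credentials are already configured; the region name is illustrative):

```python
import boto3

from phoenix.evals import BedrockModel

# A sketch: pass an existing boto3 session so a Bedrock client is created from it.
session = boto3.Session(region_name="us-west-2")
model = BedrockModel(
    model_id="anthropic.claude-v2",
    session=session,
)
```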
- class ClassificationTemplate(rails: List[str], template: str, explanation_template: str | None = None, explanation_label_parser: Callable[[str], str] | None = None, delimiters: Tuple[str, str] = ('{', '}'), scores: List[float] | None = None)#
Bases:
PromptTemplate
- extract_label_from_explanation(raw_string: str) str #
- prompt(options: PromptOptions | None = None) str #
- score(rail: str) float #
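Example (a sketch of a custom binary template; the rails, template text, and scores are illustrative rather than a built-in Phoenix template):

```python
from phoenix.evals import ClassificationTemplate

# A sketch: a two-rail template with scores mapped to each rail.
template = ClassificationTemplate(
    rails=["correct", "incorrect"],
    template=(
        "Question: {input}\n"
        "Answer: {output}\n"
        "Label the answer as correct or incorrect."
    ),
    scores=[1.0, 0.0],
)
prompt_text = template.prompt()    # renders the template text
score = template.score("correct")  # maps a rail to its score (1.0 here)
```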
- class GeminiModel(default_concurrency: int = 5, _verbose: bool = False, _rate_limiter: phoenix.evals.models.rate_limiters.RateLimiter = <factory>, project: Optional[str] = None, location: Optional[str] = None, credentials: Optional[ForwardRef('Credentials')] = None, model: str = 'gemini-pro', temperature: float = 0.0, max_tokens: int = 256, top_p: float = 1, top_k: int = 32, stop_sequences: List[str] = <factory>)#
Bases:
BaseModel
- credentials: Credentials | None = None#
- default_concurrency: int = 5#
- property generation_config: Dict[str, Any]#
- location: str | None = None#
The default location to use when making API calls. If not set, a default location is used.
- max_tokens: int = 256#
The maximum number of tokens to generate in the completion.
- model: str = 'gemini-pro'#
The model name to use.
- project: str | None = None#
The default project to use when making API calls.
- reload_client() None #
- stop_sequences: List[str]#
If the model encounters a stop sequence, it stops generating further tokens.
- temperature: float = 0.0#
What sampling temperature to use.
- top_k: int = 32#
The cutoff where the model no longer selects tokens: only the top_k most probable tokens are considered at each step.
- top_p: float = 1#
Total probability mass of tokens to consider at each step.
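Example (a sketch; the project and location values are placeholders and assume Google Cloud credentials are already configured):

```python
from phoenix.evals import GeminiModel

# A sketch: construct the model with an explicit project and location.
model = GeminiModel(
    model="gemini-pro",
    project="my-gcp-project",
    location="us-central1",
)
```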
- class HallucinationEvaluator(model: BaseModel)#
Bases:
LLMEvaluator
Leverages an LLM to evaluate whether a response (stored under an “output” column) is a hallucination given a query (stored under an “input” column) and one or more retrieved documents (stored under a “reference” column).
- class LLMEvaluator(model: BaseModel, template: ClassificationTemplate)#
Bases:
object
Leverages an LLM to evaluate individual records.
- async aevaluate(record: Mapping[str, str], provide_explanation: bool = False, use_function_calling_if_available: bool = True, verbose: bool = False) Tuple[str, float | None, str | None] #
Evaluates a single record.
- Parameters:
record (Record) – The record to evaluate.
provide_explanation (bool, optional) – Whether to provide an explanation.
use_function_calling_if_available (bool, optional) – If True, use function calling (if available) as a means to constrain the LLM outputs. With function calling, the LLM is instructed to provide its response as a structured JSON object, which is easier to parse.
verbose (bool, optional) – Whether to print verbose output.
- Returns:
A tuple containing the label, the score (if scores for each label are specified by the template), and the explanation (if requested).
- Return type:
Tuple[str, Optional[float], Optional[str]]
- property default_concurrency: int#
- evaluate(record: Mapping[str, str], provide_explanation: bool = False, use_function_calling_if_available: bool = True, verbose: bool = False) Tuple[str, float | None, str | None] #
Evaluates a single record.
- Parameters:
record (Record) – The record to evaluate.
provide_explanation (bool, optional) – Whether to provide an explanation.
use_function_calling_if_available (bool, optional) – If True, use function calling (if available) as a means to constrain the LLM outputs. With function calling, the LLM is instructed to provide its response as a structured JSON object, which is easier to parse.
verbose (bool, optional) – Whether to print verbose output.
- Returns:
A tuple containing the label, the score (if scores for each label are specified by the template), and the explanation (if requested).
- Return type:
Tuple[str, Optional[float], Optional[str]]
- reload_client() None #
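Example (a sketch of evaluating a single record with a concrete evaluator; the record values are illustrative and follow the input/reference/output column convention described above):

```python
from phoenix.evals import HallucinationEvaluator, OpenAIModel

# A sketch: evaluate one record and request an explanation.
evaluator = HallucinationEvaluator(OpenAIModel(model="gpt-4"))
label, score, explanation = evaluator.evaluate(
    record={
        "input": "What is the capital of France?",
        "reference": "Paris is the capital of France.",
        "output": "The capital of France is Paris.",
    },
    provide_explanation=True,
)
```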
- class LiteLLMModel(default_concurrency: int = 20, _verbose: bool = False, _rate_limiter: phoenix.evals.models.rate_limiters.RateLimiter = <factory>, model: str = 'gpt-3.5-turbo', temperature: float = 0.0, max_tokens: int = 256, top_p: float = 1, num_retries: int = 0, request_timeout: int = 60, model_kwargs: Dict[str, Any] = <factory>, model_name: Optional[str] = None)#
Bases:
BaseModel
- max_tokens: int = 256#
The maximum number of tokens to generate in the completion.
- model: str = 'gpt-3.5-turbo'#
The model name to use.
- model_kwargs: Dict[str, Any]#
Model-specific parameters.
- model_name: str | None = None#
Deprecated since version 3.0.0.
Use model instead. This will be removed in a future release.
- num_retries: int = 0#
Maximum number of times to retry the request if a RateLimitError, OpenAIError, or ServiceUnavailableError occurs.
- request_timeout: int = 60#
Maximum number of seconds to wait when retrying.
- temperature: float = 0.0#
What sampling temperature to use.
- top_p: float = 1#
Total probability mass of tokens to consider at each step.
- class MistralAIModel(default_concurrency: int = 20, _verbose: bool = False, _rate_limiter: ~phoenix.evals.models.rate_limiters.RateLimiter = <factory>, model: str = 'mistral-large-latest', temperature: float = 0, top_p: float | None = None, random_seed: int | None = None, response_format: ~typing.Dict[str, str] | None = None, safe_mode: bool = False, safe_prompt: bool = False)#
Bases:
BaseModel
A model class for Mistral AI. Requires mistralai package to be installed.
- invocation_parameters() Dict[str, Any] #
- model: str = 'mistral-large-latest'#
- random_seed: int | None = None#
- response_format: Dict[str, str] | None = None#
- safe_mode: bool = False#
- safe_prompt: bool = False#
- temperature: float = 0#
- top_p: float | None = None#
- class OpenAIModel(default_concurrency: int = 20, _verbose: bool = False, _rate_limiter: phoenix.evals.models.rate_limiters.RateLimiter = <factory>, api_key: Optional[str] = None, organization: Optional[str] = None, base_url: Optional[str] = None, model: str = 'gpt-4', temperature: float = 0.0, max_tokens: int = 256, top_p: float = 1, frequency_penalty: float = 0, presence_penalty: float = 0, n: int = 1, model_kwargs: Dict[str, Any] = <factory>, batch_size: int = 20, request_timeout: Union[float, Tuple[float, float], NoneType] = None, api_version: Optional[str] = None, azure_endpoint: Optional[str] = None, azure_deployment: Optional[str] = None, azure_ad_token: Optional[str] = None, azure_ad_token_provider: Optional[Callable[[], str]] = None, default_headers: Optional[Mapping[str, str]] = None, model_name: Optional[str] = None)#
Bases:
BaseModel
- api_key: str | None = None#
Your OpenAI key. If not provided, it will be read from the OPENAI_API_KEY environment variable.
- api_version: str | None = None#
The API version to use for Azure OpenAI. See https://learn.microsoft.com/en-us/azure/ai-services/openai/reference#rest-api-versioning
- azure_ad_token: str | None = None#
- azure_ad_token_provider: Callable[[], str] | None = None#
- azure_deployment: str | None = None#
- azure_endpoint: str | None = None#
The endpoint to use for Azure OpenAI. Available in the Azure portal. https://learn.microsoft.com/en-us/azure/cognitive-services/openai/how-to/create-resource?pivots=web-portal#create-a-resource
- base_url: str | None = None#
An optional base URL to use for the OpenAI API. If not provided, it will default to what’s configured in the OpenAI client.
- batch_size: int = 20#
Batch size to use when passing multiple documents to generate.
- default_headers: Mapping[str, str] | None = None#
Default headers required by AzureOpenAI
- frequency_penalty: float = 0#
Penalizes repeated tokens according to frequency.
- property invocation_params: Dict[str, Any]#
- max_tokens: int = 256#
The maximum number of tokens to generate in the completion. -1 returns as many tokens as possible given the prompt and the model’s maximal context size.
- model: str = 'gpt-4'#
Model name to use. In the case of Azure, this is the deployment name, such as gpt-35-instant.
- model_kwargs: Dict[str, Any]#
Holds any model parameters valid for create call not explicitly specified.
- model_name: str | None = None#
Deprecated since version 3.0.0.
Use model instead. This will be removed in a future release.
- n: int = 1#
How many completions to generate for each prompt.
- organization: str | None = None#
The organization to use for the OpenAI API. If not provided, it will default to what’s configured in the OpenAI client.
- presence_penalty: float = 0#
Penalizes repeated tokens.
- property public_invocation_params: Dict[str, Any]#
- reload_client() None #
- request_timeout: float | Tuple[float, float] | None = None#
Timeout for requests to OpenAI completion API. Default is 600 seconds.
- property supports_function_calling: bool#
- temperature: float = 0.0#
What sampling temperature to use.
- top_p: float = 1#
Total probability mass of tokens to consider at each step.
- verbose_generation_info() str #
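Example (a sketch of an Azure OpenAI configuration; the endpoint, deployment, and API version values are placeholders for your own Azure resource):

```python
from phoenix.evals import OpenAIModel

# A sketch: configure the model against an Azure OpenAI deployment.
model = OpenAIModel(
    model="gpt-4",
    azure_endpoint="https://my-resource.openai.azure.com/",
    azure_deployment="my-gpt-4-deployment",
    api_version="2023-05-15",
)
```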
- class PromptTemplate(template: str, delimiters: Tuple[str, str] = ('{', '}'))#
Bases:
object
- format(variable_values: Mapping[str, bool | int | float | str], options: PromptOptions | None = None) str #
- prompt(options: PromptOptions | None = None) str #
- template: str#
- variables: List[str]#
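Example (a sketch of template rendering; the template text and variable value are illustrative):

```python
from phoenix.evals import PromptTemplate

# A sketch: variables in curly braces are substituted by format().
template = PromptTemplate("Summarize the following document: {document}")
rendered = template.format({"document": "Phoenix evals score LLM outputs."})
```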
- class QAEvaluator(model: BaseModel)#
Bases:
LLMEvaluator
Leverages an LLM to evaluate whether a response (stored under an “output” column) is correct or incorrect given a query (stored under an “input” column) and one or more retrieved documents (stored under a “reference” column).
- class RelevanceEvaluator(model: BaseModel)#
Bases:
LLMEvaluator
Leverages an LLM to evaluate whether a retrieved document (stored under a “reference” column) is relevant or irrelevant to the corresponding query (stored under the “input” column).
- class SQLEvaluator(model: BaseModel)#
Bases:
LLMEvaluator
Leverages an LLM to evaluate whether a generated SQL query (stored under the “query_gen” column) and a response (stored under the “response” column) appropriately answer a question (stored under the “question” column).
- class SummarizationEvaluator(model: BaseModel)#
Bases:
LLMEvaluator
Leverages an LLM to evaluate whether a summary (stored under an “output” column) provides an accurate synopsis of an input document (stored under an “input” column).
- class ToxicityEvaluator(model: BaseModel)#
Bases:
LLMEvaluator
Leverages an LLM to evaluate whether the string stored under the “input” column contains racist, sexist, chauvinistic, biased, or otherwise toxic content.
- class VertexAIModel(default_concurrency: int = 20, _verbose: bool = False, _rate_limiter: phoenix.evals.models.rate_limiters.RateLimiter = <factory>, project: Optional[str] = None, location: Optional[str] = None, credentials: Optional[ForwardRef('Credentials')] = None, model: str = 'text-bison', tuned_model: Optional[str] = None, temperature: float = 0.0, max_tokens: int = 256, top_p: float = 0.95, top_k: int = 40, model_name: Optional[str] = None, tuned_model_name: Optional[str] = None)#
Bases:
BaseModel
- credentials: Credentials | None = None#
- property invocation_params: Dict[str, Any]#
- property is_codey_model: bool#
- location: str | None = None#
The default location to use when making API calls. If not set, a default location is used.
- max_tokens: int = 256#
The maximum number of tokens to generate in the completion. -1 returns as many tokens as possible given the prompt and the model’s maximal context size.
- model: str = 'text-bison'#
- model_name: str | None = None#
Deprecated since version 3.0.0.
Use model instead. This will be removed in a future release.
- project: str | None = None#
The default project to use when making API calls.
- temperature: float = 0.0#
What sampling temperature to use.
- top_k: int = 40#
How the model selects tokens for output: the next token is selected from among the top_k most probable tokens.
- top_p: float = 0.95#
Tokens are selected from most probable to least until the sum of their probabilities equals the top_p value.
- tuned_model: str | None = None#
The name of a tuned model. If provided, model is ignored.
- tuned_model_name: str | None = None#
Deprecated since version 3.0.0.
Use tuned_model instead. This will be removed in a future release.
- verbose_generation_info() str #
- compute_precisions_at_k(relevance_classifications: List[bool | None]) List[float | None] #
Given a list of relevance classifications, computes precision@k for k = 1, 2, …, n, where n is the length of the input list.
- Parameters:
relevance_classifications (List[Optional[bool]]) – A list of relevance classifications for a set of retrieved documents, sorted by order of retrieval (i.e., the first element is the classification for the first retrieved document, the second element is the classification for the second retrieved document, etc.). The list may contain None values, which indicate that the relevance classification for the corresponding document is unknown.
- Returns:
- A list of precision@k values for k = 1, 2, …, n, where n is the
length of the input list. The first element is the precision@1 value, the second element is the precision@2 value, etc. If the input list contains any None values, those values are omitted when computing the precision@k values.
- Return type:
List[Optional[float]]
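Example (a sketch with relevance labels for three retrieved documents, in retrieval order, including an unknown classification):

```python
from phoenix.evals import compute_precisions_at_k

# A sketch: the second document's relevance is unknown (None).
precisions = compute_precisions_at_k([True, None, False])
# precisions[k - 1] holds precision@k; None entries are omitted from the
# computation as described above.
```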
- download_benchmark_dataset(task: str, dataset_name: str) DataFrame #
Downloads an Arize evals benchmark dataset as a pandas dataframe.
- Parameters:
task (str) – Task to be performed.
dataset_name (str) – Name of the dataset.
- Returns:
A pandas dataframe containing the data.
- Return type:
pandas.DataFrame
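Example (a sketch; the task and dataset_name values are illustrative and must match an existing Arize benchmark dataset):

```python
from phoenix.evals import download_benchmark_dataset

# A sketch: download a benchmark dataset as a pandas dataframe.
df = download_benchmark_dataset(
    task="binary-hallucination-classification",
    dataset_name="halueval_qa_data",
)
```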
- llm_classify(dataframe: DataFrame, model: BaseModel, template: ClassificationTemplate | PromptTemplate | str, rails: List[str], system_instruction: str | None = None, verbose: bool = False, use_function_calling_if_available: bool = True, provide_explanation: bool = False, include_prompt: bool = False, include_response: bool = False, include_exceptions: bool = False, max_retries: int = 10, exit_on_error: bool = True, run_sync: bool = False, concurrency: int | None = None) DataFrame #
Classifies each input row of the dataframe using an LLM. Returns a pandas.DataFrame where the first column is named label and contains the classification labels. An optional column named explanation is added when provide_explanation=True.
- Parameters:
dataframe (pandas.DataFrame) – A pandas dataframe in which each row represents a record to be classified. All template variable names must appear as column names in the dataframe (extra columns unrelated to the template are permitted).
template (Union[ClassificationTemplate, PromptTemplate, str]) – The prompt template as either an instance of PromptTemplate, ClassificationTemplate, or a string. If a string, the variable names should be surrounded by curly braces so that a call to .format can be made to substitute variable values.
model (BaseEvalModel) – An LLM model class.
rails (List[str]) – A list of strings representing the possible output classes of the model’s predictions.
system_instruction (Optional[str], optional) – An optional system message.
verbose (bool, optional) – If True, prints detailed info to stdout such as model invocation parameters and details about retries and snapping to rails. Default False.
use_function_calling_if_available (bool, default=True) – If True, use function calling (if available) as a means to constrain the LLM outputs. With function calling, the LLM is instructed to provide its response as a structured JSON object, which is easier to parse.
provide_explanation (bool, default=False) – If True, provides an explanation for each classification label. A column named explanation is added to the output dataframe.
include_prompt (bool, default=False) – If True, includes a column named prompt in the output dataframe containing the prompt used for each classification.
include_response (bool, default=False) – If True, includes a column named response in the output dataframe containing the raw response from the LLM.
max_retries (int, optional) – The maximum number of times to retry on exceptions. Defaults to 10.
exit_on_error (bool, default=True) – If True, stops processing evals after all retries are exhausted on a single eval attempt. If False, all evals are attempted before returning, even if some fail.
run_sync (bool, default=False) – If True, forces synchronous request submission. Otherwise evaluations will be run asynchronously if possible.
concurrency (Optional[int], default=None) – The number of concurrent evals if async submission is possible. If not provided, a recommended default concurrency is set on a per-model basis.
- Returns:
A dataframe where the label column (at column position 0) contains the classification labels. If provide_explanation=True, then an additional column named explanation is added to contain the explanation for each label. The dataframe has the same length and index as the input dataframe. The classification label values are from the entries in the rails argument or “NOT_PARSABLE” if the model’s output could not be parsed. The output dataframe also includes three additional columns, exceptions, execution_status, and execution_seconds, containing details about execution errors that may have occurred during the classification as well as the total runtime of each classification (in seconds).
- Return type:
pandas.DataFrame
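Example (a sketch of a custom classification; the template and rails are illustrative, not one of the built-in Phoenix templates):

```python
import pandas as pd

from phoenix.evals import OpenAIModel, llm_classify

# A sketch: classify each row of the dataframe against two rails.
df = pd.DataFrame({"input": ["You are wonderful.", "I hate all of this."]})
results = llm_classify(
    dataframe=df,
    model=OpenAIModel(model="gpt-4"),
    template=(
        "Classify the following text as toxic or non-toxic. "
        "Respond with a single word.\n\nText: {input}"
    ),
    rails=["toxic", "non-toxic"],
    provide_explanation=True,
)
```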
- llm_generate(dataframe: DataFrame, template: PromptTemplate | str, model: BaseModel, system_instruction: str | None = None, verbose: bool = False, output_parser: Callable[[str, int], Dict[str, Any]] | None = None, include_prompt: bool = False, include_response: bool = False, run_sync: bool = False, concurrency: int | None = None) DataFrame #
Generates text from a template using an LLM. This function is useful if you want to generate synthetic data, such as irrelevant responses.
- Parameters:
dataframe (pandas.DataFrame) – A pandas dataframe in which each row represents a record to be used as an input to the template. All template variable names must appear as column names in the dataframe (extra columns unrelated to the template are permitted).
template (Union[PromptTemplate, str]) – The prompt template as either an instance of PromptTemplate or a string. If the latter, the variable names should be surrounded by curly braces so that a call to .format can be made to substitute variable values.
model (BaseEvalModel) – An LLM model class.
system_instruction (Optional[str], optional) – An optional system message.
verbose (bool, optional) – If True, prints detailed information to stdout such as model invocation parameters and retry info. Default False.
output_parser (Callable[[str, int], Dict[str, Any]], optional) – An optional function that takes each generated response and response index and parses it to a dictionary. The keys of the dictionary should correspond to the column names of the output dataframe. If None, the output dataframe will have a single column named “output”. Default None.
include_prompt (bool, default=False) – If True, includes a column named prompt in the output dataframe containing the prompt used for each generation.
include_response (bool, default=False) – If True, includes a column named response in the output dataframe containing the raw response from the LLM prior to applying the output parser.
run_sync (bool, default=False) – If True, forces synchronous request submission. Otherwise evaluations will be run asynchronously if possible.
concurrency (Optional[int], default=None) – The number of concurrent evals if async submission is possible. If not provided, a recommended default concurrency is set on a per-model basis.
- Returns:
A dataframe where each row represents the generated output
- Return type:
generations_dataframe (pandas.DataFrame)
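Example (a sketch of synthetic data generation with an output parser; the template and parser are illustrative):

```python
import json

import pandas as pd

from phoenix.evals import OpenAIModel, llm_generate

# A sketch: generate one synthetic question per row of the dataframe.
df = pd.DataFrame({"topic": ["billing", "shipping"]})

def output_parser(response: str, response_index: int) -> dict:
    # Assumes the model was instructed to reply with a JSON object.
    try:
        return json.loads(response)
    except json.JSONDecodeError as exc:
        return {"parse_error": str(exc)}

generated = llm_generate(
    dataframe=df,
    template=(
        "Write a question a customer might ask about {topic}. "
        'Reply as a JSON object with a "question" key.'
    ),
    model=OpenAIModel(model="gpt-4"),
    output_parser=output_parser,
)
```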
- run_evals(dataframe: DataFrame, evaluators: List[LLMEvaluator], provide_explanation: bool = False, use_function_calling_if_available: bool = True, verbose: bool = False, concurrency: int | None = None) List[DataFrame] #
Applies a list of evaluators to a dataframe. Outputs a list of dataframes in which each dataframe contains the outputs of the corresponding evaluator applied to the input dataframe.
- Parameters:
dataframe (DataFrame) – A pandas dataframe in which each row represents a record to be evaluated. All template variable names must appear as column names in the dataframe (extra columns unrelated to the template are permitted).
evaluators (List[LLMEvaluator]) – A list of evaluators.
provide_explanation (bool, optional) – If True, provides an explanation for each evaluation. A column named “explanation” is added to each output dataframe.
use_function_calling_if_available (bool, optional) – If True, use function calling (if available) as a means to constrain the LLM outputs. With function calling, the LLM is instructed to provide its response as a structured JSON object, which is easier to parse.
verbose (bool, optional) – If True, prints detailed info to stdout such as model invocation parameters and details about retries and snapping to rails.
concurrency (Optional[int], default=None) – The number of concurrent evals if async submission is possible. If not provided, a recommended default concurrency is set on a per-model basis.
- Returns:
A list of dataframes, one for each evaluator, all of which have the same number of rows as the input dataframe.
- Return type:
List[DataFrame]
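Example (a sketch; the dataframe follows the input/reference/output column conventions used by these evaluators, and the row values are illustrative):

```python
import pandas as pd

from phoenix.evals import (
    HallucinationEvaluator,
    OpenAIModel,
    QAEvaluator,
    run_evals,
)

# A sketch: run two evaluators over the same dataframe, one output
# dataframe per evaluator.
df = pd.DataFrame(
    {
        "input": ["What is the capital of France?"],
        "reference": ["Paris is the capital of France."],
        "output": ["The capital of France is Paris."],
    }
)
model = OpenAIModel(model="gpt-4")
hallucination_df, qa_df = run_evals(
    dataframe=df,
    evaluators=[HallucinationEvaluator(model), QAEvaluator(model)],
    provide_explanation=True,
)
```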
- evals.models
AnthropicModel
BaseModel
BedrockModel
GeminiModel
LiteLLMModel
MistralAIModel
OpenAIModel
OpenAIModel.api_key
OpenAIModel.api_version
OpenAIModel.azure_ad_token
OpenAIModel.azure_ad_token_provider
OpenAIModel.azure_deployment
OpenAIModel.azure_endpoint
OpenAIModel.base_url
OpenAIModel.batch_size
OpenAIModel.default_headers
OpenAIModel.frequency_penalty
OpenAIModel.invocation_params
OpenAIModel.max_tokens
OpenAIModel.model
OpenAIModel.model_kwargs
OpenAIModel.model_name
OpenAIModel.n
OpenAIModel.organization
OpenAIModel.presence_penalty
OpenAIModel.public_invocation_params
OpenAIModel.reload_client()
OpenAIModel.request_timeout
OpenAIModel.supports_function_calling
OpenAIModel.temperature
OpenAIModel.top_p
OpenAIModel.verbose_generation_info()
VertexAIModel
VertexAIModel.credentials
VertexAIModel.invocation_params
VertexAIModel.is_codey_model
VertexAIModel.location
VertexAIModel.max_tokens
VertexAIModel.model
VertexAIModel.model_name
VertexAIModel.project
VertexAIModel.temperature
VertexAIModel.top_k
VertexAIModel.top_p
VertexAIModel.tuned_model
VertexAIModel.tuned_model_name
VertexAIModel.verbose_generation_info()
set_verbosity()
- evals.models.anthropic
- evals.models.base
- evals.models.bedrock
- evals.models.litellm
- evals.models.mistralai
- evals.models.openai
AzureOptions
OpenAIModel
OpenAIModel.api_key
OpenAIModel.api_version
OpenAIModel.azure_ad_token
OpenAIModel.azure_ad_token_provider
OpenAIModel.azure_deployment
OpenAIModel.azure_endpoint
OpenAIModel.base_url
OpenAIModel.batch_size
OpenAIModel.default_headers
OpenAIModel.frequency_penalty
OpenAIModel.invocation_params
OpenAIModel.max_tokens
OpenAIModel.model
OpenAIModel.model_kwargs
OpenAIModel.model_name
OpenAIModel.n
OpenAIModel.organization
OpenAIModel.presence_penalty
OpenAIModel.public_invocation_params
OpenAIModel.reload_client()
OpenAIModel.request_timeout
OpenAIModel.supports_function_calling
OpenAIModel.temperature
OpenAIModel.top_p
OpenAIModel.verbose_generation_info()
- evals.models.rate_limiters
- evals.models.vertex
- evals.models.vertexai
VertexAIModel
VertexAIModel.credentials
VertexAIModel.invocation_params
VertexAIModel.is_codey_model
VertexAIModel.location
VertexAIModel.max_tokens
VertexAIModel.model
VertexAIModel.model_name
VertexAIModel.project
VertexAIModel.temperature
VertexAIModel.top_k
VertexAIModel.top_p
VertexAIModel.tuned_model
VertexAIModel.tuned_model_name
VertexAIModel.verbose_generation_info()
is_codey_model()
- evals.classify
- evals.default_templates
EvalCriteria
EvalCriteria.CODE_FUNCTIONALITY
EvalCriteria.CODE_READABILITY
EvalCriteria.HALLUCINATION
EvalCriteria.HALLUCINATION_SPAN_LEVEL
EvalCriteria.HUMAN_VS_AI
EvalCriteria.QA
EvalCriteria.QA_SPAN_LEVEL
EvalCriteria.REFERENCE_LINK_CORRECTNESS
EvalCriteria.RELEVANCE
EvalCriteria.SQL_GEN_EVAL
EvalCriteria.SUMMARIZATION
EvalCriteria.TOXICITY
EvalCriteria.USER_FRUSTRATION
- evals.evaluators
- evals.exceptions
- evals.executors
- evals.generate
- evals.retrievals
- evals.span_templates
- evals.templates
- evals.utils