
Evaluation SDK

Install the lastmile-eval library from PyPI to get started.

Metrics Library

get_traces

def get_trace(
trace_id: str,
lastmile_api_token: Optional[str] = None,
) -> core.JSONObject:
"""
Download an individual trace.
def get_traces(
project_id: Optional[str],
trace_ids: Optional[str | List[str]] = None,
take: Optional[int] = None,
search_filter: Optional[str] = None,
start_time: Optional[str] = None,
end_time: Optional[str] = None,
creator_id: Optional[str] = None,
organization_id: Optional[str] = None,
lastmile_api_token: Optional[str] = None,
) -> Generator[pd.DataFrame, None, None]:
"""
Download traces as a DataFrame.

Args:
trace_ids: Optionally filter by one or more trace IDs.
take: Number of traces to download per request. The maximum is 50.
search_filter: A substring search to match any property in the trace metadata.
start_time: Start timestamp to filter traces >= start_time.
end_time: End timestamp to filter traces <= end_time.
creator_id: Filter for traces logged by a particular user account.
organization_id: Filter for traces logged under a particular organization.
project_id: The id of the project the traces were logged to.
If not provided, will read from the LASTMILE_PROJECT_ID environment variable.
lastmile_api_token: The API token for the LastMile API. If not provided,
will try to get the token from the LASTMILE_API_TOKEN
environment variable.
You can create a token from the "API Tokens" section from this website:
https://lastmileai.dev/settings?page=tokens

Returns:
A DataFrame containing the trace data.
"""

Current API:

def download_input_traces(
project_id: Optional[str],
trace_id: Optional[str] = None,
batch_limit: Optional[int] = None,
substring_filter: Optional[str] = None,
creator_id: Optional[str] = None,
organization_id: Optional[str] = None,
start_time: Optional[str] = None,
end_time: Optional[str] = None,
lastmile_api_token: Optional[str] = None,
) -> Generator[pd.DataFrame, None, None]:

get_events

def get_event(
event_id: str,
lastmile_api_token: Optional[str] = None,
) -> core.JSONObject:
"""
Download an individual RAGEvent.
def get_events(
project_id: Optional[str],
event_ids: Optional[str | List[str]] = None,
take: Optional[int] = None,
event_kind: Optional[str] = None,
search_filter: Optional[str] = None,
start_time: Optional[str] = None,
end_time: Optional[str] = None,
creator_id: Optional[str] = None,
organization_id: Optional[str] = None,
lastmile_api_token: Optional[str] = None,
) -> Generator[pd.DataFrame, None, None]:
"""
Download RAGEvents as a DataFrame.

Args:
event_ids: Optionally filter by one or more event IDs.
take: Number of events to download per request. The maximum is 50.
event_kind: The kind of the event to filter by. <TODO: add link to valid event kinds, which are implemented in add_rag_event_interface.py>.
search_filter: A substring search to match any property in the event metadata.
start_time: Start timestamp to filter events >= start_time.
end_time: End timestamp to filter events <= end_time.
creator_id: Filter for events logged by a particular user account.
organization_id: Filter for events logged under a particular organization.
project_id: The id of the project the events were logged to.
If not provided, will read from the LASTMILE_PROJECT_ID environment variable.
lastmile_api_token: The API token for the LastMile API. If not provided,
will try to get the token from the LASTMILE_API_TOKEN
environment variable.
You can create a token from the "API Tokens" section from this website:
https://lastmileai.dev/settings?page=tokens

Returns:
A DataFrame containing the RAG event data.
"""

Current API:

def download_rag_events(
project_id: Optional[str],
batch_limit: Optional[int] = None,
substring_filter: Optional[str] = None,
creator_id: Optional[str] = None,
organization_id: Optional[str] = None,
start_time: Optional[str] = None,
end_time: Optional[str] = None,
event_name: Optional[str] = None,
lastmile_api_token: Optional[str] = None,
) -> Generator[pd.DataFrame, None, None]:

Test Dataset APIs

def create_test_dataset(
name: str,
data: pd.DataFrame,
tags: Optional[List[str]] = None,
project_id: Optional[str] = None,
lastmile_api_token: Optional[str] = None,
) -> core.TestSetID:
"""
Create a Test Set from the given data.

name: Name to save the Test Set as.
data: A DataFrame that should contain up to three columns: 'input', 'output', 'groundTruth'. The input column is *required* for every TestSet.
tags: Optional tags to categorize the TestSet. These can be used for filtering.
project_id: The id of the project to save TestSet in.
If not provided, will read from the LASTMILE_PROJECT_ID environment variable.
lastmile_api_token: The API token for the LastMile API. If not provided,
will try to get the token from the LASTMILE_API_TOKEN
environment variable.
You can create a token from the "API Tokens" section from this website:
https://lastmileai.dev/settings?page=tokens
"""
def create_test_from_trace_data(
name: str,
trace_ids: Optional[str | List[str]] = None,
take: Optional[int] = None,
sampling_rate: Optional[float] = None,
search_filter: Optional[str] = None,
start_time: Optional[str] = None,
end_time: Optional[str] = None,
tags: Optional[List[str]] = None,
project_id: Optional[str] = None,
lastmile_api_token: Optional[str] = None,
) -> core.TestSetID:
"""
Create a Test Set from trace data matching the given filters.

name: Name to save the Test Set as.
trace_ids: Optionally filter by one or more trace IDs.
take: Number of traces to download per request. The maximum is 50.
sampling_rate: Sampling rate (between 0 and 1) to take a random sampling of the traces that match the criteria.
If unspecified, the first `take` traces matching the criteria will be returned.
search_filter: A substring search to match any property in the trace metadata.
start_time: Start timestamp to filter traces >= start_time.
end_time: End timestamp to filter traces <= end_time.
tags: Optional tags to categorize the TestSet. These can be used for filtering.
project_id: The id of the project to save the TestSet in.
If not provided, will read from the LASTMILE_PROJECT_ID environment variable.
lastmile_api_token: The API token for the LastMile API. If not provided,
will try to get the token from the LASTMILE_API_TOKEN
environment variable.
You can create a token from the "API Tokens" section from this website:
https://lastmileai.dev/settings?page=tokens

"""
def create_test_from_event_data(
name: str,
event_kind: str,
event_ids: Optional[str | List[str]] = None,
take: Optional[int] = None,
sampling_rate: Optional[float] = None,
search_filter: Optional[str] = None,
start_time: Optional[str] = None,
end_time: Optional[str] = None,
tags: Optional[List[str]] = None,
project_id: Optional[str] = None,
lastmile_api_token: Optional[str] = None,
) -> core.TestSetID:
"""
Create a Test Set from event data matching the given filters.

name: Name to save the Test Set as.
event_kind: The kind of the event to filter by.
<TODO: add link to valid event kinds, which are implemented in add_rag_event_interface.py>.
event_ids: Optionally filter by one or more event IDs.
take: Number of events to download per request. The maximum is 50.
sampling_rate: Sampling rate (between 0 and 1) to take a random sampling of the events that match the criteria.
If unspecified, the first `take` events matching the criteria will be returned.
search_filter: A substring search to match any property in the event metadata.
start_time: Start timestamp to filter events >= start_time.
end_time: End timestamp to filter events <= end_time.
tags: Optional tags to categorize the TestSet. These can be used for filtering.
project_id: The id of the project to save the TestSet in.
If not provided, will read from the LASTMILE_PROJECT_ID environment variable.
lastmile_api_token: The API token for the LastMile API. If not provided,
will try to get the token from the LASTMILE_API_TOKEN
environment variable.
You can create a token from the "API Tokens" section from this website:
https://lastmileai.dev/settings?page=tokens
"""
def add_test_case(
test_set_id: str,
data: pd.DataFrame,
lastmile_api_token: Optional[str] = None,
) -> core.TestSetID:
"""
Update a Test Set with additional test cases from the given data.

test_set_id: ID of the Test Set to add test cases to.
data: A DataFrame that should contain up to three columns: 'input', 'output', 'groundTruth'. The input column is *required* for every TestSet.
lastmile_api_token: The API token for the LastMile API. If not provided,
will try to get the token from the LASTMILE_API_TOKEN
environment variable.
You can create a token from the "API Tokens" section from this website:
https://lastmileai.dev/settings?page=tokens
"""
def get_test_dataset(
test_set_id: Optional[str],
test_set_name: Optional[str],
project_id: Optional[str],
lastmile_api_token: Optional[str] = None,
) -> pd.DataFrame:
"""
Download the Test Set data, containing 'input' and, optionally, 'output' and 'groundTruth' columns.

test_set_id: ID of the Test Set to download.
test_set_name: Name of the Test Set to download.
If `test_set_id` is unspecified, then `test_set_name` and `project_id` must be provided.
project_id: The id of the project the TestSet is in.
If not provided, will read from the LASTMILE_PROJECT_ID environment variable.
lastmile_api_token: The API token for the LastMile API. If not provided,
will try to get the token from the LASTMILE_API_TOKEN
environment variable.
You can create a token from the "API Tokens" section from this website:
https://lastmileai.dev/settings?page=tokens
"""
def delete_test_dataset(
id: str,
lastmile_api_token: Optional[str] = None,
) -> bool:
"""
Delete a Test Set with the given ID.

id: ID of the Test Set to delete.
lastmile_api_token: The API token for the LastMile API. If not provided,
will try to get the token from the LASTMILE_API_TOKEN
environment variable.
You can create a token from the "API Tokens" section from this website:
https://lastmileai.dev/settings?page=tokens

"""

Current API:

def create_input_set(
queries: Sequence[str] | pd.DataFrame,
input_set_name: Optional[str] = None,
ground_truths: Optional[list[str]] = None,
lastmile_api_token: Optional[str] = None,
) -> evaluation_lib.CreateInputSetResponse:
"""
Create an Input set from a list of strings.

lastmile_api_token: The API token for the LastMile API. If not provided,
will try to get the token from the LASTMILE_API_TOKEN
environment variable.
You can create a token from the "API Tokens" section from this website:
https://lastmileai.dev/settings?page=tokens

"""

def download_input_set(
input_set_id: Optional[str] = None,
input_set_name: Optional[str] = None,
lastmile_api_token: Optional[str] = None,
) -> pd.DataFrame:

def create_example_set(
df: pd.DataFrame,
example_set_name: Optional[str],
ground_truths: Optional[list[str]] = None,
lastmile_api_token: Optional[str] = None,
) -> evaluation_lib.CreateExampleSetResponse:

def download_example_set(
example_set_id: Optional[str] = None,
example_set_name: Optional[str] = None,
lastmile_api_token: Optional[str] = None,
) -> pd.DataFrame:

Evaluation APIs


# Types

# Given a DataFrame, computes metrics for each row
Evaluator = Callable[
[pd.DataFrame],
list[float],
]

# Given a DataFrame, computes aggregate metrics across the entire dataset.
# Meant for things like computing precision@recall
AggregatedEvaluator = Callable[
[pd.DataFrame],
float
]

# Given a list of metrics, aggregates them into a single metric (e.g. mean)
Aggregator = Optional[Callable[[List[float]], float]]
class EvaluatorTuple(NamedTuple):
evaluator: Evaluator
aggregator: Aggregator

@dataclass(frozen=True)
class SaveOptions:
# Save the results in the DB. If false, results will just be returned but not persisted anywhere.
save: bool = True
# Saves the evaluation result, as well as the Test Set (if it is provided as a DataFrame), using this name
name: Optional[str] = None

@dataclass
class EvaluationResult:
evaluation_result_id: Optional[core.EvaluationResultId]
test_dataset_id: Optional[core.ExampleSetID]
evaluator_metrics_df: pd.DataFrame
aggregated_metrics: Optional[Mapping[str, float]]
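
To make these types concrete, here is a sketch of a per-row Evaluator paired with an Aggregator, plus an AggregatedEvaluator, matching the type aliases above. The 'output' and 'groundTruth' column names follow the Test Set conventions in this reference; EvaluatorTuple is assumed to be importable from the SDK.

from statistics import mean

import pandas as pd

def exact_match_evaluator(df: pd.DataFrame) -> list[float]:
    # Per-row metric: 1.0 if the output matches the ground truth exactly, else 0.0.
    return [
        1.0 if out == gt else 0.0
        for out, gt in zip(df["output"], df["groundTruth"])
    ]

def mean_aggregator(scores: list[float]) -> float:
    # Aggregator: reduce the per-row metrics to a single value.
    return mean(scores)

def output_length_spread(df: pd.DataFrame) -> float:
    # AggregatedEvaluator: a single value computed over the entire dataset.
    lengths = df["output"].str.len()
    return float(lengths.max() - lengths.min())

exact_match = EvaluatorTuple(
    evaluator=exact_match_evaluator,
    aggregator=mean_aggregator,
)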

# Functions

def evaluate(
test_dataset_id: Optional[str] = None,
test_dataset: Optional[pd.DataFrame] = None,
evaluators:
Mapping[
str, # Name of the evaluation metric
Evaluator | EvaluatorTuple, # Tuple of Evaluator function and (optionally) Aggregator function
]
| set[str],
aggregated_evaluators: Optional[
Mapping[
str,
AggregatedEvaluator,
]
] = None,
save_options: Optional[SaveOptions] = None,
project_id: Optional[str] = None,
lastmile_api_token: Optional[str] = None,
) -> EvaluationResult:
"""
Run evaluations on the provided data using chosen evaluation functions.

test_dataset_id: (Must be specified if `test_dataset` isn't provided)
The id of the Test Set to run the evaluations on.
test_dataset: (If `test_dataset_id` isn't specified)
A DataFrame that should contain 'input', 'output' and optionally 'groundTruth' columns to run the evaluations on.
NOTE: Some evaluators may need additional columns, such as 'context' (for hallucination eval, for example).

evaluators: A mapping of evaluator names to evaluator functions.
Each evaluator takes a DataFrame and runs the evaluators per row.
Optionally, you can also specify an Aggregator function to aggregate the evaluation metrics into a single value (e.g. mean, median, and so forth).

Aggregated evaluators allow you to do custom aggregations over all the DataFrame rows (for example, some specific recall@precision).

aggregated_evaluators: Like evaluators, but these functions take a DataFrame and produce a single value that aggregates over the entire dataset.
NOTE: This should only be used for evaluator metrics that can only be computed in aggregate, like recall@precision.

save_options: Controls options for storing the evaluation result in the DB.
If a test_dataset is specified, by default that will also be saved as a Test Set.
project_id: The id of the project the evaluation result is saved in.
If not provided, will read from the LASTMILE_PROJECT_ID environment variable.
lastmile_api_token: The API token for the LastMile API. If not provided,
will try to get the token from the LASTMILE_API_TOKEN
environment variable.
You can create a token from the "API Tokens" section from this website:
https://lastmileai.dev/settings?page=tokens

"""

def run_and_evaluate(
run_fn: Callable[[str], str],
test_dataset_id: Optional[str] = None,
test_dataset: Optional[pd.DataFrame] = None,
evaluators:
Mapping[
str, # Name of the evaluation metric
Evaluator | EvaluatorTuple, # Tuple of Evaluator function and (optionally) Aggregator function
]
| set[str],
aggregated_evaluators: Optional[
Mapping[
str,
AggregatedEvaluator,
]
] = None,
save_options: Optional[SaveOptions] = None,
n_trials: int = 1,
project_id: Optional[str] = None,
lastmile_api_token: Optional[str] = None,
) -> EvaluationResult:
"""
Similar to evaluate, except this function runs the flow on the inputs defined in `test_dataset`
and evaluates the result using the chosen evaluation functions.

By default, each run saves a new Test Set in the DB that can be used to run future evaluations.

test_dataset_id: (Must be specified if `test_dataset` isn't provided)
The id of the Test Set to run the evaluations on.
NOTE: Even if the test cases contain outputs already, they will be re-run using the `run_fn` to generate new outputs.
test_dataset: (If `test_dataset_id` isn't specified)
A DataFrame that must contain an 'input' column and, optionally, a 'groundTruth' column to run the evaluations on.
NOTE: Even if the DataFrame already contains an 'output' column, the inputs will be re-run using `run_fn` to generate new outputs.
NOTE: Some evaluators may need additional columns, such as 'context' (for hallucination eval, for example).

evaluators: A mapping of evaluator names to evaluator functions.
Each evaluator takes a DataFrame and runs the evaluators per row.
Optionally, you can also specify an Aggregator function to aggregate the evaluation metrics into a single value (e.g. mean, median, and so forth).

Aggregated evaluators allow you to do custom aggregations over all the DataFrame rows (for example, some specific recall@precision).
aggregated_evaluators: Like evaluators, but these functions take a DataFrame and produce a single value that aggregates over the entire dataset.
NOTE: This should only be used for evaluator metrics that can only be computed in aggregate, like recall@precision.

n_trials: Run each input N times. Concretely, this means each evaluator will be run on N * size(dataset) rows.
save_options: Controls options for storing the evaluation result in the DB.
If a test_dataset is specified, by default that will also be saved as a Test Set.
project_id: The id of the project the evaluation result is saved in.
If not provided, will read from the LASTMILE_PROJECT_ID environment variable.
lastmile_api_token: The API token for the LastMile API. If not provided,
will try to get the token from the LASTMILE_API_TOKEN
environment variable.
You can create a token from the "API Tokens" section from this website:
https://lastmileai.dev/settings?page=tokens

"""

def run(
run_fn: Callable[[str], str],
inputs: Sequence[str] | pd.DataFrame,
project_id: Optional[str] = None,
lastmile_api_token: Optional[str] = None,
) -> pd.DataFrame:
"""
Runs the input data using the run_fn, and returns the results in an 'output' column in a DataFrame.
Importantly, this function wraps the run in a trace, so it can be tracked and evaluated easily.

run_fn: The callable to invoke the execution flow.
inputs: A DataFrame with an 'input' column, or a list of strings.
project_id: The id of the project the evaluation result is saved in.
If not provided, will read from the LASTMILE_PROJECT_ID environment variable.
lastmile_api_token: The API token for the LastMile API. If not provided,
will try to get the token from the LASTMILE_API_TOKEN
environment variable.
You can create a token from the "API Tokens" section from this website:
https://lastmileai.dev/settings?page=tokens
"""

Current API:

def run_query_function(
run_query_fn: Callable[[str], str],
inputs: Sequence[str] | pd.DataFrame,
project_name: Optional[str] = None,
) -> list[str]:

def evaluate(
project_id: Optional[str],
example_set_id: Optional[str] = None,
examples_dataframe: Optional[pd.DataFrame] = None,
evaluators: Optional[
Mapping[
str,
Evaluator,
]
| set[str]
] = None,
aggregated_evaluators: Optional[
Mapping[
str,
AggregatedEvaluator,
]
] = None,
save_options: Optional[SaveOptions] = None,
lastmile_api_token: Optional[str] = None,
) -> evaluation_lib.CreateEvaluationResponse:
"""
*Description*

Run evaluations on RAG query Examples using chosen evaluation functions.

project_id: Optionally, this allows you to group your evaluation results with other evaluations within the project.
example_set_id, examples_dataframe: give one of these to specify your evaluation inputs.
evaluators: A mapping of evaluator names to evaluator functions. Each evaluator takes a DataFrame and produces one value per row.
Example: {"exact_match": some_exact_match_checking_function}
aggregated_evaluators: Like evaluators, but these functions take a DataFrame and produce a single value that aggregates over the entire input.

save_options: Controls backend storage options for your Evaluation Result.

lastmile_api_token: You can get one here https://lastmileai.dev/settings?page=tokens.
If None, this function will try to load it from a local .env file.


*Input Data (Examples)*

A RAG query example is essentially a row of data
containing fields like `query`, `context`, `prompt`, `groundTruth`, etc.

Examples can contain any data from your RAG Query Traces, as well as a groundTruth column.

The data is specified as either an example set ID or a DataFrame. If an example set ID is provided,
it will be downloaded from the LastMile API and evaluations will run locally.

If a DataFrame is provided, it will be used directly (also locally).

*Evaluators*

Each evaluator is a function that maps a DataFrame to a list of metric values, one float per row.
The idea is to apply an example-level evaluator to each row of the input DataFrame.

Accepts either a mapping of callables or a set of predefined default evaluator names.

Aggregated evaluators allow you to do custom aggregations over all the DataFrame rows (for example, some specific recall@precision).
If not provided, a few defaults will be used.
"""

def run_and_evaluate(
project_id: Optional[str],
run_query_fn: Callable[[str], str],
input_set_id: Optional[str] = None,
inputs: Optional[list[str]] = None,
ground_truths: Optional[list[str]] = None,
evaluators: Optional[
Mapping[
str,
Evaluator,
]
| set[str]
] = None,
aggregated_evaluators: Optional[
Mapping[
str,
AggregatedEvaluator,
]
] = None,
save_options: Optional[SaveOptions] = None,
n_trials: int = 1,
lastmile_api_token: Optional[str] = None,
) -> evaluation_lib.CreateEvaluationResponse:
"""
*Description*

Run a RAG query flow function on the given inputs,
then run evaluations on corresponding RAG query outputs using chosen evaluation functions.

project_id: Optionally, this allows you to group your evaluation results with other evaluations within the project.
run_query_fn: This should run or simulate your RAG query flow. It must either return a string output,
or a tuple (string, string) representing (output, rag_query_trace_id).
If you return the tuple, the evaluation results will be connected to the trace in the UI.

input_set_id, inputs: give exactly one of these to specify your RAG system inputs (query time input).
ground_truths: Optionally, provide ground truths (references) for each of your inputs.
This is only accepted if you give a list for your inputs.
If you give input_set_id, the library will fetch your ground truths from that input set and you must not give ground truths as a function argument.



evaluators: A mapping of evaluator names to evaluator functions. Each evaluator takes a DataFrame and produces one value per row.
Example: {"exact_match": some_exact_match_checking_function}
aggregated_evaluators: Like evaluators, but these functions take a DataFrame and produce a single value that aggregates over the entire input.

save_options: Controls backend storage options for your Example Set and Evaluation Result.

n_trials: This allows you to simulate a larger Example sample set by using your RAG query inputs N times each.

lastmile_api_token: You can get one here https://lastmileai.dev/settings?page=tokens.
If None, this function will try to load it from a local .env file.


*Input Data (Examples)*

See `evaluate()`.

*Evaluators*

See `evaluate()`.
"""

Metrics APIs

def get_default_evaluators(
names: set[str],
lastmile_api_token: Optional[str] = None
) -> Mapping[
str,
Evaluator,
]:
"""
Gets predefined evaluators that come built in with the LastMile Eval SDK.

names: Filter by these names, otherwise return all.
lastmile_api_token: The API token for the LastMile API. This is used for evaluator models hosted by LastMile.
If not provided, will try to get the token from the LASTMILE_API_TOKEN
environment variable.
You can create a token from the "API Tokens" section from this website:
https://lastmileai.dev/settings?page=tokens
"""


Current API:

def calculate_bleu_score(
outputs: Sequence[str], ground_truth: Sequence[str]
) -> list[float]:
"""
Calculate BLEU scores for a set of hypotheses against the corresponding ground truth references.


Args:
outputs (Sequence[str]): The generated outputs to evaluate.
ground_truth (Sequence[str]):
The reference outputs for evaluation.
Each ground_truth entry corresponds to one output.

Returns:
list[float]: A list of BLEU scores for each text-reference pair.

Raises:
ValueError: If the number of outputs and the number of sets of ground_truth are not equal.
"""

def calculate_exact_match_score(
outputs: Sequence[str], ground_truth: Sequence[str]
) -> list[float]:
"""
Calculate Exact Match score for a set of hypotheses against corresponding sets of reference outputs.


Args:
outputs (Sequence[str]): The generated outputs to evaluate.
ground_truth (Sequence[str]):
The reference outputs for evaluation.
Each ground_truth entry corresponds to one output.

Returns:
list[float]: A list of Exact Match scores for each text-reference pair.

Raises:
ValueError: If the number of outputs and the number of sets of ground_truth are not equal.
"""

def calculate_rouge1_score(
outputs: Sequence[str], ground_truth: Sequence[Sequence[str]]
) -> list[float]:
"""
Calculate Rouge-1 score for a set of hypotheses against corresponding sets of reference outputs.


Args:
outputs (Sequence[str]): The generated outputs to evaluate.
ground_truth (Sequence[Sequence[str]]):
The reference outputs for evaluation.
Each set of ground_truth corresponds to one text.

Returns:
list[float]: A list of Rouge-1 scores for each text-reference pair.

Raises:
ValueError: If the number of outputs and the number of sets of ground_truth are not equal.
"""

def calculate_relevance_score(
outputs: Sequence[str],
ground_truth: Sequence[str],
model_name: str = "gpt-3.5-turbo",
) -> list[float]:
"""
Evaluates the relevance of output strings against reference strings using a specific evaluation model,
and returns a list of float scores representing the relevance of each output-reference pair.

Args:
outputs (Sequence[str]): A sequence of output strings to be evaluated.
ground_truth (Sequence[str]): A sequence of reference strings to evaluate the outputs against.

Returns:
List[float]: A list of float scores indicating the relevance of each output-reference pair,
where 1.0 denotes 'relevant' and 0.0 denotes otherwise.
"""

def calculate_faithfulness_score(
outputs: Sequence[str],
ground_truth: Sequence[str],
inputs: Sequence[str],
lastmile_api_token: str,
) -> list[float]:
"""
Calculate faithfulness scores for the generated outputs, given the corresponding ground truth and inputs.
"""

def calculate_toxicity_score(
outputs: Sequence[str],
model_name: str = "gpt-3.5-turbo",
) -> list[float]:
"""

Args:
outputs (Sequence[str]): A sequence of input strings to be evaluated.

Returns:
List[float]
"""

def calculate_qa_score(
outputs: Sequence[str],
ground_truth: Sequence[str],
inputs: Sequence[str],
model_name: str = "gpt-3.5-turbo",
) -> list[float]:
"""

Args:
outputs (Sequence[str]): A sequence of input strings to be evaluated.

Returns:
List[float]
"""

def calculate_summarization_score(
outputs: Sequence[str],
ground_truth: Sequence[str],
model_name: str = "gpt-3.5-turbo",
) -> list[float]:
"""
Args:
inputs (Sequence[str]): A sequence of input strings to be evaluated.
ground_truth (Sequence[str]): A sequence of reference strings to evaluate the inputs against.

Returns:
List[float]: A list of float scores indicating the summary quality of each input-reference pair,
where 1.0 denotes 'good' and 0.0 denotes otherwise.
"""

def calculate_human_vs_ai_score(
outputs: Sequence[str],
ground_truth: Sequence[str],
inputs: Sequence[str],
model_name: str = "gpt-3.5-turbo",
) -> list[float]:
"""
Args:
inputs (Sequence[str]): A sequence of input strings to be evaluated.
ground_truth (Sequence[str]): A sequence of reference strings to evaluate the inputs against.

Returns:
List[float]: A list of float scores indicating the summary quality of each input-reference pair,
where 1.0 denotes 'good' and 0.0 denotes otherwise.
"""

def calculate_custom_llm_metric_example_sentiment(
outputs: Sequence[str],
model_name: str = "gpt-3.5-turbo",
) -> list[float]:
"""

Args:
outputs (Sequence[str]): The generated texts to evaluate.
model_name (str): The name of the evaluation model to use.

Returns:
list[float]: A list of custom sentiment scores for each text.
"""

def calculate_custom_llm_metric_example_semantic_similarity(
outputs: Sequence[str],
ground_truth: Sequence[str],
model_name: str = "gpt-3.5-turbo",
) -> list[float]:
"""

Args:
outputs (Sequence[str]): The generated texts to evaluate.
ground_truth (Sequence[str]): The reference texts to evaluate against.
model_name (str): The name of the evaluation model to use.

Returns:
list[float]: A list of custom similarity scores for each text.
"""


Deprecated / WIP

list_example_sets

def list_example_sets(
take: int = 10,
timeout: int = 60,
lastmile_api_token: Optional[str] = None,
) -> core.JSONObject:
"""
Get a list of test sets from the LastMile API.

Args:
take: The number of test sets to return. The default is 10.
lastmile_api_token: The API token for the LastMile API. If not provided,
will try to get the token from the LASTMILE_API_TOKEN
environment variable.
You can create a token from the "API Tokens" section from this website:
https://lastmileai.dev/settings?page=tokens
timeout: The maximum time in seconds to wait for the request to complete.
The default is 60.

Returns:
A dictionary containing the test sets.
"""

🚨 PLEASE READ - This API reference represents the Evaluation SDK as of 4/24/24. It requires an update, as we are making significant changes to the SDK.

download_rag_query_traces

Downloads the RAG query traces from the specified project.

def download_rag_query_traces(
lastmile_api_token: str | None = None,
project_id: str | None = None,
) -> core.DFRAGQueryTrace

Arguments:

  • lastmile_api_token - The API token to use for authentication. If not provided, the function attempts to get the token from the LASTMILE_API_TOKEN environment variable.
  • project_id - The ID of the project where the traces are stored.

Returns:

  • core.DFRAGQueryTrace - The downloaded RAG query traces.

create_test_set_from_rag_query_traces

Creates a test set from the given DataFrame of RAG query traces.

def create_test_set_from_rag_query_traces(
df_rag_query_traces: pd.DataFrame,
test_set_name: str,
ground_truth: list[str] | None = None,
lastmile_api_token: str | None = None,
) -> evaluation_lib.CreateTestSetsResult:

Arguments:

  • df_rag_query_traces - The DataFrame containing the RAG query traces.
  • test_set_name - The name of the test set to create.
  • ground_truth - Optional ground truth values to associate with the test set.
  • lastmile_api_token - Optional API token for authentication. If not provided, the function attempts to get the token from the LASTMILE_API_TOKEN environment variable.

Returns:

  • evaluation_lib.CreateTestSetsResult - The result of the test set creation.

download_test_set

Downloads the test set with the given ID.

def download_test_set(
test_set_id: str,
lastmile_api_token: str | None = None,
) -> pd.DataFrame

Arguments:

  • test_set_id - The ID of the test set to download.
  • lastmile_api_token - Optional API token for authentication. If not provided, the function attempts to get the token from the LASTMILE_API_TOKEN environment variable.

Returns:

  • pd.DataFrame - The downloaded test set.

list_test_sets

Gets a list of test sets from the LastMile API.

def list_test_sets(
take: int = 10,
lastmile_api_token: Optional[str] = None,
timeout: int = 60,
) -> dict[str, Any]

Arguments:

  • take - The number of test sets to return. Defaults to 10.
  • lastmile_api_token - Optional API token for authentication. If not provided, the function attempts to get the token from the LASTMILE_API_TOKEN environment variable.
  • timeout - The maximum time in seconds to wait for the request to complete. Defaults to 60.

Returns:

  • dict[str, Any] - A dictionary containing the test sets.

get_latest_test_set_id

Gets the ID of the latest test set.

def get_latest_test_set_id(
lastmile_api_token: Optional[str] = None,
) -> str

Arguments:

  • lastmile_api_token - Optional API token for authentication. If not provided, the function attempts to get the token from the LASTMILE_API_TOKEN environment variable.

Returns:

  • str - The ID of the latest test set.

run_and_store_evaluations

Runs the provided evaluators and stores the evaluation results so you can view results in the UI.

def run_and_store_evaluations(
test_set_id: str,
project_id: str,
trace_level_evaluators: dict[
str, Callable[[pd.DataFrame], Sequence[float]]
],
dataset_level_evaluators: dict[
str, Callable[[pd.DataFrame], Sequence[float]]
],
lastmile_api_token: str | None = None,
evaluation_set_name: str | None = None,
) -> evaluation_lib.CreateEvaluationsResult

Arguments:

  • test_set_id - The ID of the test set on which to run the evaluators.
  • project_id - The ID of the project in which to store the evaluations.
  • trace_level_evaluators - A dictionary of evaluators to run at trace level. Each evaluator function should return a batch of floats.
  • dataset_level_evaluators - A dictionary of evaluators to run at dataset level. Each evaluator function should return a batch of floats.
  • lastmile_api_token - Optional API token for authentication. If not provided, the function attempts to get the token from the LASTMILE_API_TOKEN environment variable.
  • evaluation_set_name - Optional name for the set of evaluations. If not provided, defaults to "Evaluation Set".

Returns:

  • evaluation_lib.CreateEvaluationsResult - The result of storing the evaluations.

run_evaluations

Runs the provided evaluators on the given DataFrame of a test set. This will not store results anywhere, so you will not see these results in the UI.

def run_evaluations(
df_test_set: pd.DataFrame,
trace_level_evaluators: dict[
str, Callable[[pd.DataFrame], Sequence[float]]
],
dataset_level_evaluators: dict[
str, Callable[[pd.DataFrame], Sequence[float]]
],
) -> tuple[pd.DataFrame | None, pd.DataFrame | None]

Arguments:

  • df_test_set - The DataFrame representing the test set.
  • trace_level_evaluators - A dictionary of evaluators to run at trace level. Each evaluator function should return a batch of floats.
  • dataset_level_evaluators - A dictionary of evaluators to run at dataset level. Each evaluator function should return a batch of floats.

Returns:

  • tuple[pd.DataFrame | None, pd.DataFrame | None] - A tuple containing the trace-level evaluations and the dataset-level evaluations.

store_evaluation_set_results

Stores the provided evaluation set results. This assumes you have already run evaluators on your test set and want to store the results so you can view them in the UI.

def store_evaluation_set_results(
project_id: str,
df_metrics_trace_level: pd.DataFrame | None = None,
df_metrics_dataset_level: pd.DataFrame | None = None,
lastmile_api_token: str | None = None,
evaluation_set_name: str | None = None,
) -> evaluation_lib.CreateEvaluationsResult

Arguments:

  • project_id - The ID of the project in which to store the results.
  • df_metrics_trace_level - Optional DataFrame containing the trace-level metrics. If not provided, no trace-level metrics are stored.
  • df_metrics_dataset_level - Optional DataFrame containing the dataset-level metrics. If not provided, no dataset-level metrics are stored.
  • lastmile_api_token - Optional API token for authentication. If not provided, the function attempts to get the token from the LASTMILE_API_TOKEN environment variable.
  • evaluation_set_name - Optional name for the set of evaluations. If not provided, defaults to "Evaluation Set".

Returns:

  • evaluation_lib.CreateEvaluationsResult - The result of storing the evaluations.
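
For the deprecated flow, a sketch of running evaluators locally and then persisting the results, assuming the signatures above. Import paths are hypothetical, the test set is assumed to contain 'output' and 'groundTruth' columns, and exact_match_evaluator is the per-row evaluator from the types sketch earlier in this reference.

from lastmile_eval import (  # hypothetical import paths
    download_test_set,
    run_evaluations,
    store_evaluation_set_results,
)

df_test_set = download_test_set(test_set_id="my-test-set-id")

df_trace_metrics, df_dataset_metrics = run_evaluations(
    df_test_set=df_test_set,
    trace_level_evaluators={"exact_match": exact_match_evaluator},
    dataset_level_evaluators={},
)

store_evaluation_set_results(
    project_id="my-project-id",
    df_metrics_trace_level=df_trace_metrics,
    df_metrics_dataset_level=df_dataset_metrics,
    evaluation_set_name="offline-eval",
)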