diff --git a/docusaurus.config.js b/docusaurus.config.js index d496858db..2aa88501f 100644 --- a/docusaurus.config.js +++ b/docusaurus.config.js @@ -46,12 +46,6 @@ const config = { label: "stable", badge: false, }, - old: { - label: "old", - banner: "unmaintained", - badge: false, - path: "old", - }, }, sidebarPath: require.resolve("./sidebars.js"), remarkPlugins: [ diff --git a/vercel.json b/vercel.json index e105ac49a..3e57e5c27 100644 --- a/vercel.json +++ b/vercel.json @@ -2,89 +2,37 @@ "cleanUrls": true, "trailingSlash": false, "redirects": [ - { - "source": "/evaluation/custom-evaluators", - "destination": "/evaluation/faq/custom-evaluators" - }, { "source": "/category/tracing", - "destination": "/tracing" - }, - { - "source": "category/testing--evaluation", - "destination": "/evaluation" - }, - { - "source": "/category/langsmith-cookbook", - "destination": "/cookbook" - }, - { - "source": "/tracing/tracing-faq", - "destination": "/tracing/faq" - }, - { - "source": "/category/organizations", - "destination": "/" - }, - { - "source": "/organizations", - "destination": "/" + "destination": "/observability" }, { "source": "/category/organizations/:path*", "destination": ":path*" }, - { - "source": "/cookbook/testing-examples/prod-candidate-testing", - "destination": "/cookbook/testing-examples/backtesting" - }, { "source": "/evaluation/faq/datasets-client", - "destination": "/evaluation/faq/manage-datasets#how-to-manage-datasets-programmatically" + "destination": "/evaluation/how_to_guides/manage_datasets_programmatically" }, { "source": "/evaluation/faq/datasets-webapp", - "destination": "/evaluation/faq/manage-datasets" - }, - { - "source": "/overview", - "destination": "/" - }, - { - "source": "/docs/:path*", - "destination": "/:path*" + "destination": "/evaluation/how_to_guides/manage_datasets_in_application#manage-a-dataset" }, { "source": "/monitoring/:path*", - "destination": "/old/monitoring/:path*" - }, - { - "source": "/proxy/:path*", - "destination": "/old/proxy/:path*" + "destination": "/observability/how_to_guides#monitoring" }, { "source": "/tracing/:path*", - "destination": "/old/tracing/:path*" - }, - { - "source": "/cookbook/:path*", - "destination": "/old/cookbook/:path*" + "destination": "/observability" }, { "source": "/hub/:path*", - "destination": "/old/hub/:path*" - }, - { - "source": "/user_guide", - "destination": "/old/user_guide" + "destination": "/prompt_engineering/how_to_guides#prompt-hub" }, { "source": "/category/prompt-hub/:path*", - "destination": "/old/category/prompt-hub/:path*" - }, - { - "source": "/category/proxy/:path*", - "destination": "/old/category/proxy/:path*" + "destination": "/prompt_engineering/how_to_guides#prompt-hub" }, { "source": "/category/release-notes", @@ -92,47 +40,35 @@ }, { "source": "/how_to_guides/evaluation/:path*", - "destination": "/evaluation/how_to_guides/evaluation/:path*" + "destination": "/evaluation/how_to_guides/evaluation" }, { "source": "/how_to_guides/human_feedback/:path*", - "destination": "/evaluation/how_to_guides/human_feedback/:path*" + "destination": "/evaluation/how_to_guides#annotation-queues-and-human-feedback" }, { "source": "/how_to_guides/datasets/:path*", - "destination": "/evaluation/how_to_guides/datasets/:path*" + "destination": "/evaluation/how_to_guides#dataset-management" }, { "source": "/how_to_guides/monitoring/:path*", - "destination": "/observability/how_to_guides/monitoring/:path*" + "destination": "/observability/how_to_guides" }, { "source": "/how_to_guides/tracing/:path*", - 
"destination": "/observability/how_to_guides/tracing/:path*" + "destination": "/observability/how_to_guides" }, { "source": "/how_to_guides/prompts/:path*", - "destination": "/prompt_engineering/how_to_guides/prompts/:path*" + "destination": "/prompt_engineering/how_to_guides#prompt-hub" }, { "source": "/how_to_guides/playground/:path*", - "destination": "/prompt_engineering/how_to_guides/playground/:path*" + "destination": "/prompt_engineering/how_to_guides#playground" }, { "source": "/how_to_guides/setup/:path*", - "destination": "/administration/how_to_guides/organization_management/:path*" - }, - { - "source": "/how_to_guides", - "destination": "/" - }, - { - "source": "/concepts", - "destination": "/" - }, - { - "source": "/tutorials", - "destination": "/" + "destination": "/administration/how_to_guides/organization_management" }, { "source": "/concepts/admin:path*", @@ -156,7 +92,7 @@ }, { "source": "/pricing:path*", - "destination": "/pricing" + "destination": "https://www.langchain.com/pricing-langsmith" }, { "source": "/tutorials/Developers/observability", @@ -220,7 +156,7 @@ }, { "source": "/evaluation/how_to_guides/unit_testing(/?)", - "destination": "/evauation/how_to_guides/pytest" + "destination": "/evaluation/how_to_guides/pytest" }, { "source": "/observability/how_to_guides/tracing/:path*", diff --git a/versioned_docs/version-old/evaluation/_check.py b/versioned_docs/version-old/evaluation/_check.py deleted file mode 100644 index ea9e8fcba..000000000 --- a/versioned_docs/version-old/evaluation/_check.py +++ /dev/null @@ -1,23 +0,0 @@ -import re - -with open('quickstart.mdx', 'r') as file: - all_mdx_content = file.read() - -code_block_pattern = re.compile(r'content: `([^`]*)`', re.DOTALL) - -by_code_tabs = all_mdx_content.split(" dict: - reference = example.outputs["answer"] - prediction = run.outputs["output"] - score = prediction.lower() == reference.lower() - return {"key": "exact_match", "score": score} -``` - -Let's break this down: - -- The evaluator function accepts a `Run` and `Example` and returns a dictionary with the evaluation key and score. The run contains the full trace of your pipeline, and the example contains the inputs and outputs for this data point. If your dataset contains labels, they are found in the `example.outputs` dictionary, which is kept separate to keep your model from cheating. -- In our dataset, the outputs have an "answer" key that contains the reference answer. Your pipeline generates predictions as a dictionary with an "output" key. -- It compares the prediction and reference (case-insensitive) and returns a dictionary with the evaluation key and score. - -You can use this evaluator directly in the `evaluate` function: - -```python -from langsmith.evaluation import evaluate - -evaluate( - , - data="", - evaluators=[exact_match], -) -``` - -#### Example 2: Parametrizing your evaluator - -You may want to parametrize your evaluator as a class. This is useful when you need to pass additional parameters to the evaluator. 
- -```python -from langsmith.evaluation import evaluate -from langsmith.schemas import Example, Run - -class BlocklistEvaluator: - def __init__(self, blocklist: list[str]): - self.blocklist = blocklist - def __call__( - self, run: Run, example: Example | None = None - ) -> dict: - model_outputs = run.outputs["output"] - score = not any([word in model_outputs for word in self.blocklist]) - return {"key": "blocklist", "score": score} - - -evaluate( - , - data="", - evaluators=[BlocklistEvaluator(blocklist=["bad", "words"])], -) -``` - -#### Example 3: Evaluating nested traces - -While most evaluations are applied to the inputs and outputs of your system, you can also evaluate all of the subcomponents that are traced within your pipeline. - -This is possible by stepping through the `run` object and collecting the outputs of each component. - -As a simple example, let's assume you want to evaluate the expected tools that are invoked in a pipeline. - -```python -from langsmith.evaluation import evaluate -from langsmith.schemas import Example, Run - -def evaluate_trajectory(run: Run, example: Example) -> dict: - # collect the tools on level 1 of the trace tree - steps = [child.name for child in run.child_runs if child.run_type == "tool"] - expected_steps = example.outputs["expected_tools"] - score = len(set(steps) & set(expected_steps)) / len(set(steps) | set(expected_steps)) - return {"key": "tools", "score": score} -``` - -This lets you grade the performance of intermediate steps in your pipeline. - -Note: the example above assumes tools are properly typed in the trace tree. - -#### Example 3: Structured Output - -With function calling, it has become easier than ever to generate feedback metrics using LLMs as a judge simply by specifying a Pydantic schema for the output. - -Below is an example (in this case using OpenAI's tool calling functionality) to evaluate RAG app faithfulness. 
- -````python -import json -from typing import List - -import openai -from langsmith.schemas import Example, Run -from pydantic import BaseModel, Field - -openai_client = openai.Client() - - -class Propositions(BaseModel): - propositions: List[str] = Field( - description="The factual propositions generated by the model" - ) - - -class FaithfulnessScore(BaseModel): - reasoning: str = Field(description="The reasoning for the faithfulness score") - score: bool - - -def faithfulness(run: Run, example: Example) -> dict: - # Assumes your RAG app includes the prediction in the "output" key in its response - response: str = run.outputs["output"] - # Assumes your RAG app includes the retrieved docs as a "context" key in the outputs - # If not, you can fetch from the child_runs of the run object - retrieved_docs: list = run.outputs["context"] - formatted_docs = "\n".join([str(doc) for doc in retrieved_docs]) - extracted = openai_client.chat.completions.create( - model="gpt-4-turbo-preview", - messages=[ - { - "role": "user", - "content": "Extract all factual propositions from the following text:\n\n" - f"```\n{response}\n```", - }, - ], - tools=[ - { - "type": "function", - "function": { - "name": "Propositions", - "description": "Use to record each factual assertion.", - "parameters": Propositions.model_json_schema(), - }, - } - ], - tool_choice={"type": "function", "function": {"name": "Propositions"}}, - ) - propositions = [ - prop - for tc in extracted.choices[0].message.tool_calls - for prop in json.loads(tc.function.arguments)["propositions"] - ] - scores, reasoning = [], [] - tools = [ - { - "type": "function", - "function": { - "name": "FaithfulnessScore", - "description": "Use to score how faithful the propositions are to the docs.", - "parameters": FaithfulnessScore.model_json_schema(), - }, - } - ] - for proposition in propositions: - faithfulness_completion = openai_client.chat.completions.create( - model="gpt-4-turbo-preview", - messages=[ - { - "role": "user", - "content": "Grade whether the proposition can be logically concluded" - f" from the docs:\n\nProposition: {proposition}\nDocs:\n" - f"```\n{formatted_docs}\n```", - }, - ], - # highlight-next-line - tools=tools, - tool_choice={"type": "function", "function": {"name": "FaithfulnessScore"}}, - ) - faithfulness_args = json.loads( - faithfulness_completion.choices[0].message.tool_calls[0].function.arguments - ) - scores.append(faithfulness_args["score"]) - reasoning.append(faithfulness_args["reasoning"]) - average_score = sum(scores) / len(scores) if scores else None - comment = "\n".join(reasoning) - return {"key": "faithfulness", "score": average_score, "comment": comment} - -```` - -#### Example 4: Returning Multiple Scores - -A single evaluator can return multiple scores. An example of when this might be useful is if you are using tool calling for an LLM-as-judge to extract multiple metrics in a single API call. 
- -```python -import json - -import openai -from langsmith.schemas import Example, Run -from pydantic import BaseModel, Field - -# Initialize the OpenAI client -openai_client = openai.Client() - -class Scores(BaseModel): - correctness_reasoning: str = Field(description="The reasoning for the correctness score") - correctness: float = Field(description="The score for the correctness of the prediction") - conciseness_reasoning: str = Field(description="The reasoning for the conciseness score") - conciseness: float = Field(description="The score for the conciseness of the prediction") - -def multiple_scores(run: Run, example: Example) -> dict: - reference = example.outputs["answer"] - prediction = run.outputs["output"] - - messages = [ - { - "role": "user", - "content": f"Reference: {reference}\nPrediction: {prediction}" - }, - ] - - tools = [ - { - "type": "function", - "function": { - "name": "Scores", - "description": "Use to evaluate the correctness and conciseness of the prediction.", - "parameters": Scores.model_json_schema(), - }, - } - ] - - # Generating the chat completion with structured output - completion = openai_client.chat.completions.create( - model="gpt-4-turbo-preview", - messages=messages, - tools=tools, - tool_choice={"type": "function", "function": {"name": "Scores"}}, - ) - - # Extracting structured scores from the completion - scores_args = json.loads(completion.choices[0].message.tool_calls[0].function.arguments) - - return { - # highlight-next-line - "results": [ - # Provide the key, score and other relevant information for each metric - # highlight-next-line - {"key": "correctness", "score": scores_args["correctness"], "comment": scores_args["correctness_reasoning"]}, - {"key": "conciseness", "score": scores_args["conciseness"], "comment": scores_args["conciseness_reasoning"]} - ] - } -``` - -#### Example 5: Perplexity Evaluator - -The flexibility of the functional interface means you can easly apply evaluators from any other libraries. For instance, you may want to use statistical measures such as [`perplexity`](https://huggingface.co/spaces/evaluate-metric/perplexity) to grade your run output. Below is an example using the [evaluate](https://huggingface.co/docs/evaluate/index) package by HuggingFace, which contains numerous commonly used metrics. Start by installing the `evaluate` package by running `pip install evaluate`. - -```python -from evaluate import load -from langsmith.schemas import Example, Run -from langsmith.evaluation import RunEvaluator - -class PerplexityEvaluator(RunEvaluator): - def __init__(self, prediction_key: Optional[str] = None, model_id: str = "gpt-2"): - self.prediction_key = prediction_key - self.model_id = model_id - self.metric_fn = load("perplexity", module_type="metric") - def evaluate_run( - self, run: Run, example: Example - ) -> dict: - if run.outputs is None: - raise ValueError("Run outputs cannot be None") - prediction = run.outputs[self.prediction_key] - results = self.metric_fn.compute( - predictions=[prediction], model_id=self.model_id - ) - ppl = results["perplexities"][0] - return {"key": "Perplexity", "score": ppl} -``` - -Let's break down what the `PerplexityEvaluator` is doing: - -- **Initialize**: In the constructor, we're setting up a few properties that will be needed later on. - - `prediction_key`: The key to find the model's prediction in the outputs of a run. - - `model_id`: The ID of the language model you want to use to compute the metric. In our example, we are using 'gpt-2'. 
- - `metric_fn`: The evaluation metric function, loaded from the HuggingFace `evaluate` package. -- **Evaluate**: This method takes a run (and optionally an example) and returns an evaluation dictionary. - - If the run outputs are `None`, the evaluator raises an error. - - Otherwise, the outputs are passed to the `metric_fn` to compute the perplexity. The perplexity score is then returned as part of the evaluation dictionary. - Once you've defined your evaluators, you can use them to evaluate your model: - -```python -from langsmith.evaluation import evaluate -evaluate( - , - data="", - evaluators=[BlocklistEvaluator(blocklist=["bad", "words"]), PerplexityEvaluator(), is_empty], -) -``` - -## Summary Evaluators - -Some metrics can only be defined on the entire experiment level as opposed to the individual runs of the experiment. For example, you may want to compute the f1 score of a classifier across all runs in an experiment kicked off from a dataset. These are called `summary_evaluators`. Instead of taking in a single `Run` and `Example`, these evaluators take a list of each. - -```python -from typing import List -from langsmith.schemas import Example, Run -from langsmith.evaluation import evaluate - -def f1_score_summary_evaluator(runs: List[Run], examples: List[Example]) -> dict: - true_positives = 0 - false_positives = 0 - false_negatives = 0 - for run, example in zip(runs, examples): - # Matches the output format of your dataset - reference = example.outputs["answer"] - # Matches the output dict in `predict` function below - prediction = run.outputs["prediction"] - if reference and prediction == reference: - true_positives += 1 - elif prediction and not reference: - false_positives += 1 - elif not prediction and reference: - false_negatives += 1 - if true_positives == 0: - return {"key": "f1_score", "score": 0.0} - - precision = true_positives / (true_positives + false_positives) - recall = true_positives / (true_positives + false_negatives) - f1_score = 2 * (precision * recall) / (precision + recall) - return {"key": "f1_score", "score": f1_score} - -def predict(inputs: dict): - return {"prediction": True} - -evaluate( - predict, # Your classifier - data="", - summary_evaluators=[f1_score_summary_evaluator], -) -``` - -### Recap - -Congratulations! You created a custom evaluation chain you can apply to _any_ traced run so you can surface more relevant information in your application. -LangChain's evaluation chains speed up the development process for application-specific, semantically robust evaluations. -You can also extend existing components from the library so you can focus on building your product. All your evals come with: - -- Automatic tracing integrations to help you debug, compare, and improve your code -- Easy sharing and mixing of components and results -- Out-of-the-box support for sync and async evaluations for faster runs diff --git a/versioned_docs/version-old/evaluation/faq/evaluator-implementations.mdx b/versioned_docs/version-old/evaluation/faq/evaluator-implementations.mdx deleted file mode 100644 index 80d3476d9..000000000 --- a/versioned_docs/version-old/evaluation/faq/evaluator-implementations.mdx +++ /dev/null @@ -1,338 +0,0 @@ ---- -sidebar_label: Use Off-the-Shelf Evaluators -sidebar_position: 2 ---- - -import { - CodeTabs, - PythonBlock, - TypeScriptBlock, -} from "@site/src/components/InstructionsWithCode"; - -# How to Use Off-the-Shelf Evaluators - -LangChain's evaluation module provides evaluators you can use as-is for common evaluation scenarios. 
- -It's easy to use these by passing them to the `evaluators` argument of the `evaluate()` function. - -Copy the code snippets below to get started. You can also configure them for your applications using the arguments mentioned in the "Parameters" sections. -If you don't see an implementation that suits your needs, you can learn how to create your own [Custom Run Evaluators](custom-evaluators) in the linked guide, or contribute an string evaluator to the [LangChain repository](https://github.com/langchain-ai/langchain/blob/master/libs/langchain/langchain/evaluation/). - -:::note -Most of these evaluators are useful but imperfect! We recommend against blind trust of any single automated metric and to always incorporate them as a part of a holistic testing and evaluation strategy. -Many of the LLM-based evaluators return a binary score for a given data point, so measuring differences in prompt or model performance are most reliable in aggregate over a larger dataset. -::: - -## Overview - -The following table enumerates the off-the-shelf evaluators available in LangSmith, along with their output keys and a simple code sample. - -| Evaluator name | Output Key | Simple Code Example | -| -------------------- | --------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| QA | `correctness` | `LangChainStringEvaluator("qa")` | -| Contextual Q&A | `contextual accuracy` | `LangChainStringEvaluator("context_qa")` | -| Chain of Thought Q&A | `cot contextual accuracy` | `LangChainStringEvaluator("cot_qa")` | -| Criteria | Depends on criteria key | `LangChainStringEvaluator("criteria", config={ "criteria": })`

`criterion` may be one of the default implemented criteria: `conciseness`, `relevance`, `correctness`, `coherence`, `harmfulness`, `maliciousness`, `helpfulness`, `controversiality`, `misogyny`, and `criminality`.

Or, you may define your own criteria in a custom dict as follows:
`{ "criterion_key": "criterion description" }` | -| Labeled Criteria | Depends on criteria key | `LangChainStringEvaluator("labeled_criteria", config={ "criteria": })`

`criterion` may be one of the default implemented criteria: `conciseness`, `relevance`, `correctness`, `coherence`, `harmfulness`, `maliciousness`, `helpfulness`, `controversiality`, `misogyny`, and `criminality`.

Or, you may define your own criteria in a custom dict as follows:
`{ "criterion_key": "criterion description" }` | -| Score | Depends on criteria key | `LangChainStringEvaluator("score_string", config={ "criteria": , "normalize_by": 10 })`

`criterion` may be one of the default implemented criteria: `conciseness`, `relevance`, `correctness`, `coherence`, `harmfulness`, `maliciousness`, `helpfulness`, `controversiality`, `misogyny`, and `criminality`.

Or, you may define your own criteria in a custom dict as follows:
`{ "criterion_key": "criterion description" }`. Scores are out of 10, so normalize_by will cast this to a score from 0 to 1. | -| Labeled Score | Depends on criteria key | `LangChainStringEvaluator("labeled_score_string", config={ "criteria": , "normalize_by": 10 })`

`criterion` may be one of the default implemented criteria: `conciseness`, `relevance`, `correctness`, `coherence`, `harmfulness`, `maliciousness`, `helpfulness`, `controversiality`, `misogyny`, and `criminality`.

Or, you may define your own criteria in a custom dict as follows:
`{ "criterion_key": "criterion description" }`. Scores are out of 10, so normalize_by will cast this to a score from 0 to 1. | -| Embedding distance | `embedding_cosine_distance` | `LangChainStringEvaluator("embedding_distance")` | -| String Distance | `string_distance` | `LangChainStringEvaluator("string_distance", config={"distance": "damerau_levenshtein" })`

`distance` defines the string difference metric to be applied, such as `levenshtein` or `jaro_winkler`. | -| Exact Match | `exact_match` | `LangChainStringEvaluator("exact_match")` | -| Regex Match | `regex_match` | `LangChainStringEvaluator("regex_match")` | -| Json Validity | `json_validity` | `LangChainStringEvaluator("json_validity")` | -| Json Equality | `json_equality` | `LangChainStringEvaluator("json_equality")` | -| Json Edit Distance | `json_edit_distance` | `LangChainStringEvaluator("json_edit_distance")` | -| Json Schema | `json_schema` | `LangChainStringEvaluator("json_schema")` | - -## Correctness: QA evaluation - -QA evalutors help to measure the correctness of a response to a user query or question. If you have a dataset with reference labels or reference context docs, these are the evaluators for you! -Three QA evaluators you can load are: `"qa"`, `"context_qa"`, `"cot_qa"`. Based on our meta-evals, we recommend using `"cot_qa"` or a similar prompt for best results. - -- The `"qa"` evaluator ([reference](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.qa.eval_chain.QAEvalChain.html#langchain-evaluation-qa-eval-chain-qaevalchain)) instructs an llm to directly grade a response as "correct" or "incorrect" based on the reference answer. -- The `"context_qa"` evaluator ([reference](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.qa.eval_chain.ContextQAEvalChain.html#langchain.evaluation.qa.eval_chain.ContextQAEvalChain)) instructs the LLM chain to use reference "context" (provided throught the example outputs) in determining correctness. This is useful if you have a larger corpus of grounding docs but don't have ground truth answers to a query. -- The `"cot_qa"` evaluator ([reference](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.qa.eval_chain.CotQAEvalChain.html#langchain.evaluation.qa.eval_chain.CotQAEvalChain)) is similar to the "context_qa" evaluator, except it instructs the LLMChain to use chain of thought "reasoning" before determining a final verdict. This tends to lead to responses that better correlate with human labels, for a slightly higher token and runtime cost. - -, - data="", - evaluators=[qa_evaluator, context_qa_evaluator, cot_qa_evaluator], - metadata={"revision_id": "the version of your pipeline you are testing"}, -)`), - ]} - groupId="client-language" -/> -You can customize the evaluator by specifying the LLM used to power its LLM chain -or even by customizing the prompt itself. Below is an example using an Anthropic -model to run the evaluator, and a custom prompt for the base QA evaluator. Check -out the reference docs for more information on the expected prompt format. -, - data="", - evaluators=[qa_evaluator, context_qa_evaluator, cot_qa_evaluator], -) -`), - ]} - groupId="client-language" -/> - -## Criteria Evaluators (No Labels) - -If you don't have ground truth reference labels, you can evaluate your run against a custom set of criteria using the `"criteria"` or `"score"` evaluators. These are helpful when there are high level semantic aspects of your model's output you'd like to monitor that aren't captured by other explicit checks or rules. 
- -- The `"criteria"` evaluator ([reference](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.criteria.eval_chain.CriteriaEvalChain.html#langchain.evaluation.criteria.eval_chain.CriteriaEvalChain)) instructs an LLM to assess if a prediction satisfies the given criteria, outputting a binary score -- The `"score_string"` evaluator ([reference](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.scoring.eval_chain.ScoreStringEvalChain.html#langchain.evaluation.scoring.eval_chain.ScoreStringEvalChain)) has the LLM score the prediction on a numeric scale (default 1-10) based on how well it satisfies the criteria - -, - data="", - evaluators=[ - criteria_evaluator, - score_evaluator - ], - metadata={"revision_id": "the version of your pipeline you are testing"}, -)`), - ]} - groupId="client-language" -/> - -::::tip Supported Criteria -Default criteria are implemented for the following aspects: conciseness, relevance, correctness, coherence, harmfulness, maliciousness, helpfulness, controversiality, misogyny, and criminality. -To specify custom criteria, write a mapping of a criterion name to its description, such as: - -``` - criterion = {"creativity": "Is this submission creative, imaginative, or novel?"} - criteria_evaluator = LangChainStringEvaluator( - "labeled_criteria", - config={"criteria": criterion} - ) -``` - -:::: -::::tip Interpreting the Score -Evaluation scores don't have an inherent "direction" (i.e., higher is not necessarily better). -The direction of the score depends on the criteria being evaluated. For example, a score of 1 for "helpfulness" means that the prediction was deemed to be helpful by the model. -However, a score of 1 for "maliciousness" means that the prediction contains malicious content, which, of course, is "bad". -:::: - -## Criteria Evaluators (With Labels) - -If you have ground truth reference labels, you can evaluate your run against custom criteria while also providing that reference information to the LLM using the `"labeled_criteria"` or `"labeled_score_string"` evaluators. - -- The `"labeled_criteria"` evaluator ([reference](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.criteria.eval_chain.LabeledCriteriaEvalChain.html#langchain.evaluation.criteria.eval_chain.LabeledCriteriaEvalChain)) instructs an LLM to assess if a prediction satisfies the criteria, taking into account the reference label -- The `"labeled_score_string"` evaluator ([reference](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.scoring.eval_chain.LabeledScoreStringEvalChain.html#langchain.evaluation.scoring.eval_chain.LabeledScoreStringEvalChain)) has the LLM score the prediction on a numeric scale based on how well it satisfies the criteria compared to the reference - -, - data="", - evaluators=[ - labeled_criteria_evaluator, - labeled_score_evaluator - ], - metadata={"revision_id": "the version of your pipeline you are testing"}, -)`), - ]} - groupId="client-language" -/> - -## JSON Evaluators - -Evaluating extraction and function calling applications often comes down to validating that the LLM's string output can be parsed correctly and comparing it to a reference object. 
The JSON evaluators provide functionality to check your model's output consistency: - -- The `"json_validity"` ([reference](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.parsing.base.JsonValidityEvaluator.html#langchain.evaluation.parsing.base.JsonValidityEvaluator)) evaluator checks if a prediction is valid JSON -- The `"json_equality"` ([reference](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.parsing.base.JsonEqualityEvaluator.html#langchain.evaluation.parsing.base.JsonEqualityEvaluator)) evaluator checks if a JSON prediction exactly matches a JSON reference, after normalization -- The `"json_edit_distance"` ([reference](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.parsing.json_distance.JsonEditDistanceEvaluator.html#langchain.evaluation.parsing.json_distance.JsonEditDistanceEvaluator)) evaluator computes the normalized edit distance between a JSON prediction and reference -- The `"json_schema"` ([reference](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.parsing.json_schema.JsonSchemaEvaluator.html#langchain.evaluation.parsing.json_schema.JsonSchemaEvaluator)) evaluator checks if a JSON prediction satisfies a provided JSON schema - -, - data="", - evaluators=[ - json_validity_evaluator, - json_equality_evaluator, - json_edit_distance_evaluator, - json_schema_evaluator - ], -)`), - ]} - groupId="client-language" -/> - -## String or Embedding Distance - -To measure the similarity between a predicted string and a reference, you can use string distance metrics: - -- The `"string_distance"` ([reference](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.string_distance.base.StringDistanceEvalChain.html#langchain.evaluation.string_distance.base.StringDistanceEvalChain)) evaluator computes a normalized string edit distance between the prediction and reference -- The `"embedding_distance"` ([reference](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.embedding_distance.base.EmbeddingDistance.html#langchain.evaluation.embedding_distance.base.EmbeddingDistance)) evaluator computes the distance between the text embeddings of the prediction and reference -- The `"exact_match"` ([reference](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.exact_match.base.ExactMatchStringEvaluator.html#langchain.evaluation.exact_match.base.ExactMatchStringEvaluator)) evaluator checks for an exact string match between prediction and reference - -, - data="", - evaluators=[ - string_distance_evaluator, - embedding_distance_evaluator, - exact_match_evaluator - ], -)`), - ]} - groupId="client-language" -/> - -## Regex Match - -To evaluate predictions against a reference regular expression pattern, you can use the `"regex_match"` ([reference](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.regex_match.base.RegexMatchStringEvaluator.html#langchain.evaluation.regex_match.base.RegexMatchStringEvaluator)) evaluator. -The pattern is provided as a string in the example outputs of the dataset. The evaluator checks if the prediction matches the pattern. - -, - data="", - evaluators=[regex_evaluator], -)`), - ]} - groupId="client-language" -/> - -## Don't see what you're looking for? - -These implementations are just a starting point. We want to work with you to build better off-the-shelf evaluation tools for everyone. -We'd love feedback and contributions! 
Send us feedback at support@langchain.dev, check out the [Evaluators](https://python.langchain.com/docs/guides/evaluation/) in LangChain or submit PRs or issues directly to better address your needs. diff --git a/versioned_docs/version-old/evaluation/faq/experiments-app.mdx b/versioned_docs/version-old/evaluation/faq/experiments-app.mdx deleted file mode 100644 index a74b3ac39..000000000 --- a/versioned_docs/version-old/evaluation/faq/experiments-app.mdx +++ /dev/null @@ -1,54 +0,0 @@ ---- -sidebar_label: Run Experiments in Browser (no code) -sidebar_position: 8 ---- - -# How to run experiments in the prompt playground (no code) - -While you can kick off experiments easily using the sdk, as outlined [here](../quickstart), it's often useful to run experiments directly in the [prompt playground](../../hub/quickstart#2-try-out-a-prompt-in-the-playground). - -This allows you to test your prompt / model configuration over a series of inputs to see how well it generalizes across different contexts or scenarios, without having to write any code. - -## Kicking off an experiment in the prompt playground - -1. **Navigate to the prompt playground** by clicking on "Hub" in the sidebar, then selecting a prompt from the list of available prompts or creating a new one. -2. **Select the "Switch to dataset" button** to switch to the dataset you want to use for the experiment. Please note that the dataset keys of the dataset inputs must match the input variables of the prompt. In the below sections, note that the selected dataset has inputs with keys "text", which correctly match the input variable of the prompt. Also note that there is a max capacity of 15 inputs for the prompt playground. - ![Switch to dataset](../static/switch_to_dataset.png) -3. **Click on the "Start" button** or CMD+Enter to start the experiment. This will run the prompt over all the examples in the dataset and create an entry for the experiment in the dataset details page. Note that you need to commit the prompt to the prompt hub before you can start the experiment to ensure it can be referenced in the experiment. The result for each input will be streamed and displayed inline for each input in the dataset. - ![Input variables](../static/input_variables_playground.png) -4. **View the results** by clicking on the "View Dataset Run Table" button. This will take you to the experiment details page where you can see the results of the experiment. - ![View results](../static/view_results.png) -5. **Navigate back to the commit page** by clicking on the "View Commit" button. This will take you back to the prompt page where you can make changes to the prompt and run more experiments. The "View Commit" button is available to all experiments that were run from the prompt playground. The experiment is prefixed with the prompt repository name, a unique identifier, and the date and time the experiment was run. - ![Playground experiment results](../static/playground_experiment_results.png) - -## Adding evaluation scores to the experiment - -Kicking off an experiment is no fun without actually running evaluations on the results. You can add evaluation scores to the experiment by configuring an automation rule for the dataset, again without writing any code. This will allow you to add evaluation scores to the experiment and compare the results across different experiments. -It's also possible to add human annotations to the runs of any experiment. 
- -We currently support configuring LLM-as-a-judge evaluators on datasets that will evaluate the results of each run in each experiment kicked off from that dataset. - -The process for configuring this is very similar to the process for configuring an [online evaluator](../../monitoring/faq/online_evaluation) for your tracing projects. - -1. **Navigate to the dataset details page** by clicking "Datasets and Testing" in the sidebar and selecting the dataset you want to configure the evaluator for. -2. **Click on the "Add Evaluator" button** to add an evaluator to the dataset. This will open a modal you can use to configure the evaluator. - - ![Add Evaluator](../static/add_evaluator.png) - -3. **Give your evaluator a name** and **set an inline prompt or load a prompt from the prompt hub** that will be used to evaluate the results of the runs in the experiment. - - ![Add evaluator name and prompt](../static/create_evaluator.png) - - Importantly, evaluator prompts can only contain the following input variables: - - - `input` (required): the input to the target you are evaluating - - `output` (required): the output of the target you are evaluating - - `reference`: the reference output, taken from the dataset - - You can specify the scoring criteria in the "schema" field. In this example, we are asking the LLM to grade on "correctness" of the output with respect to the reference, with a boolean output of 0 or 1. The name of the field in the schema will be interpreted as the feedback key and the type will be the type of the score. - - ![Evaluator prompt](../static/evaluator_prompt.png) - -4. **Save the evaluator** and navigate back to the dataset details page. Each subsequent experiment run from the dataset will now be evaluated by the evaluator you configured. Note that in the below image, each run in the experiment has a "correctness" score. - - ![Playground evaluator results](../static/playground_evaluator_results.png) diff --git a/versioned_docs/version-old/evaluation/faq/index.mdx b/versioned_docs/version-old/evaluation/faq/index.mdx deleted file mode 100644 index c12707503..000000000 --- a/versioned_docs/version-old/evaluation/faq/index.mdx +++ /dev/null @@ -1,26 +0,0 @@ ---- -sidebar_label: How-To Guides -sidebar_position: 0 ---- - -# How-To Guides - -In this section you will find guides for how to do specific evaluation related things. 
- -**Datasets** - -- [How to manage datasets in the app](faq/manage-datasets) -- [How to manage datasets programmatically](faq/manage-datasets#manage-datasets-programmatically) -- [How to list datasets from the client](faq/manage-datasets#list-datasets-from-the-client) -- [How to version datasets](faq/version-datasets) -- [How to list datapoints from the client](faq/manage-datasets#list-examples-from-the-client) -- [How to use synthetic data](faq/synthetic-data) - -**Evaluators** - -- [How to create custom evaluators](faq/custom-evaluators) -- [How to use off-the-shelf LangChain evaluators](faq/evaluator-implementations) - -**Experiments** - -- [How to run experiments in the prompt playground](faq/experiments-app) diff --git a/versioned_docs/version-old/evaluation/faq/manage-datasets.mdx b/versioned_docs/version-old/evaluation/faq/manage-datasets.mdx deleted file mode 100644 index 7f02ea2d5..000000000 --- a/versioned_docs/version-old/evaluation/faq/manage-datasets.mdx +++ /dev/null @@ -1,362 +0,0 @@ ---- -sidebar_label: Manage Datasets -sidebar_position: 5 ---- - -import { - CodeTabs, - ShellBlock, - PythonBlock, - TypeScriptBlock, -} from "@site/src/components/InstructionsWithCode"; - -# Managing Datasets - -## In LangSmith - -The easiest way to interact with datasets is directly in the LangSmith app. Here, you can create and edit datasets and example rows. Below are a few ways to interact with them. - -### From Existing Runs - -We typically construct datasets over time by collecting representative examples from debugging or other runs. To do this, we first filter the runs to find the ones we want to add to the dataset. Then, we create a dataset and add the runs as examples. - -You can do this from any 'run' details page by clicking the 'Add to Dataset' button in the top right-hand corner. - -![Add to Dataset](../static/add_to_dataset.png) - -From there, we select the dataset to organize it in and update the ground truth output values if necessary. - -![Modify example](../static/modify_example.png) - -### Upload a CSV - -The easiest way to create a dataset from your own data is by clicking the 'upload a CSV dataset' button on the home page or in the top right-hand corner of the 'Datasets & Testing' page. - -![Upload CSV](../static/create_dataset_csv.png) - -Select a name and description for the dataset, and then confirm that the inferred input and output columns are correct. - -![Confirm Columns](../static/select_columns.png) - -### Exporting datasets to other formats - -You can export your LangSmith dataset to CSV or OpenAI evals format directly from the web application. - -To do so, click "Export Dataset" from the homepage. -To do so, select a dataset, click on "Examples", and then click the "Export Dataset" button at the top of the examples table. - -![Export Dataset Button](../static/export-dataset-button.png) - -This will open a modal where you can select the format you want to export to. - -![Export Dataset Modal](../static/export-dataset-modal.png) - -## How to manage datasets programmatically - -You can create a dataset from existing runs or upload a CSV file (or pandas dataframe in python). - -Once you have a dataset created, you can continue to add new runs to it as examples. We recommend that you organize datasets to target a single "task", usually served by a single chain or LLM. For more discussions on datasets and evaluations, check out the [recommendations](../recommendations). 
- -### Create from list of values - -The most flexible way to make a dataset using the client is by creating examples from a list of inputs and optional outputs. Below is an example. - -Note that you can add arbitrary metadata to each example, such as a note or a source. The metadata is stored as a dictionary. - - - -### Create from existing runs - -To create datasets from existing runs, you can use the same approach. Below is an example: - - - -### Create dataset from CSV - -In this section, we will demonstrate how you can create a dataset by uploading a CSV file. - -First, ensure your CSV file is properly formatted with columns that represent your input and output keys. These keys will be utilized to map your data properly during the upload. You can specify an optional name and description for your dataset. Otherwise, the file name will be used as the dataset name and no description will be provided. - - - -### Create dataset from pandas dataframe - -The python client offers an additional convenience method to upload a dataset from a pandas dataframe. - -```python -from langsmith import Client -import os -import pandas as pd - -os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com" -os.environ["LANGCHAIN_API_KEY"] = "" -client = Client() - -df = pd.read_parquet('path/to/your/myfile.parquet') -input_keys = ['column1', 'column2'] # replace with your input column names -output_keys = ['output1', 'output2'] # replace with your output column names - -dataset = client.upload_dataframe( - df=df, - input_keys=input_keys, - output_keys=output_keys, - name="My Parquet Dataset", - description="Dataset created from a parquet file", - data_type="kv" # The default -) -``` - -## List datasets from the client - -You can programmatically fetch the datasets from LangSmith using the `list_datasets` method in the client. Below are some common examples: - -### Query all datasets - - - -### List datasets by name - -If you want to search by the exact name, you can do the following: - - - -If you want to do a case-invariant substring search, try the following: - - - -### List datasets by type - -You can filter datasets by type. Below is an example querying for chat datasets. - - - -## List Examples from the client - -Once you have a dataset created, you may want to download the examples. You can fetch dataset examples using the `list_examples` method on the LangSmith client. Below are some common calls: - -### List all examples for a dataset - -You can filter by dataset ID: - - - -Or you can filter by dataset name (this must exactly match the dataset name you want to query) - - - -### List examples by id - -You can also list multiple examples all by ID. - - diff --git a/versioned_docs/version-old/evaluation/faq/regression-testing.mdx b/versioned_docs/version-old/evaluation/faq/regression-testing.mdx deleted file mode 100644 index 6b0c750c2..000000000 --- a/versioned_docs/version-old/evaluation/faq/regression-testing.mdx +++ /dev/null @@ -1,40 +0,0 @@ ---- -sidebar_label: Regression Testing -sidebar_position: 3 ---- - -# Regression Testing - -When evaluating LLM applications, it is important to be able to track how your system performs over time. In this guide, we will show you how to use LangSmith's comparison view in -order to track regressions in your application, and drill down to inspect the specific runs that improved/regressed over time. 
- -## Overview - -In the LangSmith comparison view, runs that _regressed_ on your specified feedback key against your baseline experiment will be highlighted in red, while runs that _improved_ -will be highlighted in green. At the top of each column, you can see how many runs in that experiment did better and and how many did worse than your baseline experiment. - -![Regressions](../static/regression_view.png) - -## Baseline Experiment - -In order to track regressions, you need a baseline experiment against which to compare. This will be automatically assigned as the first experiment in your comparison, but you can -change it from the dropdown at the top of the page. - -![Baseline](../static/select_baseline.png) - -## Select Feedback Key - -You will also want to select the feedback key on which you would like focus. This can be selected via another dropdown at the top. Again, one will be assigned by -default, but you can adjust as needed. - -![Feedback](../static/select_feedback.png) - -## Filter to Regressions or Improvements - -Click on the regressions or improvements buttons on the top of each column to filter to the runs that regressed or improved in that specific experiment. - -![Regressions Filter](../static/filter_to_regressions.png) - -## Try it out - -To get started with regression testing, try [running a no-code experiment in our prompt playground](experiments-app) or check out the [Evaluation Quick Start Guide](../quickstart) to get started with the SDK. diff --git a/versioned_docs/version-old/evaluation/faq/synthetic-data.mdx b/versioned_docs/version-old/evaluation/faq/synthetic-data.mdx deleted file mode 100644 index a994d1c62..000000000 --- a/versioned_docs/version-old/evaluation/faq/synthetic-data.mdx +++ /dev/null @@ -1,155 +0,0 @@ ---- -sidebar_label: Synthetic Data for Evaluation -sidebar_position: 9 ---- - -# Synthetic Data for Evaluation - -When prototyping a system, you may not have enough real data to thoroughly evaluate how the component will behave. This guide will walk you through two techniques for generating synthetic data to augment your dataset: - -1. **Paraphrasing Existing Inputs**: Generate semantically similar variations of your existing examples to test your system's consistency. - -2. **Generating New Inputs**: Create entirely new, plausible inputs to test the how your system generalizes to other scenarios. - -:::note Reliability -Synthetic data is not a full substitute for real data. The quality of the data generated by these methods depends on factors like the model, prompt, and existing data. Always inspect synthetic datasets to ensure they capture the information you want to model and align with your use case. -::: - -### Prerequisites - -This guide assumes you've already connected to LangSmith and have a starter dataset. Though we will use LangChain core below, the technique is simple enough to apply using whatever tools you're comfortable with. - -```python -pip install langsmith langchain_openai -``` - -## Paraphrasing - -Paraphrasing existing inputs helps check if your chain's behavior is consistent across similar inputs. Since paraphrasing is a semantically invariant transformation, the outputs should remain the same as the original. Here's how to set it up: - -#### Step 1: Define the Paraphrase Generator - -Create a chain for generating paraphrases using the `ChatOpenAI` model with custom system prompts. 
- -```python -import re -from typing import List - -from langchain_openai import ChatOpenAI -from langchain_core.prompts import ( - ChatPromptTemplate, -) - -paraphrase_llm = ChatOpenAI(temperature=0.5) -prompt_template = ChatPromptTemplate.from_messages( - [ - ("system", - "You are a helpful paraphrasing assistant tasked with rephrasing text." - ), - ("system", "Input: {query}"), - ("user", - "What are {n_paraphrases} different ways you could paraphrase the INPUT text?" - " Do not significantly change the meaning." - " Respond using numbered bullets. If you cannot think of any," - " just say 'I don't know.'" - ), - ] -) - -def parse(output: str) -> List[str]: - return re.findall(r"\d+\.\s+(.*?)\n", output) - -paraphrase_chain = prompt | llm | parse -``` - -#### Step 2: Paraphrase the Dataset - -Use the `Client` from LangSmith to access your dataset and generate paraphrases for it. - -```python -from langsmith import Client - -client = Client() -n_paraphrases = 3 -dataset_name = "Your Dataset Name" - -examples = client.list_examples(dataset_name=dataset_name) -results = paraphrase_chain.batch( - [{"query": next(iter(example.inputs.values())), "n_paraphrases": n_paraphrases} - for example in examples] -) -inputs, outputs = [], [] -for example, batch_r in zip(examples, results): - input_key = next(iter(example.inputs)) - for r in result: - inputs.append({input_key: r}) - outputs.append(example.outputs) -client.create_examples( - inputs=inputs, - outputs=outputs, - dataset_name=dataset_name, -) -``` - -After running, your dataset should be roughly 3x the original size, with paraphrased variations of your original inputs. - -## Generating New Inputs - -To expand your dataset's semantic range and test your system's robustness, you can generate entirely new, plausible inputs. This method examines a random set of 5 existing examples and creates 6 novel ones that align with the inferred system but are distinct enough to have likely originated from different individuals. - -#### Step 1: Define the New Input Generator - -Create a chain for generating new inputs using the `ChatOpenAI` model with custom system prompts. - -```python -input_gen_llm = ChatOpenAI(temperature=0.5) -input_gen_prompt_template = ChatPromptTemplate.from_messages( - [ - ("system", - # Update this prompt to more closely match your use case - "You are a creative assistant tasked with coming up with new inputs for an application." - "\nThe following are some examples you can use to understand the domain:\n\n{examples}" - ), - ("user", - "Can you generate {n_inputs} unique and plausible inputs that could be asked by different users?" - ), - ] -) - -input_gen_chain = prompt | llm | parse -``` - -#### Step 2: Generate New Inputs for the Dataset - -Use the Client from LangSmith to access your dataset, sample a set of existing inputs, and generate new inputs based on them. Note that new inputs don't come with corresponding outputs, so you may need to manually label them or use a separate model to generate the outputs. 
- -```python -import random - -client = Client() -n_inputs = 6 -dataset_name = "Your Dataset Name" -sample_size = 5 - -examples = list(client.list_examples(dataset_name=dataset_name)) -example_inputs = [next(iter(example.inputs.values())) for example in random.sample(examples, sample_size)] -example_inputs_str = '\n'.join(f"- {input}" for input in example_inputs) -results = input_gen_chain.batch( - [{"examples": example_inputs_str, "n_inputs": n_inputs}] -) -inputs = [{"input": r} for r in results[0]] -outputs = [{}] * len(inputs) -client.create_examples( - inputs=inputs, - outputs=outputs, - dataset_name=dataset_name, -) -``` - -After running, your dataset should contain new examples that differ more significantly from the original ones, helping you test your system's robustness to a wider range of inputs. - -### Considerations - -Remember that the quality of the paraphrases and generated inputs will depend on the model and prompt used, and these approaches may not be appropriate for all use cases. Always check your augmented data to ensure it maintains the original meaning, aligns with the system's context, and is suitable for your application. - -Synthetic data is most useful early in the development process, when you're trying to gauge how sensitive your chain or model is to input variations. By combining paraphrasing, new input generation, and other augmentation methods, you can expand and diversify your dataset to verify the feasibility and robustness of a feature before deploying it to production. diff --git a/versioned_docs/version-old/evaluation/faq/unit-testing.mdx b/versioned_docs/version-old/evaluation/faq/unit-testing.mdx deleted file mode 100644 index 3bec87a10..000000000 --- a/versioned_docs/version-old/evaluation/faq/unit-testing.mdx +++ /dev/null @@ -1,324 +0,0 @@ ---- -sidebar_label: Unit Test -sidebar_position: 4 ---- - -# Unit Tests - -LangSmith unit tests are assertions and expectations designed to **quickly** identify obvious bugs and regressions in your AI system. Relative to evaluations, tests are designed to be **fast** and **cheap** to run, focusing on **specific** functionality and edge cases. -We recommend using LangSmith to track any unit tests that touch an LLM or other non-deterministic part of your AI system. - -:::note -`@unit` currently requires `langsmith` python version `>=0.1.42`. If you are interested in unit testing functionality in TypeScript or other languages, please let us know at [support@langchain.dev](mailto:support@langchain.dev). -::: - -## Write @unit test - -To write a LangSmith unit test, decorate your test function with `@unit`. -If you want to track the full nested trace of the system or component being tested, you can mark those functions with `@traceable`. For example: - -```python -# my_app/main.py -from langsmith import traceable -@traceable # Optional -def generate_sql(user_query): - # Replace with your SQL generation logic - # e.g., my_llm(my_prompt.format(user_query)) - return "SELECT * FROM customers" -``` - -Then define your unit test: - -```python tests/test_my_app.py -# tests/test_my_app.py -from langsmith import unit -from my_app.main import generate_sql - -@unit -def test_sql_generation_select_all(): - user_query = "Get all users from the customers table" - sql = generate_sql(user_query) - # LangSmith logs any exception raised by `assert` / `pytest.fail` / `raise` / etc. 
- # as a test failure - # highlight-next-line - assert sql == "SELECT * FROM customers" -``` - -## Run tests - -You can use a standard unit testing framework such as `pytest` ([docs](https://docs.pytest.org/en/7.1.x/contents.html)) to run. For example: - -```bash -pytest tests/ -``` - -Each time you run this test suite, LangSmith collects the pass/fail rate and other traces as a new `TestSuiteResult`, logging the `pass` rate (1 for pass, 0 for fail) over all the applicable tests. - -The test suite syncs to a corresponding dataset named after your package or github repository. - -![Unit Test Example](../static/unit-test-suite.png) - -## Going Further - -`@unit` is designed to stay out of your way and works well with familiar `pytest` features. For example: - -#### Defining inputs as fixtures - -Pytest fixtures let you define functions that serve as reusable inputs for your tests. LangSmith automatically syncs any test case inputs defined as fixtures. For example: - -```python -import pytest - -@pytest.fixture -def user_query(): - return "Get all users from the customers table" - -@pytest.fixture -def expected_sql(): - return "SELECT * FROM customers" - -# output_keys indicate which test arguments to save as 'outputs' in the dataset (Optional) -# Otherwise, all arguments are saved as 'inputs' -@unit(output_keys=["expected_sql"]) -def test_sql_generation_with_fixture(user_query, expected_sql): - sql = generate_sql(user_query) - assert sql == expected_sql -``` - -#### Parametrizing tests - -Parametrizing tests lets you run the same assertions across multiple sets of inputs. Use `pytest`'s `parametrize` decorator to achieve this. For example: - -```python -@unit -@pytest.mark.parametrize( - "user_query, expected_sql", - [ - ("Get all users from the customers table", "SELECT * FROM customers"), - ("Get all users from the orders table", "SELECT * FROM orders"), - ], -) -def test_sql_generation_parametrized(user_query, expected_sql): - sql = generate_sql(user_query) - assert sql == expected_sql -``` - -**Note:** as the parametrized list grows, you may consider using `evaluate()` instead. This parallelizes the evaluation and makes it easier to control individual experiments and the corresponding dataset. - -#### Expectations - -LangSmith provides an `expect` utility to help define expectations about your LLM output. For example: - -```python -from langsmith import expect - -@unit -def test_sql_generation_select_all(): - user_query = "Get all users from the customers table" - sql = generate_sql(user_query) - expect(sql).to_contain("customers") -``` - -This will log the binary "expectation" score to the experiment results, additionally `assert`ing that the expectation is met possibly triggering a test failure. - -`expect` also provides "fuzzy match" methods. 
For example: - -```python -@unit -@pytest.mark.parametrize( - "query, expectation", - [ - ("what's the capital of France?", "Paris"), - ], -) -def test_embedding_similarity(query, expectation): - prediction = my_chatbot(query) - expect.embedding_distance( - # This step logs the distance as feedback for this run - prediction=prediction, reference=expectation - # Adding a matcher (in this case, 'to_be_*"), logs 'expectation' feedback - ).to_be_less_than(0.5) # Optional predicate to assert against - expect.edit_distance( - # This computes the normalized Damerau-Levenshtein distance between the two strings - prediction=prediction, reference=expectation - # If no predicate is provided below, 'assert' isn't called, but the score is still logged - ) -``` - -This test case will be assigned 4 scores: - -1. The `embedding_distance` between the prediction and the expectation -2. The binary `expectation` score (1 if cosine distance is less than 0.5, 0 if not) -3. The `edit_distance` between the prediction and the expectation -4. The overall test pass/fail score (binary) - -The `expect` utility is modeled off of [Jest](https://jestjs.io/docs/expect)'s expect API, with some off-the-shelf functionality to make it easier to grade your LLMs. - -#### Dry-run mode - -If you want to run the tests without syncing the results to LangSmith, you can set `LANGCHAIN_TEST_TRACKING=false` in your environment. - -```bash -LANGCHAIN_TEST_TRACKING=false pytest tests/ -``` - -The tests will run as normal, but the experiment logs will not be sent to LangSmith. - -#### Caching - -LLMs on every commit in CI can get expensive. To save time and resources, LangSmith lets you cache results to disk. Any identical inputs will be loaded from the cache so you don't have to call out to your LLM provider unless there are changes to the model, prompt, or retrieved data. - -To enable caching, run with `LANGCHAIN_TEST_CACHE=/my/cache/path`. For example: - -```bash -LANGCHAIN_TEST_CACHE=tests/cassettes pytest tests/my_llm_tests -``` - -All requests will be cached to `tests/cassettes` and loaded from there on subsequent runs. If you check this in to your repository, your CI will be able to use the cache as well. - -#### Using `watch` mode - -With caching enabled, you can iterate quickly on your tests using `watch` mode without worrying about unnecessarily hitting your LLM provider. For example, using [`pytest-watch`](https://pypi.org/project/pytest-watch/): - -```bash -pip install pytest-watch -LANGCHAIN_TEST_CACHE=tests/cassettes ptw tests/my_llm_tests -``` - -## Explanations - -The `@unit` test decorator converts any unit test into a parametrized LangSmith example. By default, all unit tests within a given file will be grouped as a single "test suite" with a corresponding dataset. 
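If you want tests from several files to roll up into one suite, or simply want a more descriptive suite name, you can pass `test_suite_name` to the decorator (see the `unit` API reference below). A minimal sketch, reusing the `generate_sql` function from earlier with an illustrative suite name:

```python
from langsmith import unit

from my_app.main import generate_sql

# "SQL generation suite" is an assumed name chosen for illustration; use any name you like.
@unit(test_suite_name="SQL generation suite")
def test_sql_generation_names_table():
    sql = generate_sql("Get all users from the customers table")
    # The suite name only changes where results are grouped; assertions work as usual.
    assert "customers" in sql
```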
- -The following metrics are available off-the-shelf: - -| Feedback | Description | Example | -| -------------------- | ----------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------- | -| `pass` | Binary pass/fail score, 1 for pass, 0 for fail | `assert False` # Fails | -| `expectation` | Binary expectation score, 1 if expectation is met, 0 if not | `expect(prediction).against(lambda x: re.search(r"\b[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}\b", x)` ) | -| `embedding_distance` | Cosine distance between two embeddings | expect.embedding_distance(prediction=prediction, reference=expectation) | -| `edit_distance` | Edit distance between two strings | expect.edit_distance(prediction=prediction, reference=expectation) | - -You can also log any arbitrary feeback within a unit test manually using the `client`. - -```python -from langsmith import unit, Client -from langsmith.run_helpers import get_current_run_tree - -client = Client() - -@unit -def test_foo(): - run_tree = get_current_run_tree() - client.create_feedback(run_id=run_tree.id, key="my_custom_feedback", score=1) -``` - -## Reference - -### `expect` - -`expect` makes it easy to make approximate assertions on test results and log scores to LangSmith. -Off-the-shelf, it allows you to compute and compare embedding distances, edit distances, and make custom assertions on values. - -#### `expect.embedding_distance(prediction, reference, *, config=None)` - -Compute the embedding distance between the prediction and reference. - -This logs the embedding distance to LangSmith and returns a [`Matcher`](#matcher) instance for making assertions on the distance value. - -By default, this uses the OpenAI API for computing embeddings. - -**Parameters** - -- `prediction` (str): The predicted string to compare. -- `reference` (str): The reference string to compare against. -- `config` (Optional[EmbeddingConfig]): Optional configuration for the embedding distance evaluator. Supported options: - - `encoder`: A custom encoder function to encode the list of input strings to embeddings. Defaults to the OpenAI API. - - `metric`: The distance metric to use for comparison. Supported values: "cosine", "euclidean", "manhattan", "chebyshev", "hamming". - -**Returns** - -A [`Matcher`](#matcher) instance for the embedding distance value. - -#### `expect.edit_distance(prediction, reference, *, config=None)` - -Compute the string distance between the prediction and reference. - -This logs the string distance (Damerau-Levenshtein) to LangSmith and returns a [`Matcher`](#matcher) instance for making assertions on the distance value. - -This depends on the `rapidfuzz` package for string distance computation. - -**Parameters** - -- `prediction` (str): The predicted string to compare. -- `reference` (str): The reference string to compare against. -- `config` (Optional[EditDistanceConfig]): Optional configuration for the string distance evaluator. Supported options: - - `metric`: The distance metric to use for comparison. Supported values: "damerau_levenshtein", "levenshtein", "jaro", "jaro_winkler", "hamming", "indel". - - `normalize_score`: Whether to normalize the score between 0 and 1. - -**Returns** - -A [`Matcher`](#matcher) instance for the string distance value. - -#### `expect.value(value)` - -Create a [`Matcher`](#matcher) instance for making assertions on the given value. 
- -**Parameters** - -- `value` (Any): The value to make assertions on. - -**Returns** - -A [`Matcher`](#matcher) instance for the given value. - -#### `Matcher` - -A class for making assertions on expectation values. - -**`to_be_less_than(value)`** - -Assert that the expectation value is less than the given value. - -**`to_be_greater_than(value)` ** - -Assert that the expectation value is greater than the given value. - -**`to_be_between(min_value, max_value)`** - -Assert that the expectation value is between the given min and max values. - -**`to_be_approximately(value, precision=2)`** - -Assert that the expectation value is approximately equal to the given value. - -**`to_equal(value)`** - -Assert that the expectation value equals the given value. - -**`to_contain(value)`** - -Assert that the expectation value contains the given value. - -**`against(func)`** - -Assert the expectation value against a custom function. - -### `unit` API - -The `unit` decorator is used to mark a function as a test case for LangSmith. It ensures that the necessary example data is created and associated with the test function. The decorated function will be executed as a test case, and the results will be recorded and reported by LangSmith. - -#### `@unit(id=None, output_keys=None, client=None, test_suite_name=None)` - -Create a unit test case in LangSmith. - -**Parameters** - -- `id` (Optional[uuid.UUID]): A unique identifier for the test case. If not provided, an ID will be generated based on the test function's module and name. -- `output_keys` (Optional[Sequence[str]]): A list of keys to be considered as the output keys for the test case. These keys will be extracted from the test function's inputs and stored as the expected outputs. -- `client` (Optional[ls_client.Client]): An instance of the LangSmith client to be used for communication with the LangSmith service. If not provided, a default client will be used. -- `test_suite_name` (Optional[str]): The name of the test suite to which the test case belongs. If not provided, the test suite name will be determined based on the environment or the package name. - -**Environment Variables** - -- `LANGSMITH_TEST_CACHE`: If set, API calls will be cached to disk to save time and costs during testing. Recommended to commit the cache files to your repository for faster CI/CD runs. Requires the 'langsmith[vcr]' package to be installed. -- `LANGSMITH_TEST_TRACKING`: Set this variable to the path of a directory to enable caching of test results. This is useful for re-running tests without re-executing the code. Requires the 'langsmith[vcr]' package. diff --git a/versioned_docs/version-old/evaluation/faq/version-datasets.mdx b/versioned_docs/version-old/evaluation/faq/version-datasets.mdx deleted file mode 100644 index a2558e242..000000000 --- a/versioned_docs/version-old/evaluation/faq/version-datasets.mdx +++ /dev/null @@ -1,66 +0,0 @@ ---- -sidebar_label: Version Datasets -sidebar_position: 6 ---- - -# How to version datasets - -## Basics - -Any time you _add_, _update_, or _delete_ examples in your dataset, a new version of your dataset is created. This allows you to track changes to your dataset over time and to understand how your dataset has evolved. - -By default, the version is defined by the timestamp of the change. When you click on a particular version of a dataset (by timestamp) in the "Examples" tab, you can see the state of the dataset at that point in time. 
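The screenshot below shows this view in the UI. If you prefer to inspect a past version programmatically, one approach (a sketch; the dataset name and timestamp are placeholders) is to pass a timestamp to the `as_of` parameter of `list_examples`:

```python
from datetime import datetime

from langsmith import Client

client = Client()

# Placeholder dataset name and timestamp for illustration.
as_of_time = datetime(2024, 1, 1, 0, 0, 0)
examples_at_version = list(
    client.list_examples(dataset_name="my_dataset", as_of=as_of_time)
)
print(f"{len(examples_at_version)} examples as of {as_of_time}")
```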
- -![Version Datasets](../static/version_dataset.png) - -Note that examples are read-only when viewing a past version of the dataset. You will also see the operations that were applied between this version of the dataset and the "latest" version of the dataset. Also, by default the **latest version of the dataset is shown in the "Examples" tab** and experiments from **all versions are shown in the "Tests" tab**. - -In the "Tests" tab, you can see the results of tests run on the dataset at different versions. - -![Version Datasets](../static/version_dataset_tests.png) - -## Tagging versions - -You can also tag versions of your dataset to give them a more human-readable name. This can be useful for marking important milestones in your dataset's history. - -For example, you might tag a version of your dataset as "prod" and use it to run tests against your LLM pipeline. - -Tagging can be done in the UI by clicking on "+ Tag this version" in the "Examples" tab. - -![Tagging Datasets](../static/tag_this_version.png) - -You can also tag versions of your dataset using the SDK. Here's an example of how to tag a version of a dataset using the Python SDK: - -```python -from langsmith import Client -from datetime import datetime - -client = Client() - -initial_time = datetime(2024, 1, 1, 0, 0, 0) # The timestamp of the version you want to tag - -# You can tag a specific dataset version with a semantic name, like "prod" -client.update_dataset_tag( -    dataset_name="my_dataset", as_of=initial_time, tag="prod" -) -``` - -## Running experiments on specific versions of datasets - -You can execute an experiment on a specific version of a dataset in the SDK by using the `as_of` parameter in `list_examples`, which accepts either a timestamp or a version tag. - -Here is an example of how to run an experiment on a specific version of a dataset using the Python SDK: - -```python -from langsmith.evaluation import evaluate -from langsmith import Client - -client = Client() - -result = evaluate( -    lambda inputs: label_query(**inputs), # Your target to evaluate, [defined elsewhere] -    data=client.list_examples(dataset_name="my_dataset", as_of="prod"), -    evaluators=[correct_label], # Your evaluators, [defined elsewhere] -    experiment_prefix="dataset versioning example", -) -``` diff --git a/versioned_docs/version-old/evaluation/index.mdx b/versioned_docs/version-old/evaluation/index.mdx deleted file mode 100644 index 77985cc02..000000000 --- a/versioned_docs/version-old/evaluation/index.mdx +++ /dev/null @@ -1,142 +0,0 @@ ---- -sidebar_label: Overview -sidebar_position: 0 ---- - -import ThemedImage from "@theme/ThemedImage"; - -# Evaluation Overview - -## What are evaluations? - -Evaluations allow you to understand the performance of your LLM application over time. At its core, an evaluation is a function that takes in a set of inputs and outputs from your chain, agent, or model, -and returns a score (or multiple scores). This score may be based on comparing the outputs with reference outputs (e.g. with string matching or using an LLM as a judge). -However, there are also evaluators that don't require a reference output - for example, one that checks if the output is valid JSON, which is a common requirement in LLM applications. -LangSmith allows you to run evaluations on your application via `Datasets`, which are made up of `Examples`. - -## Components of an evaluation pipeline - -The following diagram outlines the building blocks for evaluations in LangSmith. 
`Datasets` define the inputs over which you run your chain, model, or agent (the `Task`), -and optionally the reference outputs against which your evaluator will compare the outputs of your `Task`. These datasets can be from any number of sources - -you might manually curate them, collect them from user input/feedback, or generate them via LLM. Your `Evaluator` can be any arbitrary function which returns a score -based on the inputs and outputs of your `Task`, and the reference output if desired. You can also use [one of LangSmith's off-the-shelf -evaluators](evaluation/faq/evaluator-implementations) to get started quickly! - - - -## Datasets - -`Datasets` are collections of `Examples`, the core building block for the evaluation workflow in LangSmith. -Examples provide the inputs over which you will be running your pipeline, -and, if applicable, the outputs that you will be comparing against. -All examples in a given dataset should follow the same schema. Examples contain an `"inputs"` dict and an `"output"` dict, -along with (optionally) a `metadata` dict. - -![Example](static/sample_langsmith_example.png) - -
- An Example in the LangSmith UI -
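As a rough sketch (with made-up values), a single example might look like this when represented as plain dictionaries, following the dataset schema of an `inputs` dict, an `outputs` dict, and optional `metadata`:

```python
# Hypothetical example record; the question, answer, and metadata values are placeholders.
example = {
    "inputs": {"question": "What is LangSmith?"},
    "outputs": {"answer": "A platform for tracing, testing, and evaluating LLM applications."},
    "metadata": {"source": "manually curated"},  # optional
}
```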
- -A single run of all your example inputs through your `Task` is called an `Experiment`. In LangSmith, you can easily view all the experiments that are associated -with your dataset, and track your application's performance over time! - -![Dataset](static/sample_langsmith_dataset.png) - -
- A Dataset in the LangSmith UI -
- -### Creating Datasets - -Datasets in LangSmith can be created in two main ways: - -- [In the LangSmith SDK](evaluation/faq/manage-datasets#how-to-manage-datasets-programmatically) with `create_dataset`. -- [In the LangSmith UI](evaluation/faq/manage-datasets) by clicking "New Dataset" from the [LangSmith datasets page](https://smith.langchain.com/datasets). These can - be uploaded as a CSV, or you can manually create examples in the UI. - -### Types of Datasets - -Dataset types communicate common input and output schemas. There are three types of datasets in LangSmith: `kv`, `llm`, and `chat`. `kv` datasets are the default type, and are -sufficient for almost all use-cases. `llm` and `chat` datasets can be useful to conveniently export datasets into known fine-tuning formats. - -- `kv`: In `kv` datasets, inputs and outputs can be arbitrary key-value pairs. These are useful when evaluating chains and agents that require - multiple inputs or that return multiple outputs. - The tradeoff with these datasets is that running evaluations on them can be a bit more involved. If there are multiple keys, you will have to manually specify the `prepare_data` - function in any off-the-shelf evaluators so they know what information to consider in generating a score. -- `llm`: `llm` datasets correspond to the string inputs and outputs from the "completion" style LLMS (string in, string out). The `"inputs"` dictionary contains a single `"input"` key mapped to a single prompt string. Similarly, the `"outputs"` dictionary contains a single `"output"` key mapped to a single response string. -- `chat`: `chat` datasets correspond to messages and generations from LLMs that expect structured "chat" messages as inputs and outputs. Each example row - expects an `"inputs"` dictionary containing a single `"input"` key mapped to a list of serialized chat messages. The `"outputs"` dictionary contains a single `"output"` key mapped to a single list of serialized chat messages. - -## Evaluators - -Evaluators are functions that help score how well your system did on a particular example. When running an evaluation, -your example inputs are run through your `Task` to produce `Runs`, which are then passed into your evaluator along with the `Example`. -The function then returns an `EvaluationResult`, which specifies your metric name and score. Evaluations in LangSmith are run via the `evaluate()` function. -The following diagram gives an overview of the data flow in an evaluation: - - -
- -The inputs to an evaluator consist of: - -1. An `Example` - the inputs for your pipeline and optionally the reference outputs or labels -2. A `Run` - observed output gathered from running the inputs through the `Task` - -An evaluator will then return an `EvaluationResult` (or similarly shaped dictionary), which is made up of: - -- `key`: The name the metric being evaluated -- `score`: The value of the metric on this example -- `comment`: the reasoning trajectory or other string information motivating the score - -### Types of Evaluators - -The evaluator itself can be any arbitrary function. There are a few different types of evaluators that are commonly used: - -- **Heuristics**: A heuristic evaluator is a hard-coded function that does some computation to determine a score. For example, you might write an - evaluator that checks whether the output of the system is an empty string, or determines if it is valid JSON. These would be considered **reference-free** evaluators, - as they don't consider any example output when making their decision. You might also want to check that the output of the system matches the reference output exactly, - which would be considered a **ground truth** evaluator because it compares the output to a reference. See [How to create custom evaluators](evaluation/faq/custom-evaluators). -- **LLM-as-judge**: An LLM-as-judge evaluator uses an LLM to score system output. For example, you might want to check whether your system is outputting - offensive content. This is **reference-free**, as there is no comparison to an example output. You might also want to check whether the system output has the same - meaning as the example output, which would be a **ground truth** evaluator. To get started with LLM-as-a-judge, try out LangSmith's [off-the-shelf evaluators](evaluation/faq/evaluator-implementations)! -- **Human**: You can also evaluate your runs manually. This can be done in LangSmith [via the SDK](tracing/faq/logging_feedback#capturing-feedback-programmatically), - or [in the LangSmith UI](tracing/faq/logging_feedback#annotating-traces-with-feedback). - -## Next steps - -To get started with code, check out the [Quick Start Guide](evaluation/quickstart). - -If you want to learn how to accomplish a particular task, check out our comprehensive [How-To Guides](evaluation/faq) - -For a higher-level set of recommendations on how to think about testing and evaluating your LLM app, check out the [evaluation recommendations](evaluation/recommendations) page. diff --git a/versioned_docs/version-old/evaluation/migration.mdx b/versioned_docs/version-old/evaluation/migration.mdx deleted file mode 100644 index 9aac37fce..000000000 --- a/versioned_docs/version-old/evaluation/migration.mdx +++ /dev/null @@ -1,199 +0,0 @@ ---- -sidebar_label: Migrating to `evaluate` -sidebar_position: 7 ---- - -# Migrating from `run_on_dataset` to `evaluate` - -In python, we've introduced a cleaner `evaluate()` function to replace the `run_on_dataset` function. While we are not deprecating the `run_on_dataset` function, the new function lets you get started and without needing to install `langchain` in your local environment. - -This guide will walk you through the process of migrating your existing code from using `run_on_dataset` to leveraging the benefits of `evaluate`. - -## Key Differences - -#### 1. `llm_or_chain_factory` -> first positional argument - -The "thing you are evaluating" (pipeline, target, model, chain, agent, etc.) 
is **always** the first positional argument and **always** has the following signature: - -```python -def predict(inputs: dict) -> dict: -    """Call your model or pipeline with the given inputs and return the predictions.""" -    # Example: -    # result = client.chat.completions.create(...) -    # response = result.choices[0].message.content -    return {"output": ...} -``` - -No need to specify the confusing "`llm_or_chain_factory`". If you need to create a new version of your object for each data point, initialize it within the `predict()` function. -If you want to evaluate a LangChain object (runnable, etc.), you can directly call `evaluate(chain.invoke, data=..., ...)`. - -#### 2. `dataset_name` -> `data` - -The data field accepts a broader range of inputs, including the dataset name, id, or an iterator over examples. This lets you easily evaluate over a subset of the data to quickly debug. - -If you were previously specifying a `dataset_version`, you can directly pass the target version like so: - -```python -dataset_version = "latest" # your tagged version - -results = evaluate( -    ..., -    data=client.list_examples(dataset_name="my_dataset", as_of=dataset_version), -    ... -) - -``` - -#### 3. `RunEvalConfig` -> `List[RunEvaluator]` - -The config has been deprecated (removing the LangChain dependency). Instead, directly provide a list of evaluators to the `evaluators` argument. - - a. Custom evaluators are simple functions that take a `Run` and an `Example` and return a dictionary with the evaluation results. For example: - -```python -def exact_match(run: Run, example: Example) -> dict: -    """Calculate the exact match score of the run.""" -    expected = example.outputs["answer"] -    predicted = run.outputs["output"] -    return {"score": expected.lower() == predicted.lower(), "key": "exact_match"} - -evaluate( -    ..., -    evaluators=[exact_match], - -) -``` - -Anything that subclasses `RunEvaluator` still works as it did before; we will automatically promote compatible functions to `RunEvaluator` instances. - - b. `LangChain` evaluators can be incorporated using the `LangChainStringEvaluator` wrapper. - -For example, if you were previously using the "Criteria" evaluator, this evaluation: - -```python -eval_config = RunEvalConfig( -    evaluators=[RunEvalConfig.Criteria( -        criteria={"usefulness": "The prediction is useful if..."}, -        llm=my_eval_llm, -    )] -) - -client.run_on_dataset(..., eval_config=eval_config) -``` - -becomes: - -```python -from langsmith.evaluation import LangChainStringEvaluator - -evaluators=[ -    LangChainStringEvaluator( -        "labeled_criteria", -        config={ -            "criteria": { -                "usefulness": "The prediction is useful if...", -            }, -            "llm": my_eval_llm, -        }, -    ), -] -``` - -c. For evaluating multi-key datasets using off-the-shelf LangChain evaluators, replace any `input_key`, `reference_key`, `prediction_key` with a custom `prepare_data` function. - -If your dataset has a single key for the inputs and reference answer, and if your target pipeline returns a response in a single key, the evaluators can automatically use these responses directly without any additional configuration. - -For multi-key datasets, you must specify which values to use for the model prediction (and optionally for the expected answer and/or inputs). This is done by providing a `prepare_data` function that converts a run and example to a dictionary of `{"input": ..., "prediction": ..., "reference": ...}`. 
- -```python -def prepare_data(run: Run, example: Example) -> dict: -    # Run is the trace of your pipeline -    # Example is a dataset record -    return { -        "prediction": run.outputs["output"], -        "input": example.inputs["input"], -        "reference": example.outputs["answer"], -    } - -qa_evaluator = LangChainStringEvaluator( -    "qa", -    prepare_data=prepare_data, -    config={"llm": my_qa_llm}, -) -``` - -#### 4. `batch_evaluators` -> `summary_evaluators`. - -These let you compute custom metrics over the whole dataset. For example, precision: - -```python -def precision(runs: List[Run], examples: List[Example]) -> dict: -    """Calculate the precision of the runs.""" -    expected = [example.outputs["answer"] for example in examples] -    predictions = [run.outputs["output"] for run in runs] -    tp = sum([p == e for p, e in zip(predictions, expected) if p == "yes"]) -    fp = sum([p == "yes" and e == "no" for p, e in zip(predictions, expected)]) -    return {"score": tp / (tp + fp), "key": "precision"} -``` - -#### 5. `project_metadata` -> `metadata`. - -#### 6. `project_name` -> `experiment_prefix`. - -`evaluate()` always appends an experiment uuid to the prefix to ensure uniqueness, so you don't have to -run into those confusing "project already exists" errors. - -#### 7. `concurrency_level` -> `max_concurrency`. - -## Migration Steps - -#### 1. Update your imports: - -```python -from langsmith.evaluation import evaluate -``` - -#### 2. Change your `run_on_dataset` call to `evaluate`: - -```python -results = evaluate( -    ..., -    data=..., -    evaluators=[...], -    summary_evaluators=[...], -    metadata=..., -    experiment_prefix=..., -    max_concurrency=..., -) -``` - -#### 3. If you were using a factory function, replace it with a direct invocation: - -```python -def predict(inputs: dict): -    my_pipeline = ... -    return my_pipeline.invoke(inputs) -``` - -#### 4. If you were using LangChain evaluators, wrap them with `LangChainStringEvaluator`: - -```python -from langsmith.evaluation import LangChainStringEvaluator - -evaluators=[ -    LangChainStringEvaluator("embedding_distance"), -    LangChainStringEvaluator( -        "labeled_criteria", -        config={"criteria": {"usefulness": "The prediction is useful if..."}}, -        prepare_data=prepare_criteria_data -    ), -] -``` - -#### 5. Update any references to `project_metadata`, `project_name`, `dataset_version`, and `concurrency_level` to use the new argument names. - -## Support - -If you encounter any issues during the migration process or have further questions, please don't hesitate to reach out to our support team at [support@langchain.dev](mailto:support@langchain.dev). We're here to help ensure a smooth transition! - -Happy evaluating! diff --git a/versioned_docs/version-old/evaluation/quickstart.mdx b/versioned_docs/version-old/evaluation/quickstart.mdx deleted file mode 100644 index c2836ba58..000000000 --- a/versioned_docs/version-old/evaluation/quickstart.mdx +++ /dev/null @@ -1,193 +0,0 @@ ---- -sidebar_label: Quick Start -sidebar_position: 1 ---- - -import Tabs from "@theme/Tabs"; -import CodeBlock from "@theme/CodeBlock"; -import { -  CodeTabs, -  PythonBlock, -  TypeScriptBlock, -} from "@site/src/components/InstructionsWithCode"; -import { ClientInstallationCodeTabs } from "@site/src/components/ClientInstallation"; - -# Evaluation Quick Start - -This guide helps you get started evaluating your AI system using LangSmith, so you can deploy the best performing model for your needs. It covers the basics. - -## 1. Install LangSmith - - - -## 2. 
Evaluate - -Evalution requires a system to test, [data](faq) to serve as test cases, and optionally evaluators to grade the results. - - dict: - messages = [{"role": "user", "content": inputs["question"]}] - response = openai_client.chat.completions.create(messages=messages, model="gpt-3.5-turbo") - return {"output": response}\n -# Define evaluators -def must_mention(run: Run, example: Example) -> dict: - prediction = run.outputs.get("output") or "" - required = example.outputs.get("must_mention") or [] - score = all(phrase in prediction for phrase in required) - return {"key":"must_mention", "score": score}\n -experiment_results = evaluate( - predict, # Your AI system - data=dataset_name, # The data to predict and grade over - evaluators=[must_mention], # The evaluators to score the results - experiment_prefix="rap-generator", # A prefix for your experiment names to easily identify them - metadata={ - "version": "1.0.0", - }, -)`, - }, - { - value: "typescript", - label: "TypeScript", - language: "typescript", - content: `import { Client } from "langsmith"; -import { Run, Example } from "langsmith"; -import { EvaluationResult } from "langsmith/evaluation"; -// Note: native evaluate() function support coming soon to the LangSmith TS SDK -import { runOnDataset } from "langchain/smith"; -import OpenAI from "openai";\n -const client = new Client(); -// Define dataset: these are your test cases -const datasetName = "Rap Battle Dataset"; -const dataset = await client.createDataset(datasetName, { - description: "Rap battle prompts.", -}); -await client.createExamples({ - inputs: [ - {question: "a rap battle between Atticus Finch and Cicero"}, - {question: "a rap battle between Barbie and Oppenheimer"}, - ], - outputs: [ - {must_mention: ["lawyer", "justice"]}, - {must_mention: ["plastic", "nuclear"]}, - ], - datasetId: dataset.id, -});\n -// Define AI system -const openaiClient = new OpenAI();\n -async function predictResult({ question }: { question: string }) { - const messages = [{ "role": "user", "content": question }]; - const output = await openaiClient.chat.completions.create({ - model: "gpt-3.5-turbo", - messages: messages - }); - return { output }; -}\n -// Define evaluators -const mustMention = async ({ run, example }: { run: Run; example?: Example; }): Promise => { - const mustMention: string[] = example?.outputs?.must_contain ?? []; - const score = mustMention.every((phrase) => - run?.outputs?.output.includes(phrase) - ); - return { - key: "must_mention", - score: score, - }; -};\n -await runOnDataset( - predictResult, // Your AI system - datasetName, // The data to predict and grade over - { - evaluationConfig: {customEvaluators: [mustMention] - }, - projectMetadata: { - version: "1.0.0", - }, -});`, - }, - ]} - groupId="client-language" -/> - -Configure your API key, then run the script to evaluate your system. - -`, - }, - { - value: "typescript", - label: "TypeScript", - language: "bash", - content: `export LANGCHAIN_API_KEY=`, - }, - ]} - groupId="client-language" -/> - -## 3. Review Results - -The evaluation results will be streamed to a new experiment linked to your "Rap Battle Dataset". You can view the results by clicking on the link printed by the `evaluate` function or by navigating to the [Datasets & Testing](https://smith.langchain.com/datasets) page, clicking "Rap Battle Dataset", and viewing the latest test run. - -There, you can inspect the traces and feedback generated from the evaluation configuration. 
- -![Eval test run screenshot](static/eval-test-run.png) - -You can click "Open Run" to view the trace and feedback generated for that example. - -![Eval trace screenshot](static/eval-run-trace.png) - -To compare to another test on this dataset, you can click "Compare Tests". - -![Compare Tests](static/compare-tests.png) - -## More on evaluation - -Congratulations! You've now created a dataset and used it to evaluate your agent or LLM. -To learn how to make your own custom evaluators, review the [Custom Evaluator](faq) guide. To learn more about some pre-built evaluators available in the LangChain open-source library, check out the [LangChain Evaluators](faq/evaluator-implementations) guide. diff --git a/versioned_docs/version-old/evaluation/recommendations.mdx b/versioned_docs/version-old/evaluation/recommendations.mdx deleted file mode 100644 index d6cc0b293..000000000 --- a/versioned_docs/version-old/evaluation/recommendations.mdx +++ /dev/null @@ -1,101 +0,0 @@ ---- -sidebar_label: Recommendations -sidebar_position: 6 ---- - -# Recommendations - -This conceptual guide shares thoughts on how to use testing and evaluations for your LLM applications. There is no one-size-fits-all solution, but we believe the most successful teams will adapt strategies from design, software development, and machine learning to their use cases to deliver better, more reliable results. - -### Test early and often - -While "unit tests" don't truly exist for the model, writing "minimum functionality tests" for each chain during the debugging and prototyping stage will help you scaffold more reliable systems. - -Datasets facilitate this. Using debugging projects, you can log runs while prototyping. From these runs, you can select representative samples to add to a "Functionality Test Dataset" for that component. Evaluators can be run in CI to ensure that individual chains (or other components) still perform as desired for known use cases. - -- Completing a specific structured schema -- Selecting the correct tool for a given question -- Extracting the correct information from a passage -- Generating valid code for a typical input. -- Avoiding unwanted behavior when given leading inputs (e.g., appropriate polite refusals, adversarial inputs, etc.) - -These datasets can range from anywhere between 10-100+ examples and will continue to grow as you capture more example traces and add them as known tests. - -### Create domain-specific evaluators - -LangChain has strong and configurable built-in evaluators for common tasks, and everyone will benefit from your [contributions to these evaluators](https://github.com/langchain-ai/langchain/tree/master/libs/langchain/langchain/evaluation). However, often the best evaluation metrics are domain-specific. Some examples include: - -- Evaluate the validity and efficiency of domain-specific code -- Applying custom rules to check the response output against a proprietary system -- Asserting numerical values are within a certain range - -### Use labels where possible - -When adding examples to a dataset, you can improve the output to represent a "gold standard" label. Evaluators that compare outputs against labels generally are much more reliable than those that have to operate "reference-free." - -Once you have deployed an application, capture and filter user feedback (even in testing deployments) to help improve the signal. 
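As a concrete sketch (the dataset name, inputs, and outputs below are placeholders), a curated "gold standard" label is simply the `outputs` you attach when adding an example via the SDK:

```python
from langsmith import Client

client = Client()

# Placeholder dataset and example content for illustration.
dataset = client.read_dataset(dataset_name="Functionality Test Dataset")
client.create_example(
    inputs={"question": "What is LangSmith?"},
    # The reviewed, corrected answer becomes the reference label evaluators compare against.
    outputs={"answer": "LangSmith is a platform for tracing and evaluating LLM applications."},
    dataset_id=dataset.id,
)
```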
- -### Use aggregate evals - -Testing individual data points is useful for asserting behavior on known examples, but a lot of information can only be measured in aggregate. Aggregating automated feedback over a dataset can help you detect differences in performance across component versions or between configurations. - -These datasets usually contain 100-1000+ examples in order to return statistically significant results. - -### Measure model stability - -LLMs can be non-deterministic. They can also be sensitive to small (even imperceptible) changes in the input. Generating a dataset of synthetic examples is a good way to measure this. Some common approaches to address this usually start from a representative dataset and then: - -- Generate examples using explicit transformations that don't change the meaning of the input, such as changing pronouns or roles, verb tense, misspellings, paraphrasing, etc. These are semantic invariance tests. -- Generate "similar examples" from the model (or differently tokenized LLMs). When evaluating, ensure that the correctness or other metrics don't change, or ask the model to assert whether the outputs are equivalent. -- (If the model's temperature > 0) run the model multiple times and grade whether outputs are consistent. - -### Measure performance on subsets - -Use tags or organize datasets based on cohorts or important properties to return stratified results for different groups. This can help you quantify your application's bias or other issues that might not be apparent when looking at intermingled results. - -### Evaluate production data - -Once you have deployed an application, you can use the same evaluators to measure performance or behavior on real user data. This can help you identify issues that might not be apparent during testing, and it can help quantify signals that are contained in the unstructured data. These can be used alongside other application metrics to help better understand ways to improve your application. - -You can also log proxy metrics (such as click-through/response rate) as feedback to drive better analysis. - -Check out some cookbook examples for this: - -- [Evaluate production runs (batch)](https://github.com/langchain-ai/langsmith-cookbook/blob/main/feedback-examples/algorithmic-feedback/algorithmic_feedback.ipynb): automate feedback metrics for advanced monitoring and performance tuning. -- [Evaluate production runs (real-time)](https://github.com/langchain-ai/langsmith-cookbook/blob/main/feedback-examples/realtime-algorithmic-feedback/realtime_feedback.ipynb): automatically generate feedback metrics for every run using an async callback. - -### Don't train on test datasets - -If you've ever heard of the "train, validation, test" splits in ML, you are well aware that if you use a dataset for optimizing a prompt, fine-tuning an LLM, or picking other configurable parameters in your setup, it's important to keep this separate from the datasets you use for testing. Otherwise, you risk "overfitting" to the test data, which will likely lead to poor performance on new data once you deploy it. - -### Test the model yourself - -Looking at the data remains an effective (albeit time-consuming) evaluation technique in many scenarios. You can evaluate runs yourself and log feedback from your application users. - -LangSmith exposes this in the client via a `create_feedback` method. We recommend adding as many signals as possible and using tags and aggregated feedback to help you understand what is happening in your application. 
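A minimal sketch of logging a manual review score against a run (the run ID and feedback key are placeholders):

```python
from langsmith import Client

client = Client()

# Placeholder run ID and feedback key for illustration.
client.create_feedback(
    run_id="00000000-0000-0000-0000-000000000000",
    key="human_review",
    score=1,
    comment="Reviewer judged the answer accurate and well formatted.",
)
```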
- -### Ask appropriate questions - -When thinking of what evaluator to use, it is helpful to think of "what question I need to answer" to be sure that my application has the desired behavior. Then it's vital to decide "what question can I reasonably answer given the input data". - -Asking a model if an output is "correct" will likely return an unreliable result, unless the model has additional information (such as a ground-truth label) that wasn't available to the evaluated model. - -It can be useful to select evaluators based on whether labels are present and what types of information would be useful for your concerns. Below are a few examples: - -| | Reference Free | With References | -| ----------- | ---------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- | -| Pass/Fail | Exact/fuzzy string match; "Does this output answer the user's question? YES or NO | "Is the generated output equivalent to the answer?" | -| Scoring | Perplexity / Normalized log probs; "On a scale from 1 to 5, 1 being a \_\_\_ and 5 being \_\_\_, how \_\_\_ is this output?" | ROUGE/BLEU; “On a scale from 1 to 5, how similar is …?” | -| Labeling | "Is this output mostly related to sports, finance, pop culture, or other?" | _less useful here_ | -| Comparisons | Which of these outputs best responds to the following input: 1. \_\_\_ 2. \_\_\_ | _less useful here_ | - -You'll notice that we've included some more traditional NLP measurements like "perplexity" or "ROUGE" alongside natural language questions prompted to an LLM. Both techniques have their place and very notable limitations. We recommend using a combination of these approaches to get a more complete picture of your application's performance. - -### Other Resources - -There's a lot of great work that has been done in this space. 
Some resources our community have found useful include: - -- ELeutherAI's [LM Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness/) -- HuggingFace [Evaluate](https://huggingface.co/docs/evaluate/index) -- OpenAI's [evals](https://github.com/openai/evals/) repository -- Chatbot Arena [lmsys](https://chat.lmsys.org/) diff --git a/versioned_docs/version-old/evaluation/static/add_evaluator.png b/versioned_docs/version-old/evaluation/static/add_evaluator.png deleted file mode 100644 index 911bee1ad..000000000 Binary files a/versioned_docs/version-old/evaluation/static/add_evaluator.png and /dev/null differ diff --git a/versioned_docs/version-old/evaluation/static/add_to_dataset.png b/versioned_docs/version-old/evaluation/static/add_to_dataset.png deleted file mode 100644 index 7692dbd94..000000000 Binary files a/versioned_docs/version-old/evaluation/static/add_to_dataset.png and /dev/null differ diff --git a/versioned_docs/version-old/evaluation/static/compare-tests.png b/versioned_docs/version-old/evaluation/static/compare-tests.png deleted file mode 100644 index 9ab260ef7..000000000 Binary files a/versioned_docs/version-old/evaluation/static/compare-tests.png and /dev/null differ diff --git a/versioned_docs/version-old/evaluation/static/create_dataset_csv.png b/versioned_docs/version-old/evaluation/static/create_dataset_csv.png deleted file mode 100644 index dfcc0d16f..000000000 Binary files a/versioned_docs/version-old/evaluation/static/create_dataset_csv.png and /dev/null differ diff --git a/versioned_docs/version-old/evaluation/static/create_evaluator.png b/versioned_docs/version-old/evaluation/static/create_evaluator.png deleted file mode 100644 index 609384c0e..000000000 Binary files a/versioned_docs/version-old/evaluation/static/create_evaluator.png and /dev/null differ diff --git a/versioned_docs/version-old/evaluation/static/eval-run-trace.png b/versioned_docs/version-old/evaluation/static/eval-run-trace.png deleted file mode 100644 index 778d92bb0..000000000 Binary files a/versioned_docs/version-old/evaluation/static/eval-run-trace.png and /dev/null differ diff --git a/versioned_docs/version-old/evaluation/static/eval-test-run.png b/versioned_docs/version-old/evaluation/static/eval-test-run.png deleted file mode 100644 index b98455247..000000000 Binary files a/versioned_docs/version-old/evaluation/static/eval-test-run.png and /dev/null differ diff --git a/versioned_docs/version-old/evaluation/static/evaluator_prompt.png b/versioned_docs/version-old/evaluation/static/evaluator_prompt.png deleted file mode 100644 index 4f939365a..000000000 Binary files a/versioned_docs/version-old/evaluation/static/evaluator_prompt.png and /dev/null differ diff --git a/versioned_docs/version-old/evaluation/static/export-dataset-button.png b/versioned_docs/version-old/evaluation/static/export-dataset-button.png deleted file mode 100644 index 04270d7c7..000000000 Binary files a/versioned_docs/version-old/evaluation/static/export-dataset-button.png and /dev/null differ diff --git a/versioned_docs/version-old/evaluation/static/export-dataset-modal.png b/versioned_docs/version-old/evaluation/static/export-dataset-modal.png deleted file mode 100644 index e470e36ff..000000000 Binary files a/versioned_docs/version-old/evaluation/static/export-dataset-modal.png and /dev/null differ diff --git a/versioned_docs/version-old/evaluation/static/filter_to_regressions.png b/versioned_docs/version-old/evaluation/static/filter_to_regressions.png deleted file mode 100644 index 86b6c11e1..000000000 
Binary files a/versioned_docs/version-old/evaluation/static/filter_to_regressions.png and /dev/null differ diff --git a/versioned_docs/version-old/evaluation/static/input_variables_playground.png b/versioned_docs/version-old/evaluation/static/input_variables_playground.png deleted file mode 100644 index 86a762d24..000000000 Binary files a/versioned_docs/version-old/evaluation/static/input_variables_playground.png and /dev/null differ diff --git a/versioned_docs/version-old/evaluation/static/langsmith_app_flow.png b/versioned_docs/version-old/evaluation/static/langsmith_app_flow.png deleted file mode 100644 index 377d32731..000000000 Binary files a/versioned_docs/version-old/evaluation/static/langsmith_app_flow.png and /dev/null differ diff --git a/versioned_docs/version-old/evaluation/static/langsmith_app_flow_dark.png b/versioned_docs/version-old/evaluation/static/langsmith_app_flow_dark.png deleted file mode 100644 index a88f7cecd..000000000 Binary files a/versioned_docs/version-old/evaluation/static/langsmith_app_flow_dark.png and /dev/null differ diff --git a/versioned_docs/version-old/evaluation/static/langsmith_evaluation_dark.png b/versioned_docs/version-old/evaluation/static/langsmith_evaluation_dark.png deleted file mode 100644 index 52dcd77fe..000000000 Binary files a/versioned_docs/version-old/evaluation/static/langsmith_evaluation_dark.png and /dev/null differ diff --git a/versioned_docs/version-old/evaluation/static/langsmith_evaluators.png b/versioned_docs/version-old/evaluation/static/langsmith_evaluators.png deleted file mode 100644 index 1c530412c..000000000 Binary files a/versioned_docs/version-old/evaluation/static/langsmith_evaluators.png and /dev/null differ diff --git a/versioned_docs/version-old/evaluation/static/langsmith_landscape_v2.png b/versioned_docs/version-old/evaluation/static/langsmith_landscape_v2.png deleted file mode 100644 index 132894887..000000000 Binary files a/versioned_docs/version-old/evaluation/static/langsmith_landscape_v2.png and /dev/null differ diff --git a/versioned_docs/version-old/evaluation/static/langsmith_landscape_v2_dark.png b/versioned_docs/version-old/evaluation/static/langsmith_landscape_v2_dark.png deleted file mode 100644 index 01858f5f3..000000000 Binary files a/versioned_docs/version-old/evaluation/static/langsmith_landscape_v2_dark.png and /dev/null differ diff --git a/versioned_docs/version-old/evaluation/static/langsmith_summary.png b/versioned_docs/version-old/evaluation/static/langsmith_summary.png deleted file mode 100644 index 7855d68b9..000000000 Binary files a/versioned_docs/version-old/evaluation/static/langsmith_summary.png and /dev/null differ diff --git a/versioned_docs/version-old/evaluation/static/langsmith_summary_dark.png b/versioned_docs/version-old/evaluation/static/langsmith_summary_dark.png deleted file mode 100644 index cf44965a5..000000000 Binary files a/versioned_docs/version-old/evaluation/static/langsmith_summary_dark.png and /dev/null differ diff --git a/versioned_docs/version-old/evaluation/static/modify_example.png b/versioned_docs/version-old/evaluation/static/modify_example.png deleted file mode 100644 index 0f034522f..000000000 Binary files a/versioned_docs/version-old/evaluation/static/modify_example.png and /dev/null differ diff --git a/versioned_docs/version-old/evaluation/static/playground_evaluator_results.png b/versioned_docs/version-old/evaluation/static/playground_evaluator_results.png deleted file mode 100644 index a7a68f325..000000000 Binary files 
a/versioned_docs/version-old/evaluation/static/playground_evaluator_results.png and /dev/null differ diff --git a/versioned_docs/version-old/evaluation/static/playground_experiment_results.png b/versioned_docs/version-old/evaluation/static/playground_experiment_results.png deleted file mode 100644 index 8ee7c986f..000000000 Binary files a/versioned_docs/version-old/evaluation/static/playground_experiment_results.png and /dev/null differ diff --git a/versioned_docs/version-old/evaluation/static/regression_view.png b/versioned_docs/version-old/evaluation/static/regression_view.png deleted file mode 100644 index d891593bf..000000000 Binary files a/versioned_docs/version-old/evaluation/static/regression_view.png and /dev/null differ diff --git a/versioned_docs/version-old/evaluation/static/sample_langsmith_dataset.png b/versioned_docs/version-old/evaluation/static/sample_langsmith_dataset.png deleted file mode 100644 index a741b571b..000000000 Binary files a/versioned_docs/version-old/evaluation/static/sample_langsmith_dataset.png and /dev/null differ diff --git a/versioned_docs/version-old/evaluation/static/sample_langsmith_example.png b/versioned_docs/version-old/evaluation/static/sample_langsmith_example.png deleted file mode 100644 index 9677db78b..000000000 Binary files a/versioned_docs/version-old/evaluation/static/sample_langsmith_example.png and /dev/null differ diff --git a/versioned_docs/version-old/evaluation/static/select_baseline.png b/versioned_docs/version-old/evaluation/static/select_baseline.png deleted file mode 100644 index 00b933762..000000000 Binary files a/versioned_docs/version-old/evaluation/static/select_baseline.png and /dev/null differ diff --git a/versioned_docs/version-old/evaluation/static/select_columns.png b/versioned_docs/version-old/evaluation/static/select_columns.png deleted file mode 100644 index 2548198c0..000000000 Binary files a/versioned_docs/version-old/evaluation/static/select_columns.png and /dev/null differ diff --git a/versioned_docs/version-old/evaluation/static/select_feedback.png b/versioned_docs/version-old/evaluation/static/select_feedback.png deleted file mode 100644 index fa2263ae8..000000000 Binary files a/versioned_docs/version-old/evaluation/static/select_feedback.png and /dev/null differ diff --git a/versioned_docs/version-old/evaluation/static/switch_to_dataset.png b/versioned_docs/version-old/evaluation/static/switch_to_dataset.png deleted file mode 100644 index 06a8ce062..000000000 Binary files a/versioned_docs/version-old/evaluation/static/switch_to_dataset.png and /dev/null differ diff --git a/versioned_docs/version-old/evaluation/static/tag_this_version.png b/versioned_docs/version-old/evaluation/static/tag_this_version.png deleted file mode 100644 index b257fcaa5..000000000 Binary files a/versioned_docs/version-old/evaluation/static/tag_this_version.png and /dev/null differ diff --git a/versioned_docs/version-old/evaluation/static/unit-test-suite.png b/versioned_docs/version-old/evaluation/static/unit-test-suite.png deleted file mode 100644 index aa6da19e6..000000000 Binary files a/versioned_docs/version-old/evaluation/static/unit-test-suite.png and /dev/null differ diff --git a/versioned_docs/version-old/evaluation/static/version_dataset.png b/versioned_docs/version-old/evaluation/static/version_dataset.png deleted file mode 100644 index 73ef3af6e..000000000 Binary files a/versioned_docs/version-old/evaluation/static/version_dataset.png and /dev/null differ diff --git 
a/versioned_docs/version-old/evaluation/static/version_dataset_tests.png b/versioned_docs/version-old/evaluation/static/version_dataset_tests.png deleted file mode 100644 index 8f406a616..000000000 Binary files a/versioned_docs/version-old/evaluation/static/version_dataset_tests.png and /dev/null differ diff --git a/versioned_docs/version-old/evaluation/static/view_results.png b/versioned_docs/version-old/evaluation/static/view_results.png deleted file mode 100644 index 9c032b279..000000000 Binary files a/versioned_docs/version-old/evaluation/static/view_results.png and /dev/null differ diff --git a/versioned_docs/version-old/hub/dev-setup.mdx b/versioned_docs/version-old/hub/dev-setup.mdx deleted file mode 100644 index 503c03801..000000000 --- a/versioned_docs/version-old/hub/dev-setup.mdx +++ /dev/null @@ -1,44 +0,0 @@ ---- -sidebar_label: Developer Setup -sidebar_position: 2 ---- - -import { - HubInstallationCodeTabs, - HubPullCodeTabs, - HubPushCodeTabs, -} from "@site/src/components/Hub"; - -# Developer Setup - -This guide will continue from the hub quickstart, using the Python or TypeScript SDK to interact with the hub instead of the Playground UI. - -This guide assumes you've gone through the Hub [Quick Start](./quickstart) including login-required steps. - -If you don't yet have an account, you'll only be able to pull public objects. - -## 1. Install/upgrade packages - -**Note:** You likely need to upgrade even if they're already installed! - - - -## 2. Configuring environment variables - -Get an API key for your **Personal** organization if you have not yet. The hub will not work with your non-personal organization's api key! - -```bash -export LANGCHAIN_HUB_API_KEY="ls_..." -``` - -If you already have `LANGCHAIN_API_KEY` set to a personal organization’s api key from LangSmith, you can skip this. - -## 3. Pull an object from the hub and use it - - - -## 4. Push a prompt to your personal organization - -For this step, you'll need the `handle` for your account! - - diff --git a/versioned_docs/version-old/hub/faq.mdx b/versioned_docs/version-old/hub/faq.mdx deleted file mode 100644 index 7803dfd05..000000000 --- a/versioned_docs/version-old/hub/faq.mdx +++ /dev/null @@ -1,45 +0,0 @@ ---- -sidebar_label: FAQs -sidebar_position: 3 ---- - -# Frequently Asked Questions - -### What is LangChain Hub? - -[LangChain Hub](https://smith.langchain.com/hub) lets you discover, version control, and experiment with different prompts for LangChain and LLMs in general directly in your browser. - -### How do I share a private prompt with my teammates? - -You can share prompts within a LangSmith organization by uploading them within a shared organization. - -First, create an API key for your organization, then set the variable in your development environment: - -```bash -export LANGCHAIN_HUB_API_KEY = "ls__.." -``` - -Then, you can upload prompts to the organization. Assuming your organization's handle is "my-organization": - -```python -from langchain import hub - -prompt = ... -hub.push("my-organization/my-prompt-name", prompt, new_repo_is_public=False) -``` - -Now, all your team-members within your LangSmith organization will be able to view, pull, and open the prompt in the playground. - -### Why can't I push anything other than prompts? - -Hub currently only supports LangChain prompt objects. We are working on adding support for more! - -If you have a specific request, please join the `hub-feedback` [discord](https://discord.gg/6adMQxSpJS) channel and let us know! 
- -### Can I upload a prompt to the hub from a LangSmith Trace? - -Coming soon! - -### Can LangChain Hub do \_\_\_\_? - -Maybe, and we'd love to hear from you! Please join the `hub-feedback` [discord](https://discord.gg/6adMQxSpJS) channel diff --git a/versioned_docs/version-old/hub/quickstart.mdx b/versioned_docs/version-old/hub/quickstart.mdx deleted file mode 100644 index 69837380d..000000000 --- a/versioned_docs/version-old/hub/quickstart.mdx +++ /dev/null @@ -1,126 +0,0 @@ ---- -sidebar_label: Quick Start -sidebar_position: 1 ---- - -# Quick Start - -## What is LangChain Hub? - -<> -
-