
    Ig                       d Z ddlmZ ddlZddlmZ ddlZddlZddl	Z	ddl
Z
ddlmZmZmZmZmZmZmZmZmZmZmZmZmZmZ ddlZddlmZ ddlmZmZ ddlmZ ddlm Z! dd	l"m#Z$ dd
l%m&Z& ddl'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8 ddl9m:Z:m;Z;m<Z<m=Z= erddl>Z?e?j                  Z@neZ@ ej                  eB      ZCeeDgeeD   f   ZE	 	 	 	 	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZF	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZG	 	 	 	 	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZH G d de,      ZI G d d      ZJ	 	 	 	 	 	 	 	 	 	 	 	 ddZK	 	 	 	 ddZL	 	 	 	 	 	 d dZM ed      ZN	 	 	 	 d!dZOy)"zV2 Evaluation Interface.    )annotationsN)TYPE_CHECKINGAnyAsyncIterableAsyncIterator	AwaitableCallableDictIterableListOptionalSequenceTypeVarUnioncast)run_helpers)	run_treesschemas)r   )utils)_aiter)
_warn_once)AEVALUATOR_TDATA_TEVALUATOR_TExperimentResultRow_ExperimentManagerMixin_extract_feedback_keys_ForwardResults_is_langchain_runnable_load_examples_map_load_experiment
_load_tqdm_load_traces_resolve_data_resolve_evaluators_resolve_experiment
_to_pandas_wrap_summary_evaluators)SUMMARY_EVALUATOR_TEvaluationResultEvaluationResultsRunEvaluatorAsyncExperimentResultsc                  K   |st        d       |r|rt        d| d|       t        | |||||||||	|
||       d{   S 7 w)a  Evaluate an async target system or function on a given dataset.

    Args:
        target (Union[AsyncCallable[[dict], dict], AsyncIterable[dict]]): The async target system or function to evaluate.
        data (Union[DATA_T, AsyncIterable[schemas.Example]]): The dataset to evaluate on. Can be a dataset name, a list of
            examples, an async generator of examples, or an async iterable of examples.
        evaluators (Optional[Sequence[EVALUATOR_T]]): A list of evaluators to run
            on each example. Defaults to None.
        summary_evaluators (Optional[Sequence[SUMMARY_EVALUATOR_T]]): A list of summary
            evaluators to run on the entire dataset. Defaults to None.
        metadata (Optional[dict]): Metadata to attach to the experiment.
            Defaults to None.
        experiment_prefix (Optional[str]): A prefix to provide for your experiment name.
            Defaults to None.
        description (Optional[str]): A description of the experiment.
        max_concurrency (Optional[int]): The maximum number of concurrent
            evaluations to run. Defaults to None.
        num_repetitions (int): The number of times to run the evaluation.
            Each item in the dataset will be run and evaluated this many times.
            Defaults to 1.
        client (Optional[langsmith.Client]): The LangSmith client to use.
            Defaults to None.
        blocking (bool): Whether to block until the evaluation is complete.
            Defaults to True.
        experiment (Optional[schemas.TracerSession]): An existing experiment to
            extend. If provided, experiment_prefix is ignored. For advanced
            usage only.

    Returns:
        AsyncIterator[ExperimentResultRow]: An async iterator over the experiment results.

    Environment:
        - LANGSMITH_TEST_CACHE: If set, API calls will be cached to disk to save time and
            cost during testing. Recommended to commit the cache files to your repository
            for faster CI/CD runs.
            Requires the 'langsmith[vcr]' package to be installed.

    Examples:
        >>> from typing import Sequence
        >>> from langsmith import Client, aevaluate
        >>> from langsmith.schemas import Example, Run
        >>> client = Client()
        >>> dataset = client.clone_public_dataset(
        ...     "https://smith.langchain.com/public/419dcab2-1d66-4b94-8901-0357ead390df/d"
        ... )
        >>> dataset_name = "Evaluate Examples"

        Basic usage:

        >>> def accuracy(run: Run, example: Example):
        ...     # Row-level evaluator for accuracy.
        ...     pred = run.outputs["output"]
        ...     expected = example.outputs["answer"]
        ...     return {"score": expected.lower() == pred.lower()}

        >>> def precision(runs: Sequence[Run], examples: Sequence[Example]):
        ...     # Experiment-level evaluator for precision.
        ...     # TP / (TP + FP)
        ...     predictions = [run.outputs["output"].lower() for run in runs]
        ...     expected = [example.outputs["answer"].lower() for example in examples]
        ...     # yes and no are the only possible answers
        ...     tp = sum([p == e for p, e in zip(predictions, expected) if p == "yes"])
        ...     fp = sum([p == "yes" and e == "no" for p, e in zip(predictions, expected)])
        ...     return {"score": tp / (tp + fp)}

        >>> import asyncio
        >>> async def apredict(inputs: dict) -> dict:
        ...     # This can be any async function or just an API call to your app.
        ...     await asyncio.sleep(0.1)
        ...     return {"output": "Yes"}
        >>> results = asyncio.run(
        ...     aevaluate(
        ...         apredict,
        ...         data=dataset_name,
        ...         evaluators=[accuracy],
        ...         summary_evaluators=[precision],
        ...         experiment_prefix="My Experiment",
        ...         description="Evaluate the accuracy of the model asynchronously.",
        ...         metadata={
        ...             "my-prompt-version": "abcd-1234",
        ...         },
        ...     )
        ... )  # doctest: +ELLIPSIS
        View the evaluation results for experiment:...

        Evaluating over only a subset of the examples using an async generator:

        >>> async def example_generator():
        ...     examples = client.list_examples(dataset_name=dataset_name, limit=5)
        ...     for example in examples:
        ...         yield example
        >>> results = asyncio.run(
        ...     aevaluate(
        ...         apredict,
        ...         data=example_generator(),
        ...         evaluators=[accuracy],
        ...         summary_evaluators=[precision],
        ...         experiment_prefix="My Subset Experiment",
        ...         description="Evaluate a subset of examples asynchronously.",
        ...     )
        ... )  # doctest: +ELLIPSIS
        View the evaluation results for experiment:...

        Streaming each prediction to more easily + eagerly debug.

        >>> results = asyncio.run(
        ...     aevaluate(
        ...         apredict,
        ...         data=dataset_name,
        ...         evaluators=[accuracy],
        ...         summary_evaluators=[precision],
        ...         experiment_prefix="My Streaming Experiment",
        ...         description="Streaming predictions for debugging.",
        ...         blocking=False,
        ...     )
        ... )  # doctest: +ELLIPSIS
        View the evaluation results for experiment:...

        >>> async def aenumerate(iterable):
        ...     async for elem in iterable:
        ...         print(elem)
        >>> asyncio.run(aenumerate(results))

        Running without concurrency:

        >>> results = asyncio.run(
        ...     aevaluate(
        ...         apredict,
        ...         data=dataset_name,
        ...         evaluators=[accuracy],
        ...         summary_evaluators=[precision],
        ...         experiment_prefix="My Experiment Without Concurrency",
        ...         description="This was run without concurrency.",
        ...         max_concurrency=0,
        ...     )
        ... )  # doctest: +ELLIPSIS
        View the evaluation results for experiment:...

        Using Async evaluators:

        >>> async def helpfulness(run: Run, example: Example):
        ...     # Row-level evaluator for helpfulness.
        ...     await asyncio.sleep(5)  # Replace with your LLM API call
        ...     return {"score": run.outputs["output"] == "Yes"}

        >>> results = asyncio.run(
        ...     aevaluate(
        ...         apredict,
        ...         data=dataset_name,
        ...         evaluators=[helpfulness],
        ...         summary_evaluators=[precision],
        ...         experiment_prefix="My Helpful Experiment",
        ...         description="Applying async evaluators example.",
        ...     )
        ... )  # doctest: +ELLIPSIS
        View the evaluation results for experiment:...
    z&'upload_results' parameter is in beta.zeExpected at most one of 'experiment' or 'experiment_prefix', but both were provided. Got: experiment=z, experiment_prefix=)data
evaluatorssummary_evaluatorsmetadataexperiment_prefixdescriptionmax_concurrencynum_repetitionsclientblocking
experimentupload_resultsN)r   
ValueError
_aevaluate)targetr/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   s                Z/var/www/html/answerous/venv/lib/python3.12/site-packages/langsmith/evaluation/_arunner.py	aevaluater?   I   s     Z ;<')l*>?P>QS
 	

 -+''%   s   >A AAc                 K   |xs t        j                         }t        | t        j                        r| n"t        j                  t        | |       d{   }t        j                  t        | ||       d{   }	t        j                  t        ||       d{   }
|	D cg c]  }|
|j                      }}t        |	||||||||	       d{   S 7 7 a7 @c c}w 7 w)aL  Evaluate existing experiment runs asynchronously.

    Args:
        experiment (Union[str, uuid.UUID]): The identifier of the experiment to evaluate.
        evaluators (Optional[Sequence[EVALUATOR_T]]): Optional sequence of evaluators to use for individual run evaluation.
        summary_evaluators (Optional[Sequence[SUMMARY_EVALUATOR_T]]): Optional sequence of evaluators
            to apply over the entire dataset.
        metadata (Optional[dict]): Optional metadata to include in the evaluation results.
        max_concurrency (Optional[int]): Optional maximum number of concurrent evaluations.
        client (Optional[langsmith.Client]): Optional Langsmith client to use for evaluation.
        load_nested: Whether to load all child runs for the experiment.
            Default is to only load the top-level root runs.
        blocking (bool): Whether to block until evaluation is complete.

    Returns:
        AsyncIterator[ExperimentResultRow]: An async iterator over the experiment results.

    Examples:
        Define your evaluators

        >>> from typing import Sequence
        >>> from langsmith.schemas import Example, Run
        >>> def accuracy(run: Run, example: Example):
        ...     # Row-level evaluator for accuracy.
        ...     pred = run.outputs["output"]
        ...     expected = example.outputs["answer"]
        ...     return {"score": expected.lower() == pred.lower()}
        >>> def precision(runs: Sequence[Run], examples: Sequence[Example]):
        ...     # Experiment-level evaluator for precision.
        ...     # TP / (TP + FP)
        ...     predictions = [run.outputs["output"].lower() for run in runs]
        ...     expected = [example.outputs["answer"].lower() for example in examples]
        ...     # yes and no are the only possible answers
        ...     tp = sum([p == e for p, e in zip(predictions, expected) if p == "yes"])
        ...     fp = sum([p == "yes" and e == "no" for p, e in zip(predictions, expected)])
        ...     return {"score": tp / (tp + fp)}

        Load the experiment and run the evaluation.

        >>> from langsmith import aevaluate, aevaluate_existing
        >>> dataset_name = "Evaluate Examples"
        >>> async def apredict(inputs: dict) -> dict:
        ...     # This can be any async function or just an API call to your app.
        ...     await asyncio.sleep(0.1)
        ...     return {"output": "Yes"}
        >>> # First run inference on the dataset
        ... results = asyncio.run(
        ...     aevaluate(
        ...         apredict,
        ...         data=dataset_name,
        ...     )
        ... )  # doctest: +ELLIPSIS
        View the evaluation results for experiment:...

        Then evaluate the results
        >>> experiment_name = "My Experiment:64e6e91"  # Or manually specify
        >>> results = asyncio.run(
        ...     aevaluate_existing(
        ...         experiment_name,
        ...         evaluators=[accuracy],
        ...         summary_evaluators=[precision],
        ...     )
        ... )  # doctest: +ELLIPSIS
        View the evaluation results for experiment:...


    N)load_nested)r/   r0   r1   r2   r5   r7   r8   r9   )r   get_cached_client
isinstancer   TracerSession
aitertoolsaio_to_threadr!   r#   r    reference_example_idr<   )r9   r0   r1   r2   r5   r7   rA   r8   projectrunsdata_maprunr/   s                r>   aevaluate_existingrL     s     \ 4y224F j'"7"78 	,,-=z6RR 
 ))j&k D  --.@&'RRH:>?3HS--.?D?-'
 
 
 S S?
sT   AC'C$C':C;"C'CC'&C <C'C%C'C'C' C'c          
       K   t        j                  |       xs> t        | d      xr# t        j                  | j	                               xs t        |       }|	xs t        j                         }	|rd n t        t        t        j                     |       }t        j                  t        |||	       d {   \  }}t        ||	||xs |||||      j!                          d {   }t#        j$                  d       }|4|j'                          d {   }t)        j*                  |      | dz  }nd }t#        j,                  ||	j.                  g      5  |r)|j1                  t        t2        |       |       d {   }|r|j5                  ||       d {   }|r|j7                  |       d {   }t9        |      }|
r|j;                          d {    |cd d d        S 7 87 7 7 t7 Y7 @7 # 1 sw Y   y xY ww)N	__aiter__)r7   r2   r9   r4   r6   rI   r:   z.yaml)ignore_hostsr5   )asyncioiscoroutinefunctionhasattriscoroutinerN   r   rtrB   r   r   r   RunrE   rF   r&   _AsyncExperimentManagerastartls_utilsget_cache_dirget_dataset_idpathlibPathwith_optional_cacheapi_urlawith_predictions	ATARGET_Tawith_evaluatorsawith_summary_evaluatorsr-   wait)r=   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   is_async_targetrI   experiment_manager	cache_dirdsid
cache_pathresultss                        r>   r<   r<   u  s    " 	##F+ 	*FK(TW-@-@AQAQAS-T	*!&) 
 -r++-F"4Xgkk-BF(KD(66	 K ,3"3'%	 fh	G &&t,I++--\\),$u~=

		%	%j?O	P #55Y' 6  G #44O 5  G #<<=OPPG(1,,.   -	 . Q ! s   B2H4G,5/H$G/%.HG2AH'G<?G4 G<G6G<7G88$G<G:G<"H/H2H4G<6G<8G<:G<<HHc                  @    e Zd ZdZ	 	 	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZddZddZddZddZddZ		 d	 	 	 	 	 dd	Z
dd
	 	 	 	 	 ddZ	 	 	 	 d dZd!dZd"dZ	 d	 	 	 	 	 d#dZ	 d	 	 	 	 	 d$dZ	 	 	 	 	 	 	 	 d%dZ	 	 	 	 d&dZd'dZd(dZd)dZ xZS )*rW   aa  Manage the execution of experiments asynchronously.

    Supports lazily running predictions and evaluations in parallel to facilitate
    result streaming and early debugging.

    Args:
        data (DATA_T): The data used for the experiment. Can be a dataset name or ID OR
            a generator of examples.
        runs (Optional[Iterable[schemas.Run]]): The runs associated with the experiment
            predictions.
        experiment (Optional[schemas.TracerSession]): The tracer session
            associated with the experiment.
        experiment_prefix (Optional[str]): The prefix for the experiment name.
        description (Optional[str]): The description for the experiment.
        metadata (Optional[dict]): Additional metadata for the experiment.
        client (Optional[langsmith.Client]): The Langsmith client used for
             the experiment.
        evaluation_results (Optional[Iterable[EvaluationResults]]): The evaluation
            sresults for the experiment.
        summary_results (Optional[Iterable[EvaluationResults]]): The aggregate results
            for the experiment.
    Nc                   t         |   ||||       || _        d | _        |t	        j
                  |      nd | _        || _        || _        |	| _	        |
| _
        y )N)r9   r2   r7   r4   )super__init___data	_examplesrE   ensure_async_iterator_runs_evaluation_results_summary_results_num_repetitions_upload_results)selfr/   r9   r2   rI   r7   evaluation_resultssummary_resultsr4   r6   r:   	__class__s              r>   ro   z _AsyncExperimentManager.__init__  sq     	!#	 	 	
 
CG6:6FJ,,T2D 	
 $6  / /-    c                  K   | j                   mt        | j                  | j                        | _         | j                  dkD  r8t        t        j                  | j                   | j                              | _         t        j                  t        j                  | j                         dt        j                               \  | _         }|S w)Nr7         lock)rq   _aresolve_datarp   r7   rv   async_chain_from_iterablerE   ateerr   rQ   Lock)rx   examples_iters     r>   aget_examplesz%_AsyncExperimentManager.aget_examples  s     >>!+DJJt{{KDN$$q(!:OODNND4I4IJ" )3,,T^^<aglln)
% s   CCc                @  K   | j                   t        | j                   dd       sUt        j                  | j	                          d {          d {   }|t        d      t        |j                        S t        | j                   j                        S 7 Q7 Gw)Nreference_dataset_idz!No examples found in the dataset.)	_experimentgetattrrE   py_anextr   r;   str
dataset_idr   )rx   examples     r>   r[   z&_AsyncExperimentManager.get_dataset_id  s     #74d,
 '//d6H6H6J0JKKG !DEEw))**4##8899	 1KKs%   ABB	BBABBc                  K   | j                   t        d      t        j                  t        j                  | j                         dt        j                               \  | _         }|2 3 d {   }| 7 
6 y w)NzRuns not loaded yet.r   r   )rs   r;   rE   r   rr   rQ   r   )rx   rI   rK   s      r>   	aget_runsz!_AsyncExperimentManager.aget_runs   sk     ::344%??,,TZZ8!',,.

D  	 	#I	s*   A)A>+A</A:0A<3A>:A<<A>c               H  K   | j                   )| j                          d {   2 3 d {   }dg i t        j                  t        j                  | j                         dt        j                               \  | _         }|2 3 d {   }| 7 u7 n6 y 7 6 y w)Nrk   r   r   )rt   r   rE   r   rr   rQ   r   )rx   _ry   results       r>   aget_evaluation_resultsz/_AsyncExperimentManager.aget_evaluation_results	  s     ##+!%!3!3!55 & &a "o%;E??001I1IJ\\^<8D$&8
 !3  f 6 &5 2sT    B"BB"BBBAB"	B BB B"BB"B  B"c           	       K   	 t        j                  | j                          d {          d {   }|st	        d      | j
                  r| j                  |      nd }| j                  ||       | j                  | j                  d<   | j                  | j                          d {   || j                  | j                  | j                  | j                  | j
                        S 7 7 # t        $ r t	        d      w xY w7 ]w)Nz\No examples found in the dataset. Please ensure the data provided to aevaluate is not empty.z[No examples found in the dataset.Please ensure the data provided to aevaluate is not empty.r6   )r9   r2   r7   rI   ry   r:   )rE   r   r   StopAsyncIterationr;   rw   _get_project_print_experiment_startrv   	_metadatar{   r7   rs   rt   )rx   first_examplerH   s      r>   rX   z_AsyncExperimentManager.astart  s    	","5"5D<N<N<P6P"QQM M  7;6J6J$##M2PT$$Wm<,0,A,A()~~$$&&^^;;#77//  
 	
 7QQ! 	M 	 'sI   D"C2 C.C2 C0C2 A5D,D
-AD.C2 0C2 2DDc                 K   | j                  ||      }t        j                  |dt        j                               \  }}t        d |2        | j                  | j                  | j                  d |2        | j                        S w)NrP   r   r   c               8   K   | 3 d {   }|d    7 6 y wNr    .0preds     r>   	<genexpr>z<_AsyncExperimentManager.awith_predictions.<locals>.<genexpr>9  s     22tT)_22   
c               8   K   | 3 d {   }|d    7 6 y wNrK   r   r   s     r>   r   z<_AsyncExperimentManager.awith_predictions.<locals>.<genexpr>=  s     33$u+33r   )r9   r2   r7   rI   r:   )
	_apredictrE   r   rQ   r   rW   r   r   r7   rw   )rx   r=   r5   _experiment_resultsr1r2s         r>   r`   z)_AsyncExperimentManager.awith_predictions0  su      #nnV_nU!4agllnMB&2r2''^^;;33//
 	
s   BB
rP   c          
     V  K   t        |      }| j                  ||      }t        j                  |dt	        j
                               \  }}}t        d |2        | j                  | j                  | j                  d |2        d |2        | j                  | j                        S w)NrP      r   c               8   K   | 3 d {   }|d    7 6 y wr   r   r   r   s     r>   r   z;_AsyncExperimentManager.awith_evaluators.<locals>.<genexpr>K  s     66VI66r   c               8   K   | 3 d {   }|d    7 6 y wr   r   r   s     r>   r   z;_AsyncExperimentManager.awith_evaluators.<locals>.<genexpr>O  s     77&&-77r   c               8   K   | 3 d {   }|d    7 6 y w)Nry   r   r   s     r>   r   z;_AsyncExperimentManager.awith_evaluators.<locals>.<genexpr>P  s     TTv'; <TTr   r9   r2   r7   rI   ry   rz   r:   )r%   _ascorerE   r   rQ   r   rW   r   r   r7   ru   rw   )rx   r0   r5   experiment_resultsr   r   r3s          r>   rb   z(_AsyncExperimentManager.awith_evaluatorsA  s      )4
!\\*o\V__%7P
B&626''^^;;7B7TQST 11//	
 		
s   B'B)c           
       K   t        |      }| j                  |      }t        | j                          d {   | j                  | j
                  | j                  | j                         | j                  || j                        S 7 Pw)Nr   )
r(   _aapply_summary_evaluatorsrW   r   r   r   r7   r   rt   rw   )rx   r1   wrapped_evaluatorsaggregate_feedback_gens       r>   rc   z0_AsyncExperimentManager.awith_summary_evaluatorsU  s{      66HI!%!@!@AS!T&$$&&''^^;;!#772//	
 		
&s   5B
BAB
c                  K   t        j                  | j                         | j                          d {   | j	                               2 3 d {   \  }}}t        |||       7 67 6 y w)NrK   r   ry   )rE   	async_zipr   r   r   r   )rx   rK   r   ry   s       r>   aget_resultsz$_AsyncExperimentManager.aget_resultsf  st     6@6J6JNND$6$6$88$:V:V:X7
 	 	2#w 2 &#5  9	 7
s3   2A1A+
A1A/A-A/A1-A//A1c                   K   | j                   dg iS d| j                   2 cg c3 d {   }|d   D ]  }| 7 6 nc c}}w c}}iS w)Nrk   )ru   )rx   rk   ress      r>   aget_summary_scoresz+_AsyncExperimentManager.aget_summary_scoresp  so       (r?"%)%:%: !"9-    
 	
s&   !A
?=
;=
?=
?A
c                  K   t        |       fd}t        j                  | |       d      2 3 d {   }| 7 
6  j                          d {  7   y w)Nc                   K   j                          d {   2 3 d {   } t        | j                  j                  j                         :7 >7 76 y wN)r   	_aforwardexperiment_namer   r7   )r   fnrx   s    r>   predict_allz6_AsyncExperimentManager._apredict.<locals>.predict_all  sW     '+'9'9';!;  g!5!5t~~t{{  "< !;s/   AAAAAA3AAAMbP?_eager_consumption_timeout)_ensure_async_traceablerE   aiter_with_concurrency_aend)rx   r=   r5   r   r   r   s   `    @r>   r   z!_AsyncExperimentManager._apredict}  s`      %V,	 '==[]u
 	 	& L	 

 jjls1   /AAAAAAAAAc                   K   t        j                  d      5  fd}t        j                  | |       d      2 3 d {   }| 7 
6 	 d d d        y # 1 sw Y   y xY ww)N   )max_workersc                v   K   j                         2 3 d {   } j                  |        !7 6 y w)N)executor)r   _arun_evaluators)current_resultsr0   r   rx   s    r>   	score_allz2_AsyncExperimentManager._ascore.<locals>.score_all  sG     -1->->-@  ///"Oh 0  -@s   9757979r   r   )cfThreadPoolExecutorrE   r   )rx   r0   r5   r   r   r   s   ``   @r>   r   z_AsyncExperimentManager._ascore  si     
 ""q1 	X !+ A A!  f  !	 	 	s@   A)%AAAA	AAA	A)A&"A)c                  K   t        j                         }i |d   xs i d| j                  i}t        j                  di i |d|| j                  sdnd| j
                  d5  |d   }|d   }|d	   }|D ]u  }		 |	j                  ||
       d {   }
|d   j                  | j
                  j                  |
             | j                  r| j
                  j                  |
||       w t'        |||      cd d d        S 7 t# t        $ rW}	 t        |	      }t        |D cg c]&  }t        ||j                  t        |      ddi      ( nc c}w c}      }|d   j                  | j
                  j                  |             | j                  r| j
                  j                  |||       n.# t        $ r"}t         j#                  d|        Y d }~nd }~ww xY wt         j%                  dt        |	       d|j                   dt        |       d       t         j%                  dt        |	       d|j                   dt        |       d       Y d }~d }~ww xY w# 1 sw Y   y xY ww)Nr2   r9   r0   localTproject_namer2   enabledr7   rK   r   ry   rK   r   rk   )rK   	_executorerror)keysource_run_idcommentextra)rk   zError parsing feedback keys: zError running evaluator z on run : exc_infor   r   )rhget_tracing_contextr   tracing_contextrw   r7   aevaluate_runextend_select_eval_results_log_evaluation_feedback	Exceptionr   r+   r*   idreprloggerdebugr   r   )rx   r0   r   r   current_contextr2   rK   r   eval_results	evaluatorevaluator_responseefeedback_keysr   error_responsee2s                   r>   r   z(_AsyncExperimentManager._arun_evaluators  s     002
z*0b
T112
  
! ,$*.*>*>7D++
 @	 "%(C%i0G*+?@L' /	./8/F/F ' 0G 0 *& !+22889KL ++<<.C8 = /` '#/y@	 @	* ! "(>y(I): ,9% %( !1(+25&&,0G+2D/	!"% %
* %Y/66 KK<<^L  // KK@@ .C8 A  % 'DRD%IJ LL24	?2C D  #xr$q'4!% ! 
 LL24	?2C D  #xr$q'4!% !  ="3@	 @	s   A$I9&I-<D	DAD	.I-=
I9D		
I*F9(+EA%F98I%9	G$	G	I%G$	$A;I%I-%I**I--I62I9c               ^  K   g g }}t        j                  | j                          d {         }t        j                  | j	                         |      2 3 d {   \  }}|j                  |       |j                  |       /7 [7 ,6 g }| j                  r| j                         j                  nd }t        j                         }	i |	d   xs i | j                  |d}
t        j                  di i |	d|
| j                  sdnd| j                  d5  |D ]  }	  |||      }| j                  j                  ||j                        }|j!                  |       | j                  re|D ]`  }|j#                  dh	      }|j%                  d
d       }t        j&                  | j                  j(                  fi |d ||d d {  7   b # t*        $ r0}t,        j/                  dt1        |       d| d       Y d }~d }~ww xY w 	 d d d        n# 1 sw Y   nxY wd|i y w)Nr2   )r9   experiment_idr0   r   Tr   )fn_nametarget_run_id)excludeevaluator_info)run_id
project_idsource_infoz Error running summary evaluator r   r   rk   r   )rE   rr   r   r   r   appendrw   _get_experimentr   r   r   r   r   r7   r   __name__r   dictpoprF   create_feedbackr   r   r   r   )rx   r1   rI   examplesasync_examplesrK   r   aggregate_feedbackr  r   r2   r   summary_eval_resultflattened_resultsr   feedbackr   r   s                     r>   r   z2_AsyncExperimentManager._aapply_summary_evaluators  sa     Rh#99@R@R@T:TU","6"6NNn#
 	% 	%,#w KKOOG$ ;U	% #

  262F2FT))+..D
002
z*0b
 #22!+
  
! ,$*.*>*>7D++
  	 0 	*3D(*C'(,(H(H+ ) 2 2 )I )% '--.?@++&7 	F'-{{O;L{'MH-5\\:JD-QN",":": $ ; ;#"*# (,+5,:#  	 ! LL:4	?:K2aSQ!% !  ' 	  	  	B ,--s   'H-B,H-B	BB	(H-B		BH-HB*G	G

GH	H&HHHH	H-H"H-c                   K   g }| j                          d {   2 3 d {   }|j                  s|j                  |j                         27 67 /6 |rt        |      nd }|r|j	                         S d S wr   )r   modified_atr  max	isoformat)rx   r  r   max_modified_ats       r>   _get_dataset_versionz,_AsyncExperimentManager._get_dataset_version'  sz     #'#5#5#77 	8 	8'"" ""7#6#67	 8 	87 /:#k*t.=((*G4Gs4   A7AA7AAAA7A7A$A7c                  K   t               }| j                          d {   2 3 d {   }|j                  ro|j                  j                  d      rTt	        |j                  d   t
              r7|j                  d   D ]$  }t	        |t              s|j                  |       & |j                  d       7 7 6 t        |      S w)Ndataset_splitbase)setr   r2   getrC   listr   add)rx   splitsr   splits       r>   _get_dataset_splitsz+_AsyncExperimentManager._get_dataset_splits2  s     #'#5#5#77 
	# 
	#'  $$((9w//@$G$--o> *E!%-

5)* 

6" 8 
	#7 F|s6   CB;CB?B=B?A'C'C=B??Cc                  K   | j                   sy | j                  }|t        d      | j                         }| j	                          d {   |d<   | j                          d {   |d<   | j                  j                  |j                  |j                  xs7 t        j                  j                  t        j                  j                        i |j                  |       y 7 7 w)NzExperiment not started yet.dataset_versiondataset_splits)end_timer2   )rw   r   r;   _get_experiment_metadatar  r  r7   update_projectr   r#  datetimenowtimezoneutcr2   )rx   r9   project_metadatas      r>   r   z_AsyncExperimentManager._aendB  s     ##%%
:;;88:484M4M4O.O*+373K3K3M-M)*""MM(( <  $$X%6%6%:%:;%%"	 	# 	
 /P-Ms%   A
C-C)C-'C+(BC-+C-)	NNNNNNNr   T)r9   z+Optional[Union[schemas.TracerSession, str]]r2   Optional[dict]rI   zBOptional[Union[Iterable[schemas.Run], AsyncIterable[schemas.Run]]]r7   Optional[langsmith.Client]ry   *Optional[AsyncIterable[EvaluationResults]]rz   r-  r4   Optional[str]r6   intr:   boolr/   -Union[DATA_T, AsyncIterable[schemas.Example]])returnAsyncIterator[schemas.Example]r2  r   )r2  zAsyncIterator[schemas.Run])r2   AsyncIterator[EvaluationResults])r2  rW   r   )r5   Optional[int]r=   ra   r2  rW   )r0   z*Sequence[Union[EVALUATOR_T, AEVALUATOR_T]]r5   r6  r2  rW   )r1   Sequence[SUMMARY_EVALUATOR_T]r2  rW   r2  "AsyncIterator[ExperimentResultRow])r2  zDict[str, List[dict]])r5   r6  r=   ra   r2  zAsyncIterator[_ForwardResults])r0   Sequence[RunEvaluator]r5   r6  r2  r9  )r0   r:  r   r   r   zcf.ThreadPoolExecutorr2  r   )r1   r7  r2  r5  )r2  r.  )r2  zOptional[list[str]]r2  None)r  
__module____qualname____doc__ro   r   r[   r   r   rX   r`   rb   rc   r   r   r   r   r   r   r  r  r   __classcell__)r{   s   @r>   rW   rW     s   6 CG#'SW-1IMFJ%) #. @	.
 !. Q. +. G. D. #. . .;.<:
< *.	
 '	


 
!
* *.	
>
 '	

 
!
(
9
 
!
"	
 FJ5B	', *.* ' 
,	&K*K -K (	K
 
KZ5."?5.	)5.n	H 
r|   rW   c                  x    e Zd Z	 	 ddZedd       ZddZddZddZ	 d	 	 	 	 	 ddZ	ddZ
dd	Zdd
ZddZy)r-   c                    || _         g | _        t        j                         | _        t        j
                  | j                  | j                               | _        d| _        y )Nr   )	_manager_resultsrQ   r   _lockcreate_task_process_data_task_processed_count)rx   experiment_managers     r>   ro   zAsyncExperimentResults.__init__X  sH     +35\\^
((););DMM)JK
 !r|   c                .    | j                   j                  S r   )rC  r   rx   s    r>   r   z&AsyncExperimentResults.experiment_nameb  s    }},,,r|   c                    | S r   r   rL  s    r>   rN   z AsyncExperimentResults.__aiter__f  s    r|   c           	     <   K   d fd}	  j                   4 d {     j                  t         j                        k  rA j                   j                     } xj                  dz  c_        |cd d d       d {    S  j                  j                         rt        d d d       d {    t        j                  t        j                   |t         j                              d              d {    7 7 7 T# 1 d {  7  sw Y   dxY w7  w)Nc                   K   j                   | k  r.t        j                  d       d {    j                   | k  r-y y 7 w)Ng?)rI  rQ   sleep)indexrx   s    r>   _wait_until_indexz;AsyncExperimentResults.__anext__.<locals>._wait_until_indexj  s:     ''%/mmD))) ''%/)s   (AAA Ar   )timeout)rQ  r/  r2  r<  )
rE  rI  lenrD  rH  doner   rQ   shieldwait_for)rx   rR  r   s   `  r>   	__anext__z AsyncExperimentResults.__anext__i  s     	* zz - -((3t}}+==!]]4+@+@AF))Q.)!	- - -
 ZZ__&,,- - ..  !23t}}3E!FPTU   - - - - - -sp   DC?DAD1D=D>D D#D.D/A
D9D:DDDDDDDc                  K   t               } ||j                               2 3 d {   }| j                  4 d {    | j                  j	                  |       d d d       d {    J7 E7 27 	# 1 d {  7  sw Y   bxY w6 |j                          d {  7  }| j                  4 d {  7   || _        d d d       d {  7   y # 1 d {  7  sw Y   y xY wwr   )r"   r   rE  rD  r  r   ru   )rx   rg   tqdmitemsummary_scoress        r>   rG  z$AsyncExperimentResults._process_data{  s     |w3356 	+ 	+$zz + +$$T*+ + +	++ + + + + 7  '::<<<:: 	3 	3$2D!	3 	3 	3 	3 	3s   !C&BA-BC&A/C& A3C&'A1(C&-B/C&1C&3B	9A<:B	C&BC&2B53C&7C?C&
CC&C#CC#C&Nc                2    t        | j                  ||      S )N)startend)r'   rD  )rx   r^  r_  s      r>   	to_pandasz AsyncExperimentResults.to_pandas  s     $--u#>>r|   c                    dd l }| j                  r=|j                  j                  d      r"| j	                  dd      }|j                         S | j                         S )Nr   pandas   )importlib.utilrD  util	find_specr`  _repr_html___repr__)rx   	importlibdfs      r>   rg  z"AsyncExperimentResults._repr_html_  sF    ==Y^^55h?1%B>>##==?"r|   c                ,    t        | j                        S r   )rT  rD  rL  s    r>   __len__zAsyncExperimentResults.__len__  s    4==!!r|   c                "    d| j                    dS )Nz<AsyncExperimentResults >)r   rL  s    r>   rh  zAsyncExperimentResults.__repr__  s    )$*>*>)?qAAr|   c                8   K   | j                    d {    y 7 wr   )rH  rL  s    r>   rd   zAsyncExperimentResults.wait  s     jjs   )rJ  rW   r4  r8  )r2  r   )rg   rW   r2  r<  )r   N)r^  r6  r_  r6  r2  	DataFrame)r2  r/  r;  )r  r=  r>  ro   propertyr   rN   rX  rG  r`  rg  rl  rh  rd   r   r|   r>   r-   r-   W  sj    "3" - -$3 >B?"?-:?	?
#"Br|   c                *  K   d dfd}t        j                  d      5  	  | |j                  t        j                  |j                  ||i |d|j
                  r|j
                  j                         n|j                  j                         i|             d {    t        t        t        j                        |
      cd d d        S 7 2# t        $ r%}t        j                  d| dd	       Y d }~Yd }~ww xY w# 1 sw Y   y xY ww)Nc                
    | y r   r   )rrK   s    r>   _get_runz_aforward.<locals>._get_run  s    r|   T)r   example_version)rG   on_endr   r2   r7   )langsmith_extrazError running target function: r   )r   
stacklevelr   )rt  zrun_trees.RunTreer2  r<  )r   r   inputsLangSmithExtrar   r  r  
created_atr   r   r   r   r   r   rV   )r   r   r   r2   r7   ru  r   rK   s          @r>   r   r     s     &*C 
		D	) 
	 " 1 1)0#!0")&22 $//99;!(!3!3!=!=!? "!  * W[[#&
/
 
"  	LL1!5QR   	'
 
sX   DDA=C!C"C&$D

DC	DC?:D?DDDDc                   t        j                  |       s,t        |       s!t        |       rt	        d      t	        d      t        j                  |       r| S t        |       r| j                  }  t        j                  d      |       S )NzTarget must be an async function. For sync functions, use evaluate. Example usage:

async def predict(inputs: dict) -> dict:
    # do work, like chain.invoke(inputs)
    return {...}
await aevaluate(predict, ...)zTarget must be a callable async function. Received a non-callable object. Example usage:

async def predict(inputs: dict) -> dict:
    # do work, like chain.invoke(inputs)
    return {...}
await aevaluate(predict, ...)AsyncTarget)name)	rQ   rR   r   callabler;   r   is_traceable_functionainvoke	traceable)r=   s    r>   r   r     s     &&v.7Mf7UF0  0  
'!&)^^F/r||/77r|   c                   t        | t              rt        j                  |       S t        j                  t	        | |            S )z*Return the examples for the given dataset.r~   )rC   r   rE   rr   r$   )r/   r7   s     r>   r   r     s7     $&//55++M$v,NOOr|   Tc               B   K   | D ]  }|2 3 d{   }|  y7 6 w)zChain multiple async iterables.Nr   )iterablesub_iterabler[  s      r>   r   r     s3      ! & 	 	$J	,s   	)NNNNNNr   NTNT)r/   zHUnion[DATA_T, AsyncIterable[schemas.Example], Iterable[schemas.Example]]r0   4Optional[Sequence[Union[EVALUATOR_T, AEVALUATOR_T]]]r1   'Optional[Sequence[SUMMARY_EVALUATOR_T]]r2   r+  r3   r.  r4   r.  r5   r6  r6   r/  r7   r,  r8   r0  r9   6Optional[Union[schemas.TracerSession, str, uuid.UUID]]r:   r0  r=   z%Union[ATARGET_T, AsyncIterable[dict]]r2  r-   )NNNNNFT)r0   r  r1   r  r2   r+  r5   r6  r7   r,  rA   r0  r8   r0  r9   z,Union[str, uuid.UUID, schemas.TracerSession]r2  r9  )r/   r1  r0   r  r1   r  r2   r+  r3   r.  r4   r.  r5   r6  r6   r/  r7   r,  r8   r0  r9   r  r:   r0  r=   z<Union[ATARGET_T, AsyncIterable[dict], Iterable[schemas.Run]]r2  r-   )r   ,rh.SupportsLangsmithExtra[[dict], Awaitable]r   zschemas.Exampler   r   r2   r  r7   langsmith.Clientr2  r   )r=   ra   r2  r  )r/   r1  r7   r  r2  r3  )r  zIterable[AsyncIterable[T]]r2  zAsyncIterator[T])Pr?  
__future__r   rQ   concurrent.futuresfuturesr   r&  loggingr\   uuidtypingr   r   r   r   r   r	   r
   r   r   r   r   r   r   r   	langsmithr   r   r   r   rU   r   rY   langsmith._internalr   rE   #langsmith._internal._beta_decoratorr   langsmith.evaluation._runnerr   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   langsmith.evaluation.evaluatorr)   r*   r+   r,   rb  pdrp  	getLoggerr  r   r  ra   r?   rL   r<   rW   r-   r   r   r   r  r   r   r|   r>   <module>r     s\    "         "  ' ( % ' 4 :    &  II			8	$dVYt_,-	 HLBF#'+!%%))-IMC SC E	C
 @C C %C C #C C 'C C GC C1C CR HLBF#%))-c Ec @	c
 c #c 'c c c<c (cT HLBF#'+!%%))-IM; 8; E	;
 @; ; %; ; #; ; '; ; G; ;H; ;|a
5 a
HB BJ'
4'
'
 '
 	'

 '
 '
T8818<P
7PDTP#P CL(r|   