
import os
from collections import defaultdict
from copy import deepcopy
from multiprocessing import get_all_start_methods
from typing import Optional, Union, Iterable, Any, Type, get_args

from pydantic import BaseModel

from qdrant_client.http import models
from qdrant_client.embed.common import INFERENCE_OBJECT_TYPES
from qdrant_client.embed.embed_inspector import InspectorEmbed
from qdrant_client.embed.embedder import Embedder
from qdrant_client.embed.models import NumericVector, NumericVectorStruct
from qdrant_client.embed.schema_parser import ModelSchemaParser
from qdrant_client.embed.utils import FieldPath
from qdrant_client.fastembed_common import FastEmbedMisc
from qdrant_client.parallel_processor import ParallelWorkerPool, Worker
from qdrant_client.uploader.uploader import iter_batch


class ModelEmbedderWorker(Worker):
    """Worker that runs a ModelEmbedder instance in a separate process."""

    def __init__(self, batch_size: int, **kwargs: Any):
        self.model_embedder = ModelEmbedder(**kwargs)
        self.batch_size = batch_size

    @classmethod
    def start(cls, **kwargs: Any) -> "ModelEmbedderWorker":
        # Each worker process keeps its own single-threaded embedder.
        return cls(threads=1, **kwargs)

    def process(self, items: Iterable[tuple[int, Any]]) -> Iterable[tuple[int, Any]]:
        for idx, batch in items:
            yield idx, list(
                self.model_embedder.embed_models_batch(
                    batch, inference_batch_size=self.batch_size
                )
            )


class ModelEmbedder:
    MAX_INTERNAL_BATCH_SIZE = 64

    def __init__(self, parser: Optional[ModelSchemaParser] = None, **kwargs: Any):
        # Raw inference objects grouped by model name, waiting to be embedded.
        self._batch_accumulator: dict[str, list[Any]] = {}
        # Computed embeddings grouped by model name, consumed in FIFO order.
        self._embed_storage: dict[str, list[NumericVector]] = {}

        self._embed_inspector = InspectorEmbed(parser=parser)
        self.embedder = Embedder(**kwargs)

    def embed_models(
        self,
        raw_models: Union[BaseModel, Iterable[BaseModel]],
        is_query: bool = False,
        batch_size: int = 32,
    ) -> Iterable[BaseModel]:
        """Embed raw data fields in models and return models with vectors

        If any of model fields required inference, a deepcopy of a model with computed embeddings is returned,
        otherwise returns original models.
        Args:
            raw_models: Iterable[BaseModel] - models which can contain fields with raw data
            is_query: bool - flag to determine which embed method to use. Defaults to False.
            batch_size: int - batch size for inference
        Returns:
            list[BaseModel]: models with embedded fields
        """
        if isinstance(raw_models, BaseModel):
            raw_models = [raw_models]
        for raw_models_batch in iter_batch(raw_models, batch_size):
            yield from self.embed_models_batch(
                raw_models_batch, is_query, inference_batch_size=batch_size
            )

    def embed_models_strict(
        self,
        raw_models: Iterable[Union[dict[str, BaseModel], BaseModel]],
        batch_size: int = 32,
        parallel: Optional[int] = None,
    ) -> Iterable[Union[dict[str, BaseModel], BaseModel]]:
        """Embed raw data fields in models and return models with vectors

        Requires every input sequence element to contain raw data fields to inference.
        Does not accept ready vectors.

        Args:
            raw_models: Iterable[BaseModel] - models which contain fields with raw data to inference
            batch_size: int - batch size for inference
            parallel: int - number of parallel processes to use. Defaults to None.

        Returns:
            Iterable[Union[dict[str, BaseModel], BaseModel]]: models with embedded fields
        """
        is_small = False
        if isinstance(raw_models, list) and len(raw_models) < batch_size:
            is_small = True

        if parallel is None or parallel == 1 or is_small:
            # Sequential inference in the current process.
            for batch in iter_batch(raw_models, batch_size):
                yield from self.embed_models_batch(batch, inference_batch_size=batch_size)
        else:
            # Fan batches out to worker processes, preserving the input order.
            multiprocessing_batch_size = batch_size
            raw_models_batches = iter_batch(raw_models, multiprocessing_batch_size)
            if parallel == 0:
                parallel = os.cpu_count()

            start_method = (
                "forkserver" if "forkserver" in get_all_start_methods() else "spawn"
            )
            assert parallel is not None
            pool = ParallelWorkerPool(
                num_workers=parallel,
                worker=self._get_worker_class(),
                start_method=start_method,
                max_internal_batch_size=self.MAX_INTERNAL_BATCH_SIZE,
            )

            for batch in pool.ordered_map(raw_models_batches, batch_size=batch_size):
                yield from batch

    def embed_models_batch(
        self,
        raw_models: list[Union[dict[str, BaseModel], BaseModel]],
        is_query: bool = False,
        inference_batch_size: int = 16,
    ) -> Iterable[BaseModel]:
        """Embed a batch of models with raw data fields and return models with vectors

        If any of model fields required inference, a deepcopy of a model with computed embeddings is returned,
        otherwise returns original models.
        Args:
            raw_models: list[Union[dict[str, BaseModel], BaseModel]] - models which can contain fields with raw data
            is_query: bool - flag to determine which embed method to use. Defaults to False.
            inference_batch_size: int - batch size for inference
        Returns:
            Iterable[BaseModel]: models with embedded fields
        """
        # First pass: collect raw data fields into the accumulator.
        for raw_model in raw_models:
            self._process_model(raw_model, is_query=is_query, accumulating=True)

        if not self._batch_accumulator:
            # Nothing requires inference - return the original models untouched.
            yield from raw_models
        else:
            # Second pass: replace raw data fields with computed embeddings.
            yield from (
                self._process_model(
                    raw_model,
                    is_query=is_query,
                    accumulating=False,
                    inference_batch_size=inference_batch_size,
                )
                for raw_model in raw_models
            )

    def _process_model(
        self,
        model: Union[dict[str, BaseModel], BaseModel],
        paths: Optional[list[FieldPath]] = None,
        is_query: bool = False,
        accumulating: bool = False,
        inference_batch_size: Optional[int] = None,
    ) -> Union[dict[str, BaseModel], dict[str, NumericVector], BaseModel, NumericVector]:
        """Embed model's fields requiring inference

        Args:
            model: Qdrant http model containing fields to embed
            paths: Path to fields to embed. E.g. [FieldPath(current="recommend", tail=[FieldPath(current="negative", tail=None)])]
            is_query: Flag to determine which embed method to use. Defaults to False.
            accumulating: Flag to determine if we are accumulating models for batch embedding. Defaults to False.
            inference_batch_size: Optional[int] - batch size for inference

        Returns:
            A deepcopy of the model with embedded fields
        """
        if isinstance(model, get_args(INFERENCE_OBJECT_TYPES)):
            if accumulating:
                self._accumulate(model)
            else:
                assert (
                    inference_batch_size is not None
                ), "inference_batch_size should be passed for inference"
                return self._drain_accumulator(
                    model, is_query=is_query, inference_batch_size=inference_batch_size
                )

        # Only the inference pass works on a copy, the accumulation pass reads the original.
        model = deepcopy(model) if not accumulating else model

        if isinstance(model, dict):
            for key, value in model.items():
                if accumulating:
                    self._process_model(value, is_query=is_query, accumulating=True)
                else:
                    model[key] = self._process_model(
                        value,
                        is_query=is_query,
                        accumulating=False,
                        inference_batch_size=inference_batch_size,
                    )
            return model

        paths = paths if paths is not None else self._embed_inspector.inspect(model)

        for path in paths:
            list_model = [model] if not isinstance(model, list) else model
            for item in list_model:
                current_model = getattr(item, path.current, None)
                if current_model is None:
                    continue

                if path.tail:
                    # Recurse into nested models along the remaining path.
                    self._process_model(
                        current_model, path.tail, is_query, accumulating, inference_batch_size
                    )
                else:
                    was_list = isinstance(current_model, list)
                    data = current_model if was_list else [current_model]
                    if not accumulating:
                        assert (
                            inference_batch_size is not None
                        ), "inference_batch_size should be passed for inference"
                        embeddings = [
                            self._drain_accumulator(
                                value,
                                is_query=is_query,
                                inference_batch_size=inference_batch_size,
                            )
                            for value in data
                        ]
                        if was_list:
                            setattr(item, path.current, embeddings)
                        else:
                            setattr(item, path.current, embeddings[0])
                    else:
                        for value in data:
                            self._accumulate(value)

        return model

    def _accumulate(self, data: models.VectorStruct) -> None:
        """Add data to batch accumulator

        Args:
            data: models.VectorStruct - any vector struct data, if inference object types instances in `data` - add them
                to the accumulator, otherwise - do nothing. `InferenceObject` instances are converted to proper types.

        Returns:
            None
        """
        if isinstance(data, dict):
            for value in data.values():
                self._accumulate(value)
            return None

        if isinstance(data, list):
            for value in data:
                if not isinstance(value, get_args(INFERENCE_OBJECT_TYPES)):
                    return None
                self._accumulate(value)

        if not isinstance(data, get_args(INFERENCE_OBJECT_TYPES)):
            return None

        data = self._resolve_inference_object(data)
        if data.model not in self._batch_accumulator:
            self._batch_accumulator[data.model] = []
        self._batch_accumulator[data.model].append(data)

    def _drain_accumulator(
        self, data: models.VectorStruct, is_query: bool, inference_batch_size: int
    ) -> NumericVectorStruct:
        """Drain accumulator and replace inference objects with computed embeddings
            It is assumed objects are traversed in the same order as they were added to the accumulator

        Args:
            data: models.VectorStruct - any vector struct data, if inference object types instances in `data` - replace
                them with computed embeddings. If embeddings haven't yet been computed - compute them and then replace
                inference objects.
            inference_batch_size: int - batch size for inference

        Returns:
            NumericVectorStruct: data with replaced inference objects
        """
        if isinstance(data, dict):
            for key, value in data.items():
                data[key] = self._drain_accumulator(
                    value, is_query=is_query, inference_batch_size=inference_batch_size
                )
            return data

        if isinstance(data, list):
            for i, value in enumerate(data):
                if not isinstance(value, get_args(INFERENCE_OBJECT_TYPES)):
                    return data
                data[i] = self._drain_accumulator(
                    value, is_query=is_query, inference_batch_size=inference_batch_size
                )
            return data

        if not isinstance(data, get_args(INFERENCE_OBJECT_TYPES)):
            return data

        if not self._embed_storage or not self._embed_storage.get(data.model, None):
            self._embed_accumulator(
                is_query=is_query, inference_batch_size=inference_batch_size
            )

        return self._next_embed(data.model)

    def _embed_accumulator(
        self, is_query: bool = False, inference_batch_size: int = 16
    ) -> None:
        """Embed all accumulated objects for all models

        Args:
            is_query: bool - flag to determine which embed method to use. Defaults to False.
            inference_batch_size: int - batch size for inference
        Returns:
            None
        """

        def embed(objects: list[Any], model_name: str, **kwargs: Any) -> list[NumericVector]:
            """
            Assemble batches by options and data type based groups, embed and return embeddings in the original order
            """
            unique_options: list[Any] = []
            unique_options_is_text: list[bool] = []
            batches: list[list[Any]] = []
            group_indices: dict[int, list[int]] = defaultdict(list)

            # Group objects sharing the same options and data type (text vs image),
            # remembering the original position of every object.
            for i, obj in enumerate(objects):
                is_text = isinstance(obj, models.Document)
                for j, (options, options_is_text) in enumerate(
                    zip(unique_options, unique_options_is_text)
                ):
                    if options == obj.options and is_text == options_is_text:
                        group_indices[j].append(i)
                        batches[j].append(obj.text if is_text else obj.image)
                        break
                else:
                    group_indices[len(unique_options)] = [i]
                    unique_options.append(obj.options)
                    unique_options_is_text.append(is_text)
                    batches.append([obj.text if is_text else obj.image])

            embeddings: list[NumericVector] = []
            for j, (options, is_text) in enumerate(
                zip(unique_options, unique_options_is_text)
            ):
                embeddings.extend(
                    embedding
                    for embedding in self.embedder.embed(
                        model_name=model_name,
                        texts=batches[j] if is_text else None,
                        images=batches[j] if not is_text else None,
                        options=options or {},
                        **kwargs,
                    )
                )

            # Restore the original order of the objects.
            iter_embeddings = iter(embeddings)
            ordered_embeddings: list[NumericVector] = [[]] * len(objects)
            for indices in group_indices.values():
                for index in indices:
                    ordered_embeddings[index] = next(iter_embeddings)
            return ordered_embeddings

        for model in self._batch_accumulator:
            if not any(
                (
                    FastEmbedMisc.is_supported_text_model(model),
                    FastEmbedMisc.is_supported_sparse_model(model),
                    FastEmbedMisc.is_supported_late_interaction_text_model(model),
                    FastEmbedMisc.is_supported_image_model(model),
                    FastEmbedMisc.is_supported_late_interaction_multimodal_model(model),
                )
            ):
                raise ValueError(f"{model} is not among supported models")

        for model, data in self._batch_accumulator.items():
            self._embed_storage[model] = embed(
                data, model, is_query=is_query, batch_size=inference_batch_size
            )
        self._batch_accumulator.clear()

    def _next_embed(self, model_name: str) -> NumericVector:
        """Get next computed embedding from embedded batch

        Args:
            model_name: str - retrieve embedding from the storage by this model name

        Returns:
            NumericVector: computed embedding
        """
        return self._embed_storage[model_name].pop(0)

    @staticmethod
    def _resolve_inference_object(data: models.VectorStruct) -> models.VectorStruct:
        """Resolve inference object into a model

        Args:
            data: models.VectorStruct - data to resolve, if it's an inference object, convert it to a proper type,
                otherwise - keep unchanged

        Returns:
            models.VectorStruct: resolved data
        """
        if not isinstance(data, models.InferenceObject):
            return data

        model_name = data.model
        value = data.object
        options = data.options

        if any(
            (
                FastEmbedMisc.is_supported_text_model(model_name),
                FastEmbedMisc.is_supported_sparse_model(model_name),
                FastEmbedMisc.is_supported_late_interaction_text_model(model_name),
            )
        ):
            return models.Document(model=model_name, text=value, options=options)
        if FastEmbedMisc.is_supported_image_model(model_name):
            return models.Image(model=model_name, image=value, options=options)
        if FastEmbedMisc.is_supported_late_interaction_multimodal_model(model_name):
            raise ValueError(f"{model_name} does not support `InferenceObject` interface")
        raise ValueError(f"{model_name} is not among supported models")

    @classmethod
    def _get_worker_class(cls) -> Type[ModelEmbedderWorker]:
        return ModelEmbedderWorker
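

# Usage sketch (illustrative only): a minimal way to drive ModelEmbedder directly.
# It assumes the optional fastembed dependency is installed and that
# "sentence-transformers/all-MiniLM-L6-v2" is available in your fastembed version;
# the point ids, payloads and model name are placeholder assumptions, not values
# prescribed by this module.
if __name__ == "__main__":
    embedder = ModelEmbedder(parser=ModelSchemaParser())
    points = [
        models.PointStruct(
            id=i,
            vector=models.Document(
                text=f"example sentence {i}",
                model="sentence-transformers/all-MiniLM-L6-v2",
            ),
            payload={"index": i},
        )
        for i in range(4)
    ]
    # embed_models yields deep copies whose Document fields are replaced by
    # numeric vectors; embed_models_strict(..., parallel=2) would instead fan
    # the batches out to ModelEmbedderWorker processes.
    for point in embedder.embed_models(points, is_query=False, batch_size=2):
        print(point.id, len(point.vector))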