
    Bvh,                       d dl mZ d dlZd dlZd dlZd dlZd dlZd dlm	Z
 d dlZd dlZd dlmZ d dlmZmZ d dlmZmZmZmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d dl Z! ejD                  d
      Z#ddZ$ddZ%d Z&	 	 d	 	 	 	 	 	 	 	 	 	 	 ddZ'ddZ(g fddZ)e*dk(  r7dZ+ejX                  jW                  e+        ejZ                  e+        e)        yy)    )annotationsN)setup_logger)get_rankget_size)add_io_bindings_as_ortvaluesconvert_inputs_for_ort%get_merged_sample_with_past_kv_inputsget_sample_inputsget_sample_with_past_kv_inputsverify_ort_inputs)setup_torch_model)make_dynamic_cache)
AutoConfig)DynamicCache c                J    | j                   rdnd\  }}|j                  }|||fS )N)      )r   r   )use_past_kvmax_position_embeddings)argsconfigpast_sequence_lengthcurr_sequence_lengthmax_sequence_lengths        \/RAG/venv/lib/python3.12/site-packages/onnxruntime/transformers/models/llama/llama_parity.pyget_sequence_lengthsr   %   s4    ;?;K;KQW.. 88!57JJJ    c                \   t               }d}t        | |      \  }}}| j                  r5t        || j                  ||||| j
                  | j                  d|
      }|S | j                  r(t        || j                  ||| j
                  d|      }|S t        || j                  ||d      }|S )N   T)seq_lenpast_seq_lenmax_seq_lenuse_fp16use_buffer_sharereturn_dict
world_size)r$   r&   r'   )r&   )
r   r   mergedr	   devicer$   r%   r   r   r
   )r   r   r'   
batch_sizer   sequence_lengthr   inputss           r   
get_inputsr-   +   s    JJAUVZ\bAc>/+>{{6KK#-+]]!22!
2 M 
		/KK]]!
 M #64;;
OaefMr   c                   t        | t        t        t        f      r| S t        | t              rt	        d | D              S t        | t
              r| D cg c]  }t        |       c}S t        | t              r| D ch c]  }t        |       c}S t        | t              r-| j                         D ci c]  \  }}|t        |       c}}S t        | t        j                        r| j                         S t        | d      r| j                         S t        | t              r=t!        t        t        t#        | j$                  | j&                  d                        S t)        dt+        |              c c}w c c}w c c}}w )Nc              3  2   K   | ]  }t        |        y w)N)torch_deepcopy).0vs     r   	<genexpr>z!torch_deepcopy.<locals>.<genexpr>R   s     61^A&s   cloneF)strictz(torch_deepcopy not implemented for type )
isinstanceintfloatstrtuplelistr0   setdictitemsnpndarraycopyhasattrr4   r   r   zip	key_cachevalue_cacheNotImplementedErrortype)valuer2   ks      r   r0   r0   N   s5   %#uc*+%6666%+015aq!511%+015aq!511%16?A>!$$??%$zz|ug{{}%&!.c%//5K\K\ej6k1l"mnn  HeV
WW 21?s   E9=E>5Fc                   |}|Gt        | ||| j                  rt        j                  nt        j                  | j
                        \  }}t        | |      }d|v rJt        j                  t        j                        t        j                  d      k\  rt        |d         |d<   | j                  dk7  rt        j                  j                          t        j                         } |di t!        |      j"                  j%                         j'                         j)                         }	| j                  dk7  rt        j                  j                          t        j                         }
t*        j-                  d|
|z
   d       | j.                  r!|~t        j                  j1                          t3        | |      \  }}}t5        || j6                  ||      }| j                  j9                          d}|d	k(  r|d
| j:                  if}t=        j>                  | j@                  t=        jB                         |g      }tE        ||      }| j                  dk7  rtG        ||| j                  tI        | j:                        | j6                  |      \  }}|jK                          t        j                         }|jM                  |       |jO                          t        j                         }
|jQ                         d   }~n?t        j                         }|jS                  d |      }t        j                         }
|d   }t*        j-                  d|
|z
   d       d| j@                  v sd| j@                  v rdnd}tU        jV                  |	|||      }t*        jY                  d|        |s.t*        jY                  dtU        jZ                  |	|z
                |S )Ntorch_dtyper)   past_key_valuesz4.45cpuzPyTorch took z s)r%   r"   r#   ExecutionProviderCUDAExecutionProvider	device_id)sess_options	providers)
ort_inputsr)   rQ   r%   kv_cache_ortvaluesr   zONNX Runtime took int4int8g      4@g      ?)rtolatolz,Are PyTorch and ONNX Runtime results close? z
Max diff:  ).r   r$   torchfloat16float32r)   r-   pvVersiontransformers__version__r   execution_providercudasynchronizetimer0   logitsdetachrN   numpyloggerinfo	small_gpuempty_cacher   r   r%   upperrankortInferenceSessiononnx_model_pathSessionOptionsr   r   r7   synchronize_inputsrun_with_iobindingsynchronize_outputscopy_outputs_to_cpurunr?   allclosewarningmax)r   locationuse_auth_tokenrU   pytorch_modelr   py_modelr,   
start_time
pt_outputsend_timer   _r   ep	ort_model
io_bindingort_outputstolparitys                       r   verify_parityr   d   sU    H,*.--U]];;
 f%FF"rzz,2J2J'KrzzZ`Oa'a$6v>O7P$Q ! %'

 J 3N623::AACGGIOOQJ%'

 yy{H
KK-: 56b9:~~(.

  4Hf3U0!0#..)'	F ##))+,,=	>B	$$;		*+$$'')$I
 y&1F %')E**$))n!221*
&
& 	%%'YY[
$$Z0&&(99; 446q9 YY[
mmD&199;!!n
KK$X
%:$;2>? 4///6T=Q=Q3Q#W[C[[[sEF
NNA&JKBFF:+C$D#EFGr   c                   t        j                         }|j                  dddd       |j                  dddt        j                  j                  d      d	
       |j                  dddt        j                  j                  d      d
       |j                  ddddg dd       |j                  dddd       |j                  d       |j                  dddd       |j                  d       |j                  dd dd!       |j                  d"       |j                  d#dd$       |j                  d%       |j                  d&d'dg d(d)*       |j                  d+dt        d,d-.       |j                  d/dd0       | g k(  r|j                         n|j                  |       }|j                  d1v s|j                  d2k(  r|j                  dk(  r	d3|_	        |S d4|_	        |S )5Nz-mz--model_nameFzModel name in Hugging Face)requiredhelpz-tz--torch_model_directory.zMPath to folder containing PyTorch model and associated files if saved on disk)r   defaultr   z-oz--onnx_model_pathTzSPath to ONNX model (with external data files saved in the same folder as the model)z-epz--execution_providerrN   )rN   rc   rocmz(Execution provider to verify parity with)r   r   choicesr   z-vz	--verbose
store_truezPrint verbose logs)actionr   )verbosez-pz--use_past_kvzfUse past key and past value as inputs to the model. Necessary for decoder_with_past_model.onnx models.)r   z-gz--use_buffer_sharezWUse if model has GroupQueryAttention and you want to enable past-present buffer sharing)r%   z--mergedz2Use merged model (i.e. decoder_merged_model.onnx).)r(   z-fpz--precision)rV   rW   fp16fp32zPrecision of model)r   r   r   z--cache_dirz./model_cachezQmodel cache dir to override default HF cache dir to avoid overflood the /home dir)r   rG   r   r   z--small_gpuzhLoad the llama in GPU every time for parity_check if it's running in a machine which GPU memory < 36GB. >   r   rW   rV   r   r   )argparseArgumentParseradd_argumentospathjoinset_defaultsr9   
parse_args	precisionrb   )argvparserr   s      r   get_argsr      sT   $$&F
)	   !S!\   S!b   '7   !	   &
u	   E*
f	   /
A  
 u%
0!   `   w   #'"*6&2C2CD2ID
 >>--$..F2JtOfOfjoOo 	 	N
 K  	N
 Kr   c                4   t        |       }t        |j                         t        j	                  d|        t               }t        |d|j                  dk(         ||_        t        |d|j                  dk(  rdnd|        t        |dt        j                  |j                               |j                  t        j                  j!                  d      k(  }|r|j"                  n|j                  }i }|j$                  st'        ||||       y d x}}|j(                  sGt+        ||||j,                  rt        j.                  nt        j0                  |j                  	      \  }}d
|_        t'        ||||||      }d|_        t'        ||||||       y )NzArguments: r$   r   device_namerN   zcuda:r)   r   rK   F)r}   r   T)r   r   r   ri   rj   r   setattrr   rn   rb   r[   r)   r   torch_model_directoryr   r   r   
model_namer(   r   rk   r   r$   r\   r]   r   )r   r   rn   r|   r{   rU   r   llamas           r   mainr   +  s`   D>D
KK+dV$%:D D*dnn67DID-$*A*AU*JRWX\W]P^_D(ELL)9)9:;//277<<3DDN"0td6P6PH;;dHn6HI~~-.2mmU]]{{MFE !*(N,>e\b

  dHn6HX]flmr   __main__r    )r   argparse.Namespacer   r   )NN)r   r   r{   r9   r|   boolrU   r=   r}   zNone | torch.nn.Moduler   zNone | AutoConfig)r   z	list[str]).
__future__r   r   loggingr   re   rh   r?   packaging.versionversionr^   r[   r`   benchmark_helperr   dist_settingsr   r   llama_inputsr   r   r	   r
   r   r   llama_torchr   (models.torch_export_patches.cache_helperr   r   transformers.cache_utilsr   onnxruntimero   	getLoggerri   r   r-   r0   r   r   r   __name__seedrandommanual_seedrZ   r   r   <module>r      s    #   	      ) ,  * G # 1 			2	K FX6 -1 $`
`` ` 	`
 *` `FaH  $nN zDIINN4EdF	 r   