
    Bvhl                        d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dl	Z	d dl
Z
d dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZmZmZ  e j>                  e       Z!g dZ"ejF                  dejH                  dejJ                  diZ& G d de      Z' G d de      Z( G d de      Z) G d de      Z* G d de      Z+e*ddfe+ddfe)ddfdZ, G d d      Z- G d  d!      Z.y)"    N)Path)	Precision)float_to_float16_max_diff)FusionOptions)IOBindingHelper)	OnnxModel)optimize_model)torch_onnx_export)
GPT2ConfigGPT2LMHeadModel	GPT2ModelTFGPT2Model)
distilgpt2gpt2zgpt2-mediumz
gpt2-largezgpt2-xlMb@?g?g      @c                   ,     e Zd ZdZ fdZ fdZ xZS )GPT2ModelNoPastState2Here we wrap a class to disable past state output.c                 $    t         |   |       y Nsuper__init__selfconfig	__class__s     Z/RAG/venv/lib/python3.12/site-packages/onnxruntime/transformers/models/gpt2/gpt2_helper.pyr   zGPT2ModelNoPastState.__init__*            c                 (    t         |   |dd      S )NF)	use_cachereturn_dict)r   forwardr   	input_idsr   s     r   r$   zGPT2ModelNoPastState.forward-   s    wyEuMMr    __name__
__module____qualname____doc__r   r$   __classcell__r   s   @r   r   r   '   s    <!N Nr    r   c                   ,     e Zd ZdZ fdZ fdZ xZS )TFGPT2ModelNoPastStater   c                 2    d|_         t        | 	  |       y )NF)r"   r   r   r   s     r   r   zTFGPT2ModelNoPastState.__init__4   s      r    c                 &    t         |   |d      S )NF)r"   )r   callr%   s     r   r$   zTFGPT2ModelNoPastState.forward8   s    w|I|77r    r'   r-   s   @r   r/   r/   1   s    <!8 8r    r/   c                   <     e Zd ZdZ fdZed        Z fdZ xZS )MyGPT2ModelzMHere we wrap a class for Onnx model conversion for GPT2Model with past state.c                 $    t         |   |       y r   r   r   s     r   r   zMyGPT2Model.__init__?   r   r    c           	         t        | d   d   t        t        f      rt        | d         |k(  rt        | d   d         dk(  sJ g }t	        |      D ]Z  }|j                  t        j                  | d   |   d   j                  d      | d   |   d   j                  d      fd             \ | d   t        |      fS | S )N   r      )dim)	
isinstancetuplelistlenrangeappendtorchcat	unsqueeze)result	num_layerpresentis       r   post_processzMyGPT2Model.post_processB   s    fQilUDM2vay>Y.3vay|3D3IIIG9% II1a2215vay|A7P7PQR7ST & 1IuW~..r    c                     t         |   ||||d      }t        j                  || j                  j
                        S NF)position_idsattention_maskpast_key_valuesr#   r   r$   r4   rG   r   n_layerr   r&   rJ   rK   pastrC   r   s         r   r$   zMyGPT2Model.forwardU   sD    %)  ! 
 ''0C0CDDr    )	r(   r)   r*   r+   r   staticmethodrG   r$   r,   r-   s   @r   r4   r4   <   s+    W!  $E Er    r4   c                   ,     e Zd ZdZ fdZ fdZ xZS )MyGPT2LMHeadModelzSHere we wrap a class for Onnx model conversion for GPT2LMHeadModel with past state.c                 $    t         |   |       y r   r   r   s     r   r   zMyGPT2LMHeadModel.__init__c   r   r    c                     t         |   ||||d      }t        j                  || j                  j
                        S rI   rM   rO   s         r   r$   zMyGPT2LMHeadModel.forwardf   sD    %)  ! 
 ''0C0CDDr    r'   r-   s   @r   rS   rS   `   s    ]!	E 	Er    rS   c                   ,     e Zd ZdZ fdZ fdZ xZS )MyGPT2LMHeadModel_NoPaddinga  Here we wrap a class for Onnx model conversion for GPT2LMHeadModel with past state and no padding.
    When you always use batch_size=1 in inference, there is no padding in inputs. In such case, position_ids
    and attention_mask need no be in inputs.
    c                 $    t         |   |       y r   r   r   s     r   r   z$MyGPT2LMHeadModel_NoPadding.__init__x   r   r    c                 |    t         |   ||d      }t        j                  || j                  j
                        S )NF)rL   r#   rM   )r   r&   rP   rC   r   s       r   r$   z#MyGPT2LMHeadModel_NoPadding.forward{   s4    DeT''0C0CDDr    r'   r-   s   @r   rW   rW   r   s    
!E Er    rW   logitsTF
last_state)r   GPT2LMHeadModel_NoPaddingr   c                   0    e Zd Zd ZdefdZdefdZd Zy)
Gpt2Inputsc                 <    || _         || _        || _        || _        y r   )r&   rJ   rK   rP   )r   r&   rJ   rK   rP   s        r   r   zGpt2Inputs.__init__   s     +4.:WeFJ	r    returnc                     | j                   | j                  | j                  fD cg c]  }||	 }}| j                  r|j	                  | j                         |S c c}w r   )r&   rJ   rK   rP   extend)r   v
input_lists      r   to_listzGpt2Inputs.to_list   sZ    "&..$2C2CTEXEX!Yk!YA]^]ja!Y
k99dii(	 ls
   AAc                 ~    t        d | j                  | j                  | j                  | j                  fD              S )Nc              3   &   K   | ]	  }||  y wr    ).0rc   s     r   	<genexpr>z&Gpt2Inputs.to_tuple.<locals>.<genexpr>   s     u c1ghgtQ cs   )r;   r&   rJ   rK   rP   )r   s    r   to_tuplezGpt2Inputs.to_tuple   s3    u1B1BDDWDWY]YbYb cuuur    c                    d }| j                   ]| j                   j                  t        j                  k(  r*| j                   j	                  t        j
                        n| j                   }| j                  D cg c]"  }|j	                  t        j
                        $ }}t        | j                  | j                  ||      S c c}w )N)dtype)
rK   rm   r@   float16tofloat32rP   r^   r&   rJ   )r   rK   prP   s       r   to_fp32zGpt2Inputs.to_fp32   s    * ''--> ##&&U]]&;((  4899=9a5==)9=$..$*;*;^TRR >s   :'CN)	r(   r)   r*   r   r<   re   r;   rk   rr   rh   r    r   r^   r^      s(    K v% vSr    r^   c            "       p   e Zd ZdZedddej                  ej                  ej                  dfdededededed	ed
edej                  de	de	de	dej                  dej                  dej                  de	def d       Ze	 dAdedededededeeee   f   fd       Zed        ZedBd       ZedBd       ZedCd       ZedDd       Zeddddej                  ej                  ej                  fdede	d e	de	de	dej                  dej                  dej                  fd!       Ze	 	 	 dEd#       Zeg d$fd%ed&ee   fd'       ZedFd(ed)efd*       ZedFd(ed)efd+       Zed,        ZedGd-       Ze	 	 	 dHd(ed.eeej>                  f   d/eeee   f   d)ed0e	d1e	fd2       Z ed3        Z!ed4        Z"edd5d5d6d7ddddej                  ej                  ej                  d"ddfd8       Z#edd9ddddej                  ej                  ej                  d:d7d;fd<       Z$edId=       Z%edddg d>fdefd?       Z&y@)J
Gpt2HelperzEA helper class for Gpt2 model conversion, inference and verification.FT
batch_sizepast_sequence_lengthsequence_lengthnum_attention_headshidden_sizerD   
vocab_sizedevicern   has_position_idshas_attention_maskinput_ids_dtypeposition_ids_dtypeattention_mask_dtypeleft_side_paddingr`   c                    |rt         j                  nt         j                  }d| ||t        ||z        g}t	        |      D cg c]   }t        j
                  |||      dz  dz
  " }}t        j                  d|dz
  | |f||      }d}|
re||z   }t        j                  | |g||      }|dk\  rAt	        |       D ]3  }t        j                  d|dz
        }|r
d||d|f<   (d||||z
  df<   5 d}|	rQ|j                         j                  d	      dz
  }|j                  |dk  d       |dd|df   j                  |      }t        ||||      S c c}w )
zCreate random inputs for GPT2 model.
        Returns torch tensors of input_ids, position_ids, attention_mask and a list of past state tensors.
        r8   rm   r{   g       @      ?r   r7   )lowhighsizerm   r{   N)r@   rn   rp   intr>   randrandintonesrandomlongcumsummasked_fill_ro   r^   )ru   rv   rw   rx   ry   rD   rz   r{   rn   r|   r}   r~   r   r   r   
float_type
past_shape_rP   r&   rK   total_sequence_lengthrF   padding_lengthrJ   s                            r   get_dummy_inputszGpt2Helper.get_dummy_inputs   s   * '.U]]5==
 112

 `een_op_oZ[JjH3NQTT_opMMao.!
	 $8?$J!"ZZ23*N %)z*A%+^^A7Lq7P%QN(=>q/>/'9:VWq*?.*P*R'RS + )..077;a?L%%lQ&6:'+?+@(@ADDEWXL)\>4HHC qs   %Er   r   model_classc                    |j                   }|j                  }|j                  }|j                  }t        |   d   }	| ||	dk(  r|n|g}
d| |||z   t        ||z        g}|	|
i}t        |      D ]  }||dt        |      z   <    |S )zAReturns a dictionary with output name as key, and shape as value.r7   rZ   r8   present_)rx   ry   num_hidden_layersrz   MODEL_CLASSESr   r>   str)ru   rv   rw   r   r   rx   ry   rD   rz   output_namelast_state_shapepresent_state_shapeoutput_shapesrF   s                 r   get_output_shapeszGpt2Helper.get_output_shapes   s     %88((,,	&&
#K03 %1J{
  ?2112
 %&67y!A1DM*s1v-. " r    c                    |D ]|  }|| v sJ | |   }t        j                  ||         |j                         kD  s8t        j                  t        j                  ||         |j
                  |j                        | |<   ~ y )Nr   )numpyprodnelementr@   emptyrm   r{   )output_buffersr   keybuffers       r   auto_increase_buffer_sizez$Gpt2Helper.auto_increase_buffer_size  ss     C.(((#C(Fzz-,-0AA&+kkJJ}S12 ,,!=='s#	 !r    c                     |rt         j                  nt         j                  }i }| j                         D ]3  \  }}t        j                  t        j                  |      ||      ||<   5 |S )zpReturns a dictionary of output name as key, and 1D tensor as value. The tensor has enough space for given shape.r   )r@   rn   rp   itemsr   r   r   )r   r{   
is_float16	data_typer   nameshapes          r   get_output_bufferszGpt2Helper.get_output_buffers  sX     &0EMMU]]	(..0KD%#(;;uzz%/@	Z`#aN4  1r    c                    | d   j                         j                         }t        j                  ||d   z
        }|r.t        j                  |t        j                  |      dz   z        S t        j                  |      S )zGReturns the maximum difference between PyTorch and OnnxRuntime outputs.r   ư>)cpur   absamax)torch_outputsort_outputsrelativeexpected_outputsdiffs        r   diff_outputszGpt2Helper.diff_outputs%  sn     )+//1779yy)KN:;::deii0@&AD&HIJJ::d##r    c           	         t        j                  |d   | d   j                         j                         ||      }t        j	                  d|        |}t        |      dz
  }t        |      D ]g  }t        j                  |d|z      | d   |   j                         j                         ||      }t        j	                  d| d| d|        |xr |}i |s/t        j                  | |      }	t        j                  d|	d	       |S )
zReturns True if torch and ORT outputs are close for given thresholds, and False otherwise.
        Note: need kwargs since Gpt2BeamSearchHelper.compare_outputs has an extra parameter model_class
        r   )rtolatolz9PyTorch and OnnxRuntime output 0 (last_state) are close: r7   zPyTorch and OnnxRuntime layer z state (present_z) are close:z@PyTorch and OnnxRuntime results are not all close: max_abs_diff=.5f)
r   allcloser   loggerdebugr=   r>   rt   r   info)
r   r   r   r   kwargsis_closeis_all_close
num_layerslayermax_abs_diffs
             r   compare_outputszGpt2Helper.compare_outputs/  s   
 >>+a.-2B2F2F2H2N2N2PW[bfgPQYPZ[\%)
:&E~~AI&a '++-335	H LL9%@PQVPWWcdlcmno'4HL ' %22=+NLKKZ[ghkZlmnr    c                    d}d}g }g }t        t        |            D ]  }||   }|dk(  r| d   n
| d   |dz
     j                         j                         }	t        j                  ||	|d      }
|j                  t        j                  t        j                  |	|z
                     |xr |
}t        j                  |	      j                         rt        j                  d| d       t        j                  |	      j                         rt        j                  d| d       t        j                  |      j                         rt        j                  d	| d       t        j                  |      j                         rt        j                  d	| d       t        j                  ||	z
        }t        j                  |j                         |j                         }|j                  d
||   dd| d||   ddt#        |	|         d       |dk(  st        j                  t        j                  |d      |j                         }t        j                  t        j                  |	d      |	j                         }t        j$                  ||      } |j'                  t)        |            }|t)        |      |||fS )a  Compare outputs from PyTorch and OnnxRuntime

        Args:
            torch_outputs (Tuple[Torch.Tensor]): PyTorch model output
            ort_outputs (List[numpy.ndarray]): OnnxRuntime output
            atol (float, optional): Absolute tollerance. Defaults to 1e-06.

        Returns:
            is_all_close(bool): whether all elements are close.
            max_abs_diff(float): maximum absolute difference.
            messages(str): a list of debug message for each output
        TFr   r7   )r   r   zPyTorch output z has nanz has infzORT output zdiff=z.9fz index=z ort=z torch=N)axis)r>   r=   r   r   r   r?   r   r   isnananyr   r   isinffabsunravel_indexargmaxr   floatarray_equalindexmax)r   r   r   r   is_top1_matched	max_diffsmessagesrF   
ort_outputtorch_outputr   r   idxort_max_indextorch_max_indexmax_diff_output_indexs                   r   compare_outputs_v2zGpt2Helper.compare_outputs_v2J  s`    	s;'(A$QJ01QM!,M!<LQQRU<SXXZ``bL~~j,TPQRHUZZ		,2K(LMN'4HL{{<(,,.qc:;{{<(,,.qc:;{{:&**,{1#X67{{:&**,{1#X67::j<78D%%dkkmTZZ@COOS	#gcU%
37LGTYZfgjZkTlmpSqr Av % 3 3ELLRV4WYcYiYi j"'"5"5ell<VZ6[]i]o]o"p"'"3"3M?"S3 )6 !*I ?	N!
 	
r    onnx_model_pathverboseuse_external_data_formatc
                    | j                   }
|
j                  }t        j                  ddd|
j                  |
j
                  ||
j                  |d|||||	      }|j                         }t        j                         5   | | }ddd       t        |      D cg c]  }d| 	 }}t        |      D cg c]  }d| 	 }}d   j                  d   |
j                  k(  s!|d   j                  d   |
j
                  k(  sJ |d   j                  d   |
j                  k(  rd	nd
g|}dddd|d   dddi}|D ]
  }ddd||<    |D ]
  }ddd||<    dg}|rddd|d<   |j                  d       |rddd|d<   |j                  d       |j                  |       t        |      dk(  rt        |d         |k(  sJ t        j!                  d|j"                  j                   d|j$                  d   j                   d|d   j                   d|d   d   j                          t'        |      j(                  j+                  dd       |rt-        j.                         5 }t0        j2                  j5                  |d      }t'        |      j(                  j+                  dd       t7        | t9        |      |d|||ddd|       t;        j<                  |d      } t?        j@                  | |dd       ddd       yt7        | t9        |      |d|||ddd|       y# 1 sw Y   xY wc c}w c c}w # 1 sw Y   yxY w)z1Export GPT-2 model with past state to ONNX model.r7   F)ru   rv   rw   rx   ry   rD   rz   r{   rn   r|   r}   r~   r   r   Npast_r   r   r8   rZ   r[   r&   ru   seq_len)r   r7   past_seq_len)r7      total_seq_lenrJ   rK   zShapes: input_ids=z past=z output=z	 present=T)parentsexist_okz	gpt2.onnx   )
argsfexport_paramsinput_namesoutput_namesdynamic_axesopset_versiondo_constant_foldingr   r   )load_external_data)save_as_external_dataall_tensors_to_one_file)!r   rN   rt   r   rx   ry   rz   re   r@   no_gradr>   r   r?   rb   r=   r   r   r&   rP   r   parentmkdirtempfileTemporaryDirectoryospathjoinr
   r;   onnx
load_modelr   save)modelr{   r   r   r   r|   r}   r~   r   r   r   rD   dummy_inputsrd   outputsrF   
past_namespresent_namesr   r   r   r   tmp_dir_nametemp_onnx_model_paths                           r   export_onnxzGpt2Helper.export_onnx  s    #\\NN	!22!" & : :**((-1+1!5 3 
  "))+
]]_Z(G  ,1+;<+;aaSk+;
<16y1AB1AA8A31AB qz"f&7&7771:;K;KA;NRXRdRd;ddd$+AJ$4$4Q$76;L;L$LR^oano \i8O)<
 D%1n!EL !D%1o!FL " #m/;	+JL(~.1=/-RL)*/0:&7|q S_	%AAA !7!7!=!= >f\EVEVWXEYE_E_D``hipqrisiyiyhz  {D  EL  MN  EO  PQ  ER  EX  EX  DY  Z	
 	_$$**4$*G#,,.,')ww||L+'N$)*1177t7T!z**"& +!-!-"$(,-1# (<QUV#*.,0	' /.4 :&!"')) $().Q _ =BT /.s%   >L$L14L6%BL;$L.;Mr   c           	          t        d      }	t        | d||d|	d      }
|r5|rt        j                  |
       nd|vrd|d<    |
j                  dddi| |
j                  ||       |
S )	zHOptimize ONNX model with an option to convert it to use mixed precision.r   r   F)
model_type	num_headsry   	opt_leveloptimization_optionsuse_gpukeep_io_typesuse_symbolic_shape_inferTrh   )r   r	   rt   auto_mixed_precisionconvert_float_to_float16save_model_to_file)r   optimized_model_pathr   rx   ry   r   r  stager   r  ms              r   optimize_onnxzGpt2Helper.optimize_onnx  s      -V4)#!5
 #//2"&0.3F?+***SDSFS	13KLr    )AddLayerNormalizationSkipLayerNormalizationFastGeluEmbedLayerNormalization
onnx_modelop_block_listc                 n   | j                         D ch c]  }|j                   }}t        |      }|j                  |      }t        j                  d| d|        | j                         j                  d   j                  }d}| j                         }||v sJ ||   }d}	|j                  dk(  r|}	t        j                  d|j                          d}
|j                  D ]  }| j                  |      }
|
 n t        |
      }t        j                  d|j                   d	|        |d
k  }n/t        j                  d|j                   d|j                          g }g }|s|	|g}|	j                  g}||||d}t        j                  d|         | j                  dddi| |S c c}w )a?  Convert GPT-2 model to mixed precision.
           It detects whether original model has fp16 weights, and set parameters for float16 conversion automatically.
        Args:
            onnx_model (OnnxModel): optimized ONNX model
            op_block_list (List[str], optional): operators to compute in fp32. Defaults to ["Add", "LayerNormalization",
                                                 "SkipLayerNormalization", "FastGelu", "EmbedLayerNormalization"]
        Returns:
            parameters(dict): a dictionary of parameters used in float16 conversion
        z	fp32 op: z
 fp16 op: r   FNMatMulz#Found last MatMul node for logits: z3max diff of converting weights in last MatMul node : r   z-Failed to find MatMul node for logits. Found z	 of node )r  r!  node_block_listforce_fp16_initializersz!auto_mixed_precision parameters: r  Trh   )nodesop_typeset
differencer   r   graphoutputr   output_name_to_nodeinputget_initializerr   r   warningr  )r   r!  nodeop_full_setfp32_op_setfp16_op_setlogits_output_nameis_weight_fp16_precisionr-  last_matmul_nodeinitializerr.  max_diffr  r%  
parameterss                   r   r  zGpt2Helper.auto_mixed_precision  s   ( 1;0@0@0BC0Bt||0BC-(!,,[9i}J{mDE (--/66q9>> $) (<<>!%8888"#56<<8##KK=dii[IJK(88?* $ 1=HLLNtyykY[\d[efg'/$$NNJ4<<.Xabfbkbkalmn(/?/K/0M/445O +*.'?	

 	7
|DE+
++XTXZXa Ds   F2inputs
total_runsc                 `   t         j                  d       |j                         j                         }t	        j
                         5   | | }ddd       |dk(  rS g }t	        j
                         5  t        |      D ]A  }t        j                         } | | }|j                  t        j                         |z
         C 	 ddd       t        |      dz  t        |      z  }t         j                  dj                  t        |d                   |fS # 1 sw Y   xY w# 1 sw Y   axY w)zfRun inference of PyTorch model, and returns average latency in ms when total_runs > 0 besides outputs.zstart pytorch_inferenceNr     zPyTorch inference time = {} ms.2f)r   r   rr   re   r@   r   r>   timer?   sumr=   format)	r  r;  r<  rd   r  latencyr   startaverage_latencys	            r   pytorch_inferencezGpt2Helper.pytorch_inferenceb  s     	./ ^^%--/
]]_Z(G  ?N]]_:&		,tyy{U23 '  g,-G<5<<VOUZ=[\]''! _ _s   D3AD$D!$D-c                    t         j                  d       dt        j                  |j                  j                         j                               i}|j                  Tt        |j                        D ]<  \  }}t        j                  |j                         j                               |d| <   > |j                  >t        j                  |j                  j                         j                               |d<   |j                  >t        j                  |j                  j                         j                               |d<   | j                  d|      }|dk(  r|S g }t        |      D ]N  }t        j                         }	| j                  d|      }|j                  t        j                         |	z
         P t        |      dz  t        |      z  }
t         j                  d	j!                  t!        |
d
                   ||
fS )zcRun inference of ONNX model, and returns average latency in ms when total_runs > 0 besides outputs.zstart onnxruntime_inferencer&   Nr   rK   rJ   r   r>  z"OnnxRuntime Inference time = {} msr?  )r   r   r   ascontiguousarrayr&   r   rP   	enumeraterK   rJ   runr>   r@  r?   rA  r=   rB  )ort_sessionr;  r<  
ort_inputsrF   past_ir   rC  r   rD  rE  s              r   onnxruntime_inferencez Gpt2Helper.onnxruntime_inference|  s    	23!5#:#:6;K;K;O;O;Q;W;W;Y#Z[
;;"&v{{3	6*/*A*A&**,BTBTBV*W
U1#;' 4   ,+0+B+B6CXCXC\C\C^CdCdCf+gJ'(*).)@)@ATATAXAXAZA`A`Ab)cJ~&!oodJ7?z"AIIKE%//$
;KNN499;./ #
 g,-G<9@@Y^A_`aO++r    c           	      8    t        j                  | ||||||      S )z)Returnas IO binding object for a session.)r   prepare_io_binding)rK  r&   rJ   rK   rP   r   r   s          r   rP  zGpt2Helper.prepare_io_binding  s,     11
 	
r    c                 2    t        j                  | |||      S )z3Copy results to cpu. Returns a list of numpy array.)r   "get_outputs_from_io_binding_buffer)rK  r   r   return_numpys       r   rR  z-Gpt2Helper.get_outputs_from_io_binding_buffer  s      AA
 	
r    r   r   rS  include_copy_output_latencyc           	      h   t         j                  d       t        j                  | |j                  |j
                  |j                  |j                  ||      }| j                  |       t        j                  | |||      }|dk(  r|S g }	t        |      D ]g  }
t        j                         }| j                  |       |rt        j                  | |||      }
|	j                  t        j                         |z
         i t        |	      dz  t        |	      z  }t         j                  d|       ||fS )zUInference with IO binding. Returns outputs, and optional latency when total_runs > 0.z*start onnxruntime_inference_with_binded_ior   r>  z4OnnxRuntime with IO binding inference time = %.2f ms)r   r   rt   rP  r&   rJ   rK   rP   run_with_iobindingrR  r>   r@  r?   rA  r=   )rK  r;  r   r   r<  rS  rT  
io_bindingr   rC  r   rD  rE  s                r   $onnxruntime_inference_with_binded_ioz/Gpt2Helper.onnxruntime_inference_with_binded_io  s    	AB  22!!KK

 	&&z2 !CC
 ?z"AIIKE**:6*AA NN499;./ # g,-G<K_]O++r    c                 T   t        d|  dd      5 }t        j                  ||       d d d        t        j	                  d|  d       t        d|  dd      5 }t        j                  ||       d d d        t        j	                  d|  d       y # 1 sw Y   kxY w# 1 sw Y   /xY w)Nort_outputs_.picklewbz$ORT output are saved to ort_outputs_torch_outputs_z(Torch output are saved to torch_outputs_openpickledumpr   r   )rF   r   r   r   s       r   save_outputszGpt2Helper.save_outputs  s    L7+T2aKKQ' 3:1#WEFN1#W-t4KKq) 5>qcIJ 32 54s   BBBB'c                     t        d|  dd      5 }t        j                  ||       d d d        t        j	                  d|  d       y # 1 sw Y   #xY w)Ndummy_inputs_r[  r\  z!inputs are saved to dummy_inputs_r^  )rF   r  r   r   r   s        r   save_inputszGpt2Helper.save_inputs  sI    M!G,d3qKKa( 47s'BC 43s   A

Ar   i'  r7   c                    |j                   }t        j                  d| d| d| d| d|	 d| d       d}d	}d
}d}|r0t        j	                  |||||	      }t        j                  |||      }d}d}g }dg|z  }||z  }t        |      D ]E  }t        ||z        }t        j                  d|      }|dk(  rdnt        j                  d|      } t        j                  d|      }!t        j                  d|! d|  d       t        j                  |!| ||j                  |j                  |j                  |j                  |||
||||d      }"t        j!                  ||"      }#|rt        j#                  | |"      }$n1t        j	                  |!| |||	      }%t        j%                  | |"||%      }$t        j'                  |#|$|      \  }&}'}(})}*t)        j*                  |'      s|j-                  |'       |&r|dz  }|*r|dz  }||xx   dz  cc<   |rr|&spt        j                  d| d|! d|  d| d|' 
       t/        |)      D ]>  \  }}+t        j                  d| d| j1                         |   j2                   d|+        @ |st)        j*                  |'      s
|'d|z  kD  st        j5                  ||"       t        j7                  ||$|#       H |r*dD ,ci c]  },d|, t)        j8                  ||,      d  }-},ndD ,ci c]  },d|, d
 }-},|d z  |z  |-d!<   |D .cg c]
  }.|.d z  |z   c}.|-d"<   |d z  |z  |-d#<   |t;        |      z
  d z  |z  |-d$<   t        j                  d%| d&| d'|t;        |      z
   d(|        |d)|z  kD  r)t        j                  d*t        |dz  |z        d+d,       |-S c c},w c c},w c c}.w )-zKGenerate random inputs and compare the results of PyTorch and Onnx Runtime.zRunning parity test (atol=z, test_cases=z, runs=z, use_io_binding=z, model_class=z, is_float16=z) ...      r8   Nr   r7   z#Running parity test for batch_size=z past_sequence_length=z...T)r~   r   r   r   )r   z
test_case=z batch_size=z sequence_length=z	 MaxDiff=	z: Name=z, d   )2   Z   _   c   max_diff_percentile_r   nanr   top1_match_ratetop1_match_rate_per_rundiff_pass_ratenan_ratezParity Test Cases=z	; Passed=z; Nan=z; Top1_Matched=gffffff?zParity is good: passed rate=z.0f%)r   r   r   rt   r   r   r>   r   r   r   r   r   rx   ry   rN   rz   rF  rN  rX  r   r   r   r?   rI  get_outputsr   re  rb  
percentiler=   )/rK  r  r{   r   r   r   test_cases_per_runr<  use_io_bindingr   r|   r}   r~   r   r   r  r   enable_pickle_outputr   max_batch_sizemax_past_seq_lenmax_seq_lenr   max_output_shapespassed_test_casestop1_matched_casesmax_abs_diff_listtop1_matched_cases_per_runtotal_test_casesrF   run_idrw   rv   ru   r  r  r   r   r   r   r   r   r   messagerq   rC   xs/                                                  r   test_parityzGpt2Helper.test_parity  s   . #\\(m<N;OwWaVbbs  uC  tD  DR  S^  R_  _l  mw  lx  x}  ~	
  * < < 0+v{! (::;LfV`aN&'S:%5"-
:'(A//0F$nnQ<O).!1&..L\:] >:JLL5j\AWXlWmmpq &66$**""!! " /#5%9"& 7 L" !225,GG(>>{LY * < <(#! )MM~} --g{-N%;;|,!((6!Q&!"a'"*62a72| <
|;QRfQggx  zI  yJ  JS  T`  Sa  b #,H"5JAwKK"QCw{/F/F/H/K/P/P.QQST[S\ ]^ #6 $\)BlUX[_U_F_&&q,7'';@G )J eueu`a&qc*u/?/?@QST/UVY.Z[eu   BRRAQA,QC0%7AQFR$6$<?O$O !Sm,nSmaQW7I-ISm,n()#4s#:=M#M .5F1GG3NQaaz !1 2)<M;NfUehkl}h~U~T  @O  Pb  Oc  d	
 t&666KK6s;Ls;RUe;e7fgj6kklmn% S -os   #N9N>"Orj  rg      c                    |j                   }d}|r0t        j                  |||||      }t        j                  |||      }t        j	                  ||||j
                  |j                  |j                  |j                  |||||	|
|      }|rt        j                  | ||      \  }}|S t        j                  | |||      \  }}|S )zCGenerate random inputs and measure average latency of Onnx Runtime.N)r~   r   r   )r   rt   r   r   r   rx   ry   rN   rz   rN  rX  )rK  r  r{   r   r<  ry  r   r|   r}   r~   r   r   ru   rw   rv   r   r   r   r  r   rC  s                        r   test_performancezGpt2Helper.test_performance  s    ( #\\&880/6;M (::=&R\]N!22 &&NN+1!5 3 
" #99+|U_`JAw 	 $HH\>=*JAw r    c                     t         j                  ddd|j                  |j                  |j                  |j
                  |d||      j                         }t        j                  j                  | |      S )zJIT trace for TorchScript.r7   F)ru   rv   rw   rx   ry   rD   rz   r{   rn   r|   r}   )
rt   r   rx   ry   rN   rz   re   r@   jittrace)r  r   r{   r|   r}   rd   s         r   torchscriptzGpt2Helper.torchscript  ss      00!" & : :**nn((-1 1 
 ') 	 yyuj11r    rawfp32fp16int8c           
         |}t         j                  j                  |      rt        |      j                  d   }n|j                  d      d    |dk7  r|d|z   z  }|r|dz  }|rdddd	d
}d
D ]  }t         j                  j                  | |||   z         }	t         j                  j                  |	      sI||v r/	 t        j                  |	       t        j                  d|	        |t        j                  d| d|	         t         j                  j                  t         j                  j                  | |      |dz         t         j                  j                  t         j                  j                  | |dz         |dz         t         j                  j                  t         j                  j                  | |dz         |dz         t         j                  j                  t         j                  j                  | |d	z         |dz         d
S t         j                  j                  | |dz         t         j                  j                  | |dz         t         j                  j                  | |dz         t         j                  j                  | |dz         d
S # t        $ r0}
t        j                  d|	 d|
j                          Y d}
~
dd}
~
ww xY w)z=Build a  path name for given model based on given attributes.r   /r   r   _past _fp32_fp16_int8r  zRemoved the existed directory: zFailed to remove the directory r$  NzDirectory for z
 existed: z.onnxz
_fp32.onnxz
_fp16.onnxz
_int8.onnx)r   r   isdirr   partssplitr   existsshutilrmtreer   r   OSErrorstrerror)
output_dirmodel_name_or_pathr   has_past
new_folderremove_existing
model_namesuffixr  new_dires              r   get_onnx_pathszGpt2Helper.get_onnx_paths  s`    (
77==+,0177;JS!"%++#++J'!J'7SF=
'',,z:z@R3RS77>>'*!_4c"MM'2"KK*I'(ST nZL
7)$TU > ww||BGGLLZ$H*W^J^_GGLLZ'-AB- GGLLZ'-AB- GGLLZ'-AB- " 77<<
J,@AGGLLZ,-FGGGLLZ,-FGGGLLZ,-FG	
 	
-  ' c"KK*I'RTUVU_U_T`(abbcs   =-J	K(%KKN)r   )F)MbP?r  )r   )FFr   )r   )T)r   TF)TT)'r(   r)   r*   r+   rQ   r@   int32r   r{   boolrm   r^   r   r   r   dictr<   r   r   r   r   r   r   r  r  r   r  rF  rN  rP  rR  TensorrX  rb  re  r  r  r  r  rh   r    r   rt   rt      s   O !%#'',{{*/++,1KK"&>I>I!>I >I !	>I
 >I >I >I >I >I >I !>I >I "KK>I $kk>I  >I  
!>I >I@  -  !    	 
   
c49n	   D 	 	   $ $  4 3
 3
j 
 ).!%#'',{{*/++,1KKu u 	u
 #'u u !u u "KKu $kku un  "'"! !F $
CCCyC CJ ( ( ( (2 ,: ,3 , ,> 
 
( 
 
  !,10,0, S%,,./0, CcN+	0,
 0, 0, &*0, 0,d K K D D
 
  % ;;"[["%E EN 
 % ;;"[[4 4l 2 2"  -7:
 :
 :
r    rt   )/loggingr   r`  r   r  r   r@  pathlibr   r   r  r@   benchmark_helperr   rn   r   fusion_optionsr   io_binding_helperr   r   r   	optimizerr	   torch_onnx_export_helperr
   transformersr   r   r   r   	getLoggerr(   r   PRETRAINED_GPT2_MODELSFLOAT32FLOAT16INT8DEFAULT_TOLERANCEr   r/   r4   rS   rW   r   r^   rt   rh   r    r   <module>r     s    	          & - ( -   $ 6 L L			8	$W  vsNNC N9 N8[ 8!E) !EHE E$E/ E" *8T:"=x!O|T2S S>_
 _
r    