
    Bvh@                        d dl Z d dlZd dlZd dlZd dlmZmZmZmZm	Z	 d dl
mZmZ d dlmZmZ d dlmZ  ej$                  e      Z G d d      Z G d d	e      Z G d
 de      Z G d d      Zd Zd Zd Zedk(  r e        yy)    N)AttentionInputIDsAttentionOutputIDsMultiHeadAttentionInputIDsMultiHeadAttentionOutputIDs	Operators)helper
load_model)	NodeProto	OnnxModel)SymbolicShapeInferenceHelperc                       e Zd ZdedefdZdedz  fdZdedz  fdZdedz  fdZ	de
fd	Zd
ee   dee   ddfdZd
ee   dee   ddfdZdededdfdZdedz  fdZdde
ddfdZy)PackingAttentionBasemodelattention_op_typec                     || _         g | _        g | _        d| _        i | _        | j                   j                   j
                  j                  | _        || _        | j                   j                  |      | _
        y )NF)r   nodes_to_removenodes_to_addprune_graphnode_name_to_graph_namegraphnamethis_graph_namer   get_nodes_by_op_typeattention_nodes)selfr   r   s      Z/RAG/venv/lib/python3.12/site-packages/onnxruntime/transformers/convert_to_packing_mode.py__init__zPackingAttentionBase.__init__   sg     %
%'"$!&-/$$(JJ$4$4$:$:$?$?!2#zz>>?PQ    returnNc                 n   | j                   t        j                  k(  rt        j                  nt
        j                  }| j                         }|rt        |j                        |k  ry |j                  |   }| j                  D ].  }t        |j                        |k  s|j                  |   |k7  s. y  |S N)r   r   	ATTENTIONr   
MASK_INDEXr   KEY_PADDING_MASK_try_getting_first_attentionleninputr   )r   
mask_indexfirst_attention_nodeattention_masknodes        r   _try_getting_attention_maskz0PackingAttentionBase._try_getting_attention_mask$   s     %%)<)<< ((+<< 	
  $@@B#s+?+E+E'F*'T-33J? ((D4::*,

:0F.0X ) r   c                 R    t        | j                        dk  ry | j                  d   S )Nr   )r&   r   r   s    r   r%   z1PackingAttentionBase._try_getting_first_attention8   s)    t##$)##A&&r   c                     d }| j                   j                         D ]?  }|j                  t        j                  k(  s|j                  t        j
                  k(  s>|}A |S r!   )r   nodesop_typer   	LAYERNORMSKIPLAYERNORM)r   last_layernorm_noder+   s      r   _try_getting_last_layernormz0PackingAttentionBase._try_getting_last_layernorm>   sP    "JJ$$&D||y222dlliF]F]6]&*# ' #"r   c                     t               r!   NotImplementedErrorr.   s    r   _are_attentions_supportedz.PackingAttentionBase._are_attentions_supportedE       !##r   inputsoutputsc                 *   t        j                  t        j                  ||| j                  j                  t        j                              }d|_        | j                  j                  |       | j                  | j                  |j                  <   y Nr;   r<   r   com.microsoft)r   	make_noder   REMOVEPADDINGr   create_node_namedomainr   appendr   r   r   r   r;   r<   new_nodes       r   _insert_removepadding_nodez/PackingAttentionBase._insert_removepadding_nodeH   sp    ####,,Y-D-DE	
 *  *6:6J6J$$X]]3r   c                 *   t        j                  t        j                  ||| j                  j                  t        j                              }d|_        | j                  j                  |       | j                  | j                  |j                  <   y r>   )r   rA   r   RESTOREPADDINGr   rC   rD   r   rE   r   r   r   rF   s       r   _insert_restorepadding_nodez0PackingAttentionBase._insert_restorepadding_nodeT   sp    ##$$,,Y-E-EF	
 *  *6:6J6J$$X]]3r   token_offsetcumulative_sequence_lengthc                     t               r!   r7   )r   rL   rM   s      r   )_replace_attention_with_packing_attentionz>PackingAttentionBase._replace_attention_with_packing_attention`   r:   r   c                 x    | j                   t        j                  k(  r|j                  t        j
                     S y r!   )r   r   r"   r'   r   INPUT)r   r)   s     r   _get_input_to_remove_paddingz1PackingAttentionBase._get_input_to_remove_paddingc   s1    !!Y%8%88'--.?.E.EFFr   use_symbolic_shape_inferc                 p   t         j                  d       | j                         sy | j                         }|sy | j	                         }| j                         }|sy | j                  |      }|sy |dz   }|dz   }|dz   }|dz   }	| j                  ||g||||	g       | j                  j                  ||       t         j                  d       |j                  d   dz   }
| j                  |
|g|j                  d   g       | j                  j                  |j                  d   |
       t         j                  d	|j                   d
       | j                  ||       t         j                  d| j                   d| j                          | j                  j!                  | j"                         | j                  j%                  | j&                  | j(                         | j*                  r| j                  j+                          n2| j"                  s| j&                  r| j                  j-                          | j                  j/                          |r^t1        | j                  j                  d      }|j3                  | j                  j                  dd      }|r|| j                  _        y y y )Nz$start converting to packing model..._no_padding_token_offset_cumulated_seq_len_max_seq_lenz'inserted RemovePadding before Attentionr   _restore_inputz#inserted RestorePadding after last z layerz	replaced z with PackedverboseTF)
auto_mergeguess_output_rank)loggerdebugr9   r,   r%   r5   rR   rH   r   replace_input_of_all_nodesoutputrK   replace_output_of_all_nodesr1   rO   r   remove_nodesr   	add_nodesr   r   r   update_graphclean_shape_inferr   infer_shapes)r   rS   r*   r)   r4   input_to_remove_paddingoutput_without_paddingrL   cumulated_seq_lenmax_seq_lenrestorepadding_inputshape_infer_helperinferred_models                r   convertzPackingAttentionBase.converth   sw   ;<--/99;#@@B">>@" #'"C"CDX"Y&!8=!H.@36JJ->''$n5#\3DkR	
 	

--.EG]^>?  399!<?OO((*>)MPcPjPjklPmOno

../B/I/I!/LNbc:;N;V;V:WW]^_ 	66|EVWy!7!7 8TE[E[D\]^

 4 45

T..0L0LMJJ""$!!T%6%6JJ##%

$$&# ">djj>N>NXY!Z/<<TZZ=M=MZ^rw<xN#1

   $r   T)__name__
__module____qualname__r   strr   r,   r
   r%   r5   boolr9   listrH   rK   rO   rR   ro    r   r   r   r      s    Ri RC RS4Z ('i$.> '#Y-= #$4 $
Kc 
KT#Y 
KSW 
K
K$s) 
Kd3i 
KTX 
K$c $gj $os $C$J 
72 72 72r   r   c                   D     e Zd Zdef fdZdefdZdededdfdZ xZ	S )	PackingAttentionr   c                 B    t         |   |t        j                         y r!   )superr   r   r"   r   r   	__class__s     r   r   zPackingAttention.__init__   s    	 3 34r   r   c                    | j                   D ]  }t        j                  |d       yt        j                  |d       yt        j                  |d      }||dk7  r yt        |j                        t
        j                  kD  r|j                  t
        j                     s yt        |j                        t
        j                  kD  s|j                  t
        j                     r y y)Npast_present_share_bufferF	do_rotaryunidirectionalr   T)r   r   get_node_attributer&   r'   r   PASTPAST_SEQUENCE_LENGTH)r   r+   unidirection_attrs      r   r9   z*PackingAttention._are_attentions_supported   s    ((D++D2MNZ++D+>J ) < <TCS T ,1Ba1G4::!2!7!77

K\KaKa@bDJJ"3"H"HH

#4#I#IJ ) r   rL   rM   Nc           
         | j                   D ]  }t        |j                        t        j                  kD  r|j                  t        j                     nd}t        j                  t        j                  |j                  t        j                     |j                  t        j                     |j                  t        j                     |||g|j                  t        j                     g| j                  j!                  t        j                              }g }|j"                  D ]"  }|j$                  dv s|j'                  |       $ |j"                  j)                  |       d|_        | j,                  j'                  |       | j.                  j'                  |       | j0                  | j2                  |j$                  <    t4        j7                  dt        | j                                y )N r?   )	num_headsqkv_hidden_sizesscaler@   z0Converted %d Attention nodes to PackedAttention.)r   r&   r'   r   ATTENTION_BIASr   rA   r   PACKEDATTENTIONrQ   WEIGHTSBIASra   r   OUTPUTr   rC   	attributer   rE   extendrD   r   r   r   r   r^   info)r   rL   rM   	attentionattention_biaspacked_attention
attributesattrs           r   rO   z:PackingAttention._replace_attention_with_packing_attention   s   --I y'*;*J*JJ  1 @ @A 
  &//))OO$5$;$;<OO$5$=$=>OO$5$:$:; ." #))*<*C*CDEZZ001J1JK  J!++99 JJ%%d+ , &&--j9&5#$$%56  ''	2BFBVBVD(()9)>)>?; .> 	FDL`L`Habr   )
rq   rr   rs   r   r   ru   r9   rt   rO   __classcell__r}   s   @r   ry   ry      s;    5i 54 $ cc  cgj  cos  cr   ry   c                   v     e Zd Zdef fdZdedefdZdedefdZde	fdZ
d	ed
eddfdZdedz  fdZ xZS )PackingMultiHeadAttentionr   c                 B    t         |   |t        j                         y r!   )r{   r   r   MULTI_HEAD_ATTENTIONr|   s     r   r   z"PackingMultiHeadAttention.__init__   s    	 > >?r   indexr   c                     t        |j                        |kD  r:t        |j                  |         dkD  rt        j                  d| d| d|        yy)'Check a node does not have given input.r   znode input  (0) is not supported in PackedMultiHeadAttention: FT)r&   r'   r^   errorr   r+   r   r   s       r   _check_empty_inputz,PackingMultiHeadAttention._check_empty_input   sP    tzz?U"4::e$%){5'D69ijniopqr   c                     t        |j                        |kD  r:t        |j                  |         dkD  rt        j                  d| d| d|        yy)r   r   znode output r   r   FT)r&   ra   r^   r   r   s       r   _check_empty_outputz-PackingMultiHeadAttention._check_empty_output   sQ    t{{e#4;;u%&*|E7"TF:jkojpqrr   r   c                 f   | j                   D ]!  }|j                  D ]8  }|j                  dvst        j	                  d|j                   d|          y |j
                  t        j                     r4|j
                  t        j                     st        j	                  d        y| j                  |t        j                  d      re| j                  |t        j                  d      rD| j                  |t        j                  d      r#| j                  |t        j                  d      r" y y)	Nr   mask_filter_valuer   znode attribute z/ is not supported in PackedMultiHeadAttention: Fz=packed kv format is not supported in PackedMultiHeadAttentionpast_keypresent_keyT)r   r   r   r^   r   r'   r   KEYVALUEr   PAST_KEY
PAST_VALUEr   r   PRESENT_KEYPRESENT_VALUE)r   r+   r   s      r   r9   z3PackingMultiHeadAttention._are_attentions_supported   s    ((D99$OOLL?499+=lmqlr!st  '
 zz4889$**MgMmMmBn\] ''.H.Q.QS]^++D2L2W2WYcd,,T3N3Z3Z\ij,,T3N3\3\^kl! )$ r   rL   rM   Nc                 F   d}| j                   D ]P  }t        |j                        t        j                  kD  r|j                  t        j                     nd}t        j                  t        j                  |j                  t        j                     |j                  t        j                     |j                  t        j                     |j                  t        j                     |||g|j                  t        j                     g| j                   j#                  t        j                              }g }|j$                  D ]"  }|j&                  dv s|j)                  |       $ |j$                  j+                  |       d|_        | j.                  j)                  |       | j0                  j)                  |       | j2                  | j4                  |j&                  <   |s| j                   j7                  |t        j                        }	|	s|	j8                  dk(  st        |	j                        dk(  s1|	j                  j)                  |       |dz  }S t:        j=                  d	t        | j                                t:        j=                  d
|       y )Nr   r   r?   r   r@   GatedRelativePositionBias      zBConverted %d MultiHeadAttention nodes to PackedMultiHeadAttention.z=Converted %d GatedRelativePositionBias nodes to packing mode.)r   r&   r'   r   r   r   rA   r   PACKED_MULTI_HEAD_ATTENTIONQUERYr   r   r   ra   r   r   r   rC   r   r   rE   r   rD   r   r   r   r   
get_parentr1   r^   r   )
r   rL   rM   gated_relative_pos_bias_countmhar   
packed_mhar   r   rel_pos_bias_nodes
             r   rO   zCPackingMultiHeadAttention._replace_attention_with_packing_attention  s   ()%''C syy>$>$M$MM 		4CCD 
  ))55II8>>?II8<<=II8>>?II8==> ." $?$F$FGHZZ001V1VWJ J99 KK%%d+ &   ''
3 /J$$Z0  '',<@<P<PD((9 $(JJ$9$9#?Y?h?h$i!%)115PP-3349%++22<@1Q61S (V 	XZ]^b^r^rZstSUrsr   c                 |    | j                   j                  |d      }|r|j                  dk(  r|j                  d   S y )Nr   MatMul)r   r   r1   r'   )r   r)   matmuls      r   rR   z6PackingMultiHeadAttention._get_input_to_remove_padding4  s8    &&';Q?fnn0<<?"r   )rq   rr   rs   r   r   intrt   r   r   ru   r9   rO   rR   r   r   s   @r   r   r      st    @i @c  s # 4 *.tc .tgj .tos .t`C$J r   r   c                   *    e Zd ZdefdZddeddfdZy)PackingModer   c                     || _         y r!   )r   )r   r   s     r   r   zPackingMode.__init__=  s	    
r   rS   r   Nc                    | j                   j                  t        j                        re| j                   j                  t        j                        rt
        j                  d       y t        | j                         }|j                  |      S | j                   j                  t        j                        r&t        | j                         }|j                  |      S t
        j                  d       y )NzRPacking mode does not support both Attention and MultiHeadAttention in same graph.zPPacking mode requires either Attention or MultiHeadAttention node in onnx graph.)
r   r   r   r"   r   r^   r   ry   ro   r   )r   rS   packings      r   ro   zPackingMode.convert@  s    ::**9+>+>?zz..y/M/MNqr&tzz2G??#;<<ZZ,,Y-K-KL/

;G??#;<<LLklr   rp   )rq   rr   rs   r   r   ru   ro   rw   r   r   r   r   <  s!    i   r   r   c                  R   t        j                  d      } | j                  ddt        d       | j                  ddt        d       | j                  d	d
dd       | j	                  d
       | j                  dd
dd       | j	                  d
       | j                         }|S )Nz_Convert to packing mode tool for ONNX Runtime. It converts BERT like model to use packing mode.)descriptionz--inputTzinput onnx model path)requiredtypehelpz--outputzoptimized onnx model pathz	--verboseF
store_truezshow debug information.)r   actionr   rZ   z--use_external_data_formatz4use external data format to store large model (>2GB)use_external_data_format)argparseArgumentParseradd_argumentrt   set_defaults
parse_args)parserargss     r   _parse_argumentsr   O  s    $$uF 	DsAXY

TB]^
eLOhi
&
$C	   7DKr   c                 d    | rt        j                  dd       y t        j                  d       y )NDEBUGz8[%(filename)s:%(lineno)s - %(funcName)20s()] %(message)s)levelfmtz%(funcName)20s: %(message)s)r   )coloredlogsinstallrZ   s    r   _setup_loggerr   g  s*    J	

 	 =>r   c                     t               } t        | j                         t        j	                  d|         t
        j                  j                  | j                        t
        j                  j                  | j                        k(  rt        j                  d       t        | j                        }t        t        |            }|j                          |j                  j!                  | j                  | j"                         y )Nz
arguments:zYSpecified the same input and output path. Note that this may overwrite the original modelr   )r   r   r[   r^   r_   ospathrealpathr'   ra   warningr	   r   r   ro   r   save_model_to_filer   )r   r   packing_modes      r   mainr   q  s    D$,,
LL:dV$%	ww

#rww'7'7'DDrstzz"Ey/0L))$++PTPmPm)nr   __main__)r   loggingr   r   	constantsr   r   r   r   r   onnxr   r	   
onnx_modelr
   r   rm   r   	getLoggerrq   r^   r   ry   r   r   r   r   r   rw   r   r   <module>r      s      	   $ + ;			8	$F2 F2R6c+ 6cr^ 4 ^B &0?o  zF r   