
    Bvh                        d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZ d dlZd dl	Z	d dl
Z
d dlmZ dddddd	d
ddd	ZdddddZd Zd Zd@dZdededefdZdedededefdZdededededef
d Z	 dAded!ed"efd#Zd$efd%Z	 dAded!efd&Z	 dAdeded'ededed(ed)eded*ed+ed,ed"efd-Z	 	 	 dBdededed.efd/Z	 	 dCded!efd0Z	 	 dCdeded'ededed(ed)eded*ed+ed.ed"efd1Z	 	 dDd2ed3ededed(ed)eded*ed+ed4ed5ed6efd7Z 	 	 	 dEd2ed3edededed(ed)eded*ed+ed4ed5ed6ed"efd8Z!	 	 	 dEd2ed3ededed(ed)eded*ed+ed4ed5ed"efd9Z"	 	 	 dEd2ed3ededed(ed)eded*ed+ed4ed5ed"efd:Z#	 dFdededededed(ed)eded*ed+ed"efd;Z$d< Z%dFd=Z&d> Z'e(d?k(  rd dl)Z)	  e'        yy# e*$ r!  e)jV                   ejX                           Y yw xY w)G    N)Pathmeasure_memoryzrunwayml/stable-diffusion-v1-5zstabilityai/stable-diffusion-2z stabilityai/stable-diffusion-2-1z+stabilityai/stable-diffusion-xl-refiner-1.0z/stabilityai/stable-diffusion-3-medium-diffusersz'stabilityai/stable-diffusion-3.5-mediumz&stabilityai/stable-diffusion-3.5-largez black-forest-labs/FLUX.1-schnellzblack-forest-labs/FLUX.1-dev)	1.5z2.02.1zxl-1.0z3.0Mz3.5Mz3.5LzFlux.1SzFlux.1DCUDAExecutionProviderROCMExecutionProviderMIGraphXExecutionProviderTensorrtExecutionProvider)cudarocmmigraphxtensorrtc                      g d} d}| |fS )N)
z.a photo of an astronaut riding a horse on marsz@cute grey cat with blue eyes, wearing a bowtie, acrylic paintingzia cute magical flying dog, fantasy art drawn by disney concept artists, highly detailed, digital paintingzdan illustration of a house with large barn with many cute flower pots and beautiful blue sky sceneryzgone apple sitting on a table, still life, reflective, full color photograph, centered, close-up productzWbackground texture of stones, masterpiece, artistic, stunning photo, award winner photozSnew international organic style house, tropical surroundings, architecture, 8k, hdrznbeautiful Renaissance Revival Estate, Hobbit-House, detailed painting, warm colors, 8k, trending on Artstationzcblue owl, big green eyes, portrait, intricate metal design, unreal engine, octane render, realisticzldelicate elvish moonstone necklace on a velvet background, symmetrical intricate motifs, leaves, flowers, 8kz*bad composition, ugly, abnormal, malformed )promptsnegative_prompts     d/RAG/venv/lib/python3.12/site-packages/onnxruntime/transformers/models/stable_diffusion/benchmark.pyexample_promptsr   )   s    G COO##    c                       y)N)zwarm upbadr   r   r   r   warmup_promptsr   <   s    r   c                      t        d|| |      S )NT)is_gpufuncmonitor_typestart_memoryr   )r   r   r   s      r   measure_gpu_memoryr   @   s    D|Zfggr   
model_name	directorydisable_safety_checkerc                 p   ddl m}m} dd l}|Ft        j
                  j                  |      sJ |j                         }|j                  |||      }n|j                  | d|d      }|j                  |j                  j                        |_
        |j                  d       |rd |_        d |_        |S )Nr   )DDIMSchedulerOnnxStableDiffusionPipeline)providersess_optionsonnxT)revisionr&   use_auth_tokendisable)	diffusersr$   r%   onnxruntimeospathexistsSessionOptionsfrom_pretrainedfrom_config	schedulerconfigset_progress_bar_configsafety_checkerfeature_extractor)	r    r!   r&   r"   r$   r%   r.   session_optionspipes	            r   get_ort_pipeliner<   D   s    Dww~~i(((%446*::( ; 
 +::	 ; 
 #..t~~/D/DEDN   ."!%Kr   enable_torch_compileuse_xformersc                    d| v rddl m} |j                  | t        j                        j                  d      }|rQ|j                  j                  t        j                         t        j                  |j                  dd	      |_        |S d
| v rddl m	} |j                  | t        j                        j                  d      }|rQ|j                  j                  t        j                         t        j                  |j                  dd	      |_        |S ddl m
}m} ddlm}	m}
 |j                  | |
      j                  d      }|j                  j                  |	       |r|j                          |rwt        j                  |j                        |_        t        j                  |j                        |_        t        j                  |j                         |_        t#        d       |j%                  |j&                  j(                        |_        |j+                  d       |rd |_        d |_        |S )NFLUXr   )FluxPipeline)torch_dtyper   )memory_formatzmax-autotuneT)mode	fullgraphzstable-diffusion-3)StableDiffusion3Pipeline)r$   StableDiffusionPipeline)channels_lastfloat16z)Torch compiled unet, vae and text_encoderr+   )r-   rA   r3   torchbfloat16totransformerrH   compilerF   r$   rG   rI   unet*enable_xformers_memory_efficient_attentionvaetext_encoderprintr4   r5   r6   r7   r8   r9   )r    r"   r=   r>   rA   r;   rF   r$   rG   rH   rI   s              r   get_torch_pipelinerT   b   s   *++JENN+SVVW]^e.A.AB$}}T-=-=N^bcDz)6'77
PUP^P^7_bbcije.A.AB$}}T-=-=N^bcD@,"22:72SVVW]^DIILL}L-779MM$)),	==*!MM$*;*;<9:"..t~~/D/DEDN   ."!%Kr   engine
batch_sizestepsc                 v    |j                  d      d   j                  dd      }|  d| d| d| |rdz   S d	z   S )
N/zstable-diffusion-sd__b_s _safe)splitreplace)rU   r    rV   rW   r"   short_model_names         r   get_image_filename_prefixrd      sV    !'',R0889LdSXQ'(:,b@J`Bnnfmnnr   image_filename_prefixskip_warmupc                    
 ddl m} t         |      sJ t               \  }} 
fd}t	        |	||      }t	        |	||      } |        g }t        |      D ]  \  }}||k\  r nt        j                         }  |gz  |gz        j                  }t        j                         }||z
  }|j                  |       t        d|dd       t        |      D ]  \  }}|j                  | d| d| d	       !  dd
lm} d|||t        |      t        |      z  t        j                   |      ||dS )Nr   )r%   c                  P    ry t               \  } } | gz  |gz         y )Npromptheightwidthnum_inference_stepsr   r   )rj   negativerV   rk   r;   rf   rW   rl   s     r   warmupz run_ort_pipeline.<locals>.warmup   s;    )+8j( %%J3	
r   ri   Inference took .3f secondsr\   .jpg__version__r.   rU   versionrk   rl   rW   rV   batch_countnum_promptsaverage_latencymedian_latencyfirst_run_memory_MBsecond_run_memory_MB)r-   r%   
isinstancer   r   	enumeratetimeimagesappendrS   saver.   rv   sumlen
statisticsmedian)r;   rV   re   rk   rl   rW   rz   ry   r   memory_monitor_typerf   r%   r   r   rp   first_run_memorysecond_run_memorylatency_listirj   inference_startr   inference_endlatencykimageort_versions   `` ```    `                r   run_ort_pipeliner      sr    6d7888.0G_

 

 **=v|T*+>U
HLw'	6))+8j( %,-
:
 & 	 		/1G$}H56!&)HAuJJ/0!AaS=> * ($ 7   ""|,s</@@$++L9/ 1 r   returnc                     |s|rd| in	d| g|z  ini }t         j                  j                         r(t        j                  d      j	                  d      |d<   |S )Nr   r   )device{   	generator)rJ   r   is_available	Generatormanual_seed)r   use_num_images_per_promptis_fluxrV   kwargss        r   get_negative_prompt_kwargsr      se      ) 0#o%6%CD   zz #ooV<HHM{Mr   c                 b   
 t               \  }}dd l}t         |j                         
fd}t	        |	||      }t	        |	||      } |        t        j                  d       g }t        |      D ]  \  }}||k\  r nt
        j                  j                          t        j                         }t        |d      }  d|gz  d|j                  }t
        j                  j                          t        j                         }||z
  }|j                  |       t        d|dd       t        |      D ]  \  }}|j                  | d| d| d	       !  d
t
        j                   ||t#        |      t%        |      z  t'        j(                  |      ||dS )Nr   c                  d    ry t               \  } }t        |d      } d| gz  	d| y )NFrj   rk   rl   rm   r   r   r   )
rj   ro   extra_kwargsrV   rk   r   r;   rf   rW   rl   s
      r   rp   z"run_torch_pipeline.<locals>.warmup  sE    )+1(E7JWqVHz)&[`qdpqr   Fr   rq   rr   rs   r\   rt   rJ   rw   r   )r   r-   r   rA   r   rJ   set_grad_enabledr   r   synchronizer   r   r   r   rS   r   rv   r   r   r   r   )r;   rV   re   rk   rl   rW   rz   ry   r   r   rf   r   r   r-   rp   r   r   r   r   rj   r   r   r   r   r   r   r   r   s   `` ```    `                @r   run_torch_pipeliner      s     /0G_y556Gr r **=v|T*+>U
H	5!Lw'	6

 ))+1/5'S]^ 
8j( %	

 
 & 	 	

 		/1G$}H56!&)HAuJJ/0!AaS=> *' (. $$ ""|,s</@@$++L9/ 1 r   r&   rk   rl   rz   ry   tuningc                 L   |}|r|dv r|dddf}t        j                          }t        | |||      }t        j                          }t        d||z
   d       t        d| |||      }t	        ||||||||	|
||      }|j                  | ||j                  dd	      |d
d       |S )N)r   r	      )tunable_op_enabletunable_op_tuning_enableModel loading took rs   ortrf   ExecutionProviderr_   Fr    r!   r&   r"   enable_cuda_graph)r   r<   rS   rd   r   updaterb   )r    r!   r&   rV   r"   rk   rl   rW   rz   ry   r   r   r   rf   provider_and_options
load_startr;   load_endre   results                       r   run_ortr   ;  s      $(PP (_`*abJJ	3GI_`Dyy{H	: 56h
?@5eZUZ\rsF MM$" (()<bA&<!&	
 Mr   use_io_bindingc                     ddl m} |4t        j                  j	                  |      r|j                  |||      }n&|j                  | d||      }|j                  |       |rd |_        d |_        |S )Nr   )ORTPipelineForText2Image)r&   r   T)exportr&   r   )	optimum.onnxruntimer   r/   r0   r1   r3   save_pretrainedr8   r9   )r    r!   r&   r"   r   r   pipelines          r   get_optimum_ort_pipeliner   o  s~     =	!:+;;IPXiw;x+;;)	 < 
 	  +"&%)"Or   c                    
 t        dt                      ddlm} t	         |      t               \  }} 
f	d}t        |	||      }t        |	||      } |        t        |
      }g }t        |      D ]  \  }}||k\  r nt        j                         }
r  d|d|j                  }n  d|gz  d|j                  }t        j                         }||z
  }|j                  |       t        d|dd	       t        |      D ]  \  }}|j                  | d
| d
| d       !  ddlm} d||t        |      t!        |      z  t#        j$                  |      ||dS )NzPipeline typer   )ORTFluxPipelinec            	         	 ry t               \  } }t        |
      }
r d| 	d| y  d| gz  	d| y )Nrj   rk   rl   rm   num_images_per_promptr   r   r   )rj   ro   r   ry   rV   rk   r   r;   rf   rW   r   rl   s      r   rp   z(run_optimum_ort_pipeline.<locals>.warmup  ss    )+1(<UW^`jk$ $)&1  u:-fE_duhtur   r   r   rq   rr   rs   r\   rt   ru   optimum_ortrw   r   )rS   type&optimum.onnxruntime.modeling_diffusionr   r   r   r   r   r   r   r   r   r   r.   rv   r   r   r   r   )r;   rV   re   rk   rl   rW   rz   ry   r   r   r   rf   r   r   r   rp   r   r   r   r   r   rj   r   r   r   r   r   r   r   r   s   `` ``` `  ``                 @r   run_optimum_ort_pipeliner     s    
/4:&F/G.0G_v v& **=v|T*+>U
H-o?XZacmnLLw'	6))+$ $)&0  f   x*,V5^cgsf  		/1G$}H56!&)HAuJJ/0!AaS=> *+ (0 7   ""|,s</@@$++L9/ 1 r   c                 p   t        j                          }t        | ||||      }t        j                          }t        d||z
   d       |r| dz   t        |      j                  z   n| }t        d||||      }t        ||||||||	|
||      }|j                  | ||j                  dd      |d	d
       |S )Nr   r   rs   r\   optimumr   r   r_   Fr   )	r   r   rS   r   namerd   r   r   rb   )r    r!   r&   rV   r"   rk   rl   rW   rz   ry   r   r   r   rf   r   r;   r   full_model_namere   r   s                       r   run_optimum_ortr     s      J#Ix)?P^D yy{H	: 56h
?@AJj3&i)=)==PZO5?J7M &F MM$" (()<bA&<!&	
 Mr   work_dirrx   max_batch_sizenvtx_profileuse_cuda_graphc                   - t        d       ddlm}  |        |k  sJ ddlm}  ||      }|j                         }ddlm}m} ddl	m
} |j                  } || ||      \  }}}}} ||d|d|||||		      --j                  j                  |||d
dddt        j                  j!                                -j#                         -fd}t%        |
||	      }t%        |
||	      } |        t'        d||      }g }t)               \  } }!t+        |       D ]  \  }"}#|"|k\  r nt-        j,                         }$-j/                  |#gz  |!gz  dd      \  }%}&t-        j,                         }'|'|$z
  }(|j1                  |(       t        d|(dd|&        t+        |%      D ]  \  })}*|*j3                  | d|" d|) d       !  -j5                          ddlm}+ ddlm}, i d|j=                         ddd|,dd|+ dd|d d!d"d#d$|d%|d&t?        |      tA        |      z  d'tC        jD                  |      d(|d)|d*|d+|S ),Nzd[I] Initializing ORT TensorRT EP accelerated StableDiffusionXL txt2img pipeline (static input shape)r   init_trt_pluginsPipelineInfo
EngineTypeget_engine_pathsrG   DDIMFr5   
output_dirverboser   r   r   framework_model_direngine_type   T)opt_image_heightopt_image_widthopt_batch_sizestatic_batchstatic_image_shapemax_workspace_size	device_idc                  \    t               \  } }j                  | gz  |gz         y N)denoising_stepsr   run)rj   ro   rV   rk   r   rW   rl   s     r   rp   z"run_ort_trt_static.<locals>.warmup]  s5    )+fX
*XJ,CVUdijr   ort_trtg      @r   r   guidanceseedEnd2End took rr    seconds. Inference latency: r\   rt   ru   r    rU   r.   rx   r&   z	tensorrt()r!   rk   rl   rW   rV   ry   rz   r{   r|   r}   r~   r"   r   )#rS   trt_utilitiesr   diffusion_modelsr   
short_nameengine_builderr   r   pipeline_stable_diffusionrG   ORT_TRTbackendbuild_enginesrJ   r   current_deviceload_resourcesr   rd   r   r   r   r   r   r   teardownr   rv   r.   r   r   r   r   r   ).r   rx   rV   r"   rk   rl   rW   rz   ry   r   r   r   r   r   r   r   pipeline_infor   r   r   rG   r   onnx_dir
engine_dirr   r   r\   rp   r   r   re   r   r   r   r   rj   r   r   pipeline_timer   r   r   r   trt_versionr   r   s.     ` ```                                      @r   run_ort_trt_staticr    s9     

pq /'''- )M))+J;A$$K?OPXZgit?u<Hj*&91 '!%%/
H ""
!**++- #   FE:6k k **=v|T*+>U
H5iZY^`vwL.0G_w'	6))+ (Hz!
*! !- !
 		/1G$gc]*GWX!&)HAuJJ/0!AaS=> *% (* 36m((*- 	; 	i}A.	
 	Z 	& 	 	 	j 	{ 	{ 	3|,s</@@ 	*++L9 	/ 	 1  	!"8!" 	^# r   c                   1 t        d       ddlm} ddlm}  |        |k  sJ ddlm}  ||      }ddlm}m	} ddl
m} |j                  } || ||      \  }}}}} ||d|d	||d
|      11j                  j                  |||dd
d
d	|       t        1j                  j!                         1j                  j!                               }|j#                  |      \  }}1j                  j%                  |       1j'                         1fd} t)        || |
      }!t)        || |
      }" |         t+        d||      }#g }$t-               \  }%}&t/        |%      D ]  \  }'}(|'|k\  r nt1        j0                         })1j3                  |(gz  |&gz  d      \  }*}+t1        j0                         },|,|)z
  }-|$j5                  |-       t        d|-dd|+        t/        |*      D ]  \  }.}/|/j7                  |# d|' d|. d       !  1j9                          dd l}0d|0j<                  d|	|t?        |$      tA        |$      z  tC        jD                  |$      |!|"|dS )N][I] Initializing TensorRT accelerated StableDiffusionXL txt2img pipeline (static input shape)r   cudartr   r   r   r   r   FT)r5   r   r   r   r   r   r   r   r  r   r  
onnx_opsetr   r   r   r   static_shapeenable_all_tacticstiming_cachec                  b    ry t               \  } }j                  | gz  |gz         y r   r   )rj   ro   rV   rk   r   rf   rW   rl   s     r   rp   z#run_tensorrt_static.<locals>.warmup  s;    )+fX
*XJ,CVUdijr   trtr   )r   r   r   rr   r   r\   rt   r   default)rU   rx   r&   rk   rl   rW   rV   ry   rz   r{   r|   r}   r~   r   )#rS   r   r  r   r   r   r   r   r   r   r   rG   TRTr   load_enginesmaxmax_device_memory
cudaMallocactivate_enginesr  r   rd   r   r   r   r   r   r   r  r   rv   r   r   r   r   )2r   rx   r    rV   r"   rk   rl   rW   rz   ry   r   r   r   r   r   rf   r  r   r   r  r   r   rG   r   r  r  r   r   r  r  r\   shared_device_memoryrp   r   r   re   r   r   r   r   rj   r   r   r  r   r   r   r   r  r   s2      ` ```       `                                 @r   run_tensorrt_staticr    s   $ 

ij /'''- )M;A..KJZ-KGHj*&9<
 '!%	H !!/! ! "  H,,>>@(BRBRBdBdBfg$//0ABA%%&:; FE:6k k **=v|T*+>U
H5eZUZ\rsL.0G_w'	6))+ (Hz!
*! !- !
 		/1G$gc]*GWX!&)HAuJJ/0!AaS=> *# ((  ?? ""|,s</@@$++L9/ 1+ r   c                    *+,-./ t        d       dd l}ddlm} ddlm} ,-,dz  dk7  s-dz  dk7  rt        d, d- d       |        k  sJ dd	lm} dd
l	m
*m+ *+ f	d}ddlm}  ||      } |||      .t        .j                  j!                         .j                  j!                               }|j#                  |      \  }}.j                  j%                  |       .j'                  ,-       d,-.fd	//fd}t)        |
||	      }t)        |
||	      } |        |j+                         }t-        d||      }g }t/               \  }} t1        |      D ]  \  }!}"|!|k\  r nt3        j2                         }# /|"gz  | gz  d      \  }$}%t3        j2                         }&|&|#z
  }'|j5                  |'       t        d|'dd|%        t1        |$      D ]  \  }(})|)j7                  | d|! d|( d       !  .j9                          |d|j:                  d||t=        |      t?        |      z  tA        jB                  |      ||dS )Nr
  r   r  r      zCImage height and width have to be divisible by 8 but specified as: z and .r   r   c                    	 	j                   } ||      \  }}}}} | |d|d||	      }|j                  j                  |||d
ddd|       |S )Nr   Fr   r   Tr  )r  r   r  )pipeline_classr  r   r  r  r   r   r  r   r   rV   r   rk   r   r   r   rl   r   s            r   init_pipelinez-run_tensorrt_static_xl.<locals>.init_pipelineK  s     nnN^m[O
K*j*=|
 "!%)) 3#

 	%%! 3%#!$% 	& 	
 r   r   c           	      4    j                  | |d|      S Ng      @r   r   )rj   r   r   image_heightimage_widthr   rW   s      r   run_sd_xl_inferencez3run_tensorrt_static_xl.<locals>.run_sd_xl_inference{  s.    ||!  
 	
r   c                  H    ry t               \  } } | gz  |gz         y Nrn   rj   ro   rV   r(  rf   s     r   rp   z&run_tensorrt_static_xl.<locals>.warmup  .    )+VHz1H:
3JKr   r  r   r   r   rr   r   r\   .pngr   r  r    rU   rx   r&   rk   rl   rW   rV   ry   rz   r{   r|   r}   r~   r   r*  )"rS   r   r   r  r   r   
ValueErrorr   r   r   r   r   r   rG   r  r   r  r  r  r  r   r   rd   r   r   r   r   r   r  rv   r   r   r   r   )0r   rx   rV   r"   rk   rl   rW   rz   ry   r   r   r   r   r   rf   r  r  r   r   r"  rG   r  r  r\   r  rp   r   r   r    re   r   r   r   r   rj   r   r   r  r   r   r   r   r   r   r&  r'  r   r(  s0   ` ` ```    ````                           @@@@@@r   run_tensorrt_static_xlr1  $  s   " 

ij. LKa1a1 4QR^Q__depdqqrs
 	

 '''-;! !F B )M4mDHH,,>>@(BRBRBdBdBfg$//0ABA%%&:; L+zB	
 	
L **=v|T*+>U
H##%J5eZUZ\rsL.0G_w'	6))+ 3VHz4IOK\_iKips t		/1G$gc]*GWX!&)HAuJJ/0!AaS=> * (  !?? ""|,s</@@$++L9/ 1+ r   c                   %& ddl m} ddlm}  |||j                  | ||      %|k  sJ %j                         d%fd	&&fd}t        |
||	      }t        |
||	      } |        %j                  j                         }t        d||      }g }t               \  }}t        |      D ]  \  }}||k\  r nt        j                         } &|gz  |gz  d	      \  }}t        j                         }||z
  }|j                  |       t        d
|dd|        t        |      D ]-  \  } }!| d| d|  d}"|!j                  |"       t        d|"       /  %j!                          ddlm}# ddlm}$ |d|$d|# d||t)        |      t+        |      z  t-        j.                  |      |||dS )Nr   )initialize_pipeline)r   )rx   r   r   rk   rl   r   r   r   c           	      4    j                  | |d|      S r$  r%  )rj   r   r   rk   r   rW   rl   s      r   r(  z+run_ort_trt_xl.<locals>.run_sd_xl_inference  s.    ||!  
 	
r   c                  H    ry t               \  } } | gz  |gz         y r*  rn   r+  s     r   rp   zrun_ort_trt_xl.<locals>.warmup  r,  r   r   r   r-  r   rr   r   r\   r.  zImage saved toru   r.   r   r   r/  r*  )
demo_utilsr3  r   r   r   r  r   r  r   rd   r   r   r   r   rS   r   r  r   rv   r.   r   r   r   r   )'r   rx   rV   r"   rk   rl   rW   rz   ry   r   r   r   r   r   rf   r3  r   rp   r   r   r    re   r   r   r   r   rj   r   r   r  r   r   r   r   filenamer  r   r   r(  s'     ` ```       `                      @@r   run_ort_trt_xlr8    s   " /)"&&%%!	H '''FE:6	
 	
L **=v|T*+>U
H'',,.J5iZY^`vwL.0G_w'	6))+ 3VHz4IOK\_iKips t		/1G$gc]*GWX!&)HAu/0!AaS=HJJx "H- * ( 36 !{m1- ""|,s</@@$++L9/ 1+ r   c                 H   dt         j                  j                  _        dt         j                  j                  _        t        j
                  d       t        j                         }t        | |||      }t        j                         }t        d||z
   d       t        d| |||      }|s4t        j                         5  t        ||||||||	|
||      }d d d        nt        ||||||||	|
||      }j                  | d |rdn|rdnd	|dd
       |S # 1 sw Y   *xY w)NTFr   rs   rJ   r   rN   xformersr  r   )rJ   backendscudnnenabled	benchmarkr   r   rT   rS   rd   inference_moder   r   )r    rV   r"   r=   r>   rk   rl   rW   rz   ry   r   r   rf   r   r;   r   re   r   s                     r   	run_torchr@  "  s7    $(ENN %)ENN"	5!Jj*@BVXdeDyy{H	: 56h
?@5gz:W\^tu!!#'%#'F $# $!#
 MM$%9	\z_h&<!&	
 MM $#s   DD!c                  x   t        j                         } | j                  dddt        dg dd       | j                  dd	dt        d
t	        t
        j                               d       | j                  dddd       | j                  dddt        t	        t        j                               dd       | j                  dddt        d d       | j                  dddt        dd       | j                  dddd        | j                  d!       | j                  d"ddd#        | j                  d$       | j                  d%ddd&        | j                  d'       | j                  d(ddd)        | j                  d*       | j                  d+ddd,        | j                  d-       | j                  d.d/t        d0g d1d23       | j                  d4dt        d5d6       | j                  d7dt        d5d8       | j                  d9d:dt        d;d<       | j                  d=d>dt        d?d@       | j                  dAdBdt        t        d0dC      dDdE       | j                  dFdGdt        t        d0dH      dIdJ       | j                  dKdLdddM        | j                  dN       | j                         }|S )ONz-ez--engineFr.   )r.   r   rJ   r   z-Engines to benchmark. Default is onnxruntime.)requiredr   r  choiceshelpz-rz
--providerr   z8Provider to benchmark. Default is CUDAExecutionProvider.z-tz--tuning
store_truezsEnable TunableOp and tuning. This will incur longer warmup latency, and is mandatory for some operators of ROCm EP.)actionrD  z-vz	--versionr   z>Stable diffusion version like 1.5, 2.0 or 2.1. Default is 1.5.)rB  r   rC  r  rD  z-pz
--pipelinez[Directory of saved onnx pipeline. It could be the output directory of optimize_pipeline.py.)rB  r   r  rD  z-wz
--work_dirr  z?Root directory to save exported onnx models, built engines etc.z--enable_safety_checkerzEnable safety checker)rB  rF  rD  )enable_safety_checkerz--enable_torch_compilez#Enable compile unet for PyTorch 2.0)r=   z--use_xformerszUse xformers for PyTorch)r>   z--use_io_bindingzUse I/O Binding for Optimum.r   z--skip_warmupz
No warmup.r   z-bz--batch_sizer   )r            r  
          z)Number of images per batch. Default is 1.)r   r  rC  rD  z--heighti   z$Output image height. Default is 512.z--widthz#Output image width. Default is 512.z-sz--steps2   zNumber of steps. Default is 50.z-nz--num_promptsrK  z!Number of prompts. Default is 10.z-cz--batch_count      z(Number of batches to test. Default is 5.z-mz--max_trt_batch_sizerL  rJ  zdMaximum batch size for TensorRT. Change the value may trigger TensorRT engine rebuild. Default is 4.z-gz--enable_cuda_graphz/Enable Cuda Graph. Requires onnxruntime >= 1.16)r   )argparseArgumentParseradd_argumentstrlist	PROVIDERSkeys	SD_MODELSset_defaultsintrange
parse_args)parserargss     r   parse_argumentsr_  g  sw   $$&F
?<   Y^^%&G   a	   Y^^%&M   j   N   !$	   e4
 2	   U3
'	   U+
+	   u-
	   E*
+8   3   2   .   0   a7   as   >   %0DKr   c                     dd l }|j                  t        j                               }|j	                         D ].  | rt        fddD              st        j                         0 y )Nr   c              3   :   K   | ]  }|j                   v   y wr*  )r0   ).0xlibs     r   	<genexpr>z)print_loaded_libraries.<locals>.<genexpr>  s     )`A_A!sxx-A_s   )libculibnvr   )psutilProcessr/   getpidmemory_mapsanyrS   r0   )cuda_related_onlyrh  prd  s      @r   print_loaded_librariesro    sF    ryy{#A}}!c)`A_)`&`#((O r   c                     t               } t        |        | j                  dk(  r| j                  dv rdt        j
                  d<   ddlm} ddlm} |j                  |      |j                  d      k(  rdt        j
                  d	<   | j                  rb| j                  dk(  r| j                  d
v r| j                  t        d      |j                  |      |j                  d      k  rt        d      t        j                  d       | j                  dk(  rdnd}t!        |d       }t        d|       t"        | j                     }t$        | j                     }| j                  dk(  rS| j                  dk(  rCd| j                  v rt        d       t'        | j(                  | j                  | j*                  d| j,                  | j.                  | j0                  | j2                  | j4                  ||| j6                  d| j                  | j8                        }nRt        d       t;        | j(                  | j                  | j*                  | j<                   | j,                  | j.                  | j0                  | j2                  | j4                  ||| j6                  d| j                  | j8                        }n| j                  dk(  r|dk(  rd| j                  v rdt        j
                  d	<   t?        || j                  || j*                  | j<                   | j,                  | j.                  | j0                  | j2                  | j4                  ||| j@                  | j8                        }n| j                  dk(  r| j                  r)t        jB                  jE                  | j                        sJ d       t        d| d| jF                          tI        || j                  || j*                  | j<                   | j,                  | j.                  | j0                  | j2                  | j4                  ||| jF                  | j8                         }n| j                  dk(  rd| j                  v rt        d!       tK        | j(                  | j                  | j*                  d| j,                  | j.                  | j0                  | j2                  | j4                  ||| j6                  d| j                  | j8                        }ne| j                  dk(  rt        d"       tM        d>i d#| j(                  d$| j                  d%|d&| j*                  d'dd(| j,                  d)| j.                  d*| j0                  d+| j2                  d,| j4                  d-|d.|d/| j6                  d0dd1| j                  d2| j8                  }nt        d3| jN                   d4| jP                   d5       tS        || j*                  | j<                   | jN                  | jP                  | j,                  | j.                  | j0                  | j2                  | j4                  ||| j8                  6      }t        |       tU        d7d8d9:      5 }g d;}	tW        jX                  ||	<      }
|
j[                          |
j]                  |       d d d        | j0                  d=k(  rt_        | j                  d
v        y y # 1 sw Y   1xY w)?Nr.   )r   1ORT_DISABLE_TRT_FLASH_ATTENTIONr   )rx   ru   z1.16.0!ORT_ENABLE_FUSED_CAUSAL_ATTENTION)r   r   z:The stable diffusion pipeline does not support CUDA graph.z1.16z.CUDA graph requires ONNX Runtime 1.16 or laterz%(funcName)20s: %(message)s)fmtr   r   z&GPU memory used before loading models:r   xlzNTesting Txt2ImgXLPipeline with static input shape. Backend is ORT TensorRT EP.TF)r   rx   rV   r"   rk   rl   rW   rz   ry   r   r   r   r   r   rf   zLTesting Txt2ImgPipeline with static input shape. Backend is ORT TensorRT EP.r   r   )r    r!   r&   rV   r"   rk   rl   rW   rz   ry   r   r   r   rf   z?--pipeline should be specified for the directory of ONNX modelsz/Testing diffusers StableDiffusionPipeline with z provider and tuning=)r    r!   r&   rV   r"   rk   rl   rW   rz   ry   r   r   r   rf   zGTesting Txt2ImgXLPipeline with static input shape. Backend is TensorRT.zETesting Txt2ImgPipeline with static input shape. Backend is TensorRT.r   rx   r    rV   r"   rk   rl   rW   rz   ry   r   r   r   r   r   rf   zNTesting Txt2ImgPipeline with dynamic input shape. Backend is PyTorch: compile=z, xformers=r  )r    rV   r"   r=   r>   rk   rl   rW   rz   ry   r   r   rf   zbenchmark_result.csvar_   )rD   newline)r    r!   rU   rx   r&   r"   rk   rl   rW   rV   ry   rz   r{   r|   r}   r~   r   )
fieldnamesr   r   )0r_  rS   rU   rx   r/   environ	packagingr.   rv   parser   r&   r   r0  coloredlogsinstallr   rX  rV  r8  r   rV   rk   rl   rW   rz   ry   max_trt_batch_sizerf   r  rG  r   r   r0   isdirr   r   r1  r  r=   r>   r@  opencsv
DictWriterwriteheaderwriterowro  )r^  rx   r   r   r   sd_modelr&   r   csv_filecolumn_names
csv_writers              r   mainr     sH   D	$K{{m#<<7" =@BJJ89%:==%x)@@ ?BBJJ:;!!KK=0T]]FZ5Z_c_l_l_t !]^^}}[)GMM&,AA !QRR9:$(MMV$;&%&94@L	
2LA&H'H{{m#(C4<<bc#??'+{{jjjj ,, ,,)$7#66"#55 ,,F$ `a'??+/+E+E'E{{jjjj ,, ,,)$7#66"#55 ,,F" 
		!h2I&I4<<>ABJJ:; mm'+'A'A#A;;****((((% 3..((
  
	%}}t}}!= 	
M	
= 	?zI^_c_j_j^klmmm'+'A'A#A;;****((((% 3;;((
  

	"tt||';WX']]LL#';;****((((% 32211((
" 

	"UV$ 
]]
LL
  
 	

 $(
 ;;
 **
 **
 ((
 ((
 &
 !4
  22
 
  11
  ((!
& 	\]a]v]v\w  xC  DH  DU  DU  CV  VW  X	
 '+'A'A#A!%!:!:**;;****((((% 3((
  
&M	$3	;x
& ^^HF
 F#- 
<2 zzQt}}0DDE 3 
<	;s   =[88\__main__r*  )F)r   TF)FF)FT)FTF)T)-rQ  r  r/   r   sysr   pathlibr   __init__r|  rJ   benchmark_helperr   rX  rV  r   r   r   rT  boolr<   rT   rZ  rd   r   dictr   r   r   r   r   r   r  r  r1  r8  r@  r_  ro  r  __name__	traceback	Exceptionprint_exceptionexc_infor   r   r   <module>r     s    
 	  
      + ,+-;=541-
	 $#++		$&h  X\ <*3 * *\` *pt *Zoc os o oTW oqu o  HH H HVcg < FF Fn 111 1 	1
 !1 1 1 1 1 1 1 1n %#'  !	
 L $WW WN !222 2 	2
 !2 2 2 2 2 2 2 2D @@@ @ !	@
 @ @ @ @ @ @ @ @b !EEE E 	E
 !E E E E E E E E E  !Ej SSS S !	S
 S S S S S S S SF eee e !	e
 e e e e e e e ej BBB !B 	B
 B B B B B B BJm`JFZ z3	 
  3!	!!<3<<>23s   F" "#GG