diff --git a/modules/call_queue.py b/modules/call_queue.py
index b50931bcd..d22c23b31 100644
--- a/modules/call_queue.py
+++ b/modules/call_queue.py
@@ -1,8 +1,9 @@
+import os.path
 from functools import wraps
 import html
 import time
 
-from modules import shared, progress, errors, devices, fifo_lock
+from modules import shared, progress, errors, devices, fifo_lock, profiling
 
 queue_lock = fifo_lock.FIFOLock()
 
@@ -111,8 +112,13 @@ def wrap_gradio_call(func, extra_outputs=None, add_stats=False):
         else:
             vram_html = ''
 
+        if shared.opts.profiling_enable and os.path.exists(shared.opts.profiling_filename):
+            profiling_html = f"<p class='profile'> [ <a href='{profiling.webpath()}' download>Profile</a> ] </p>"
+        else:
+            profiling_html = ''
+
         # last item is always HTML
-        res[-1] += f"<div class='performance'><p class='time'>Time taken: <wbr><span class='measurement'>{elapsed_text}</span></p>{vram_html}</div>"
+        res[-1] += f"<div class='performance'><p class='time'>Time taken: <wbr><span class='measurement'>{elapsed_text}</span></p>{vram_html}{profiling_html}</div>"
" return tuple(res) diff --git a/modules/processing.py b/modules/processing.py index 65e37db0a..91cb94db1 100644 --- a/modules/processing.py +++ b/modules/processing.py @@ -16,7 +16,7 @@ from skimage import exposure from typing import Any import modules.sd_hijack -from modules import devices, prompt_parser, masking, sd_samplers, lowvram, infotext_utils, extra_networks, sd_vae_approx, scripts, sd_samplers_common, sd_unet, errors, rng +from modules import devices, prompt_parser, masking, sd_samplers, lowvram, infotext_utils, extra_networks, sd_vae_approx, scripts, sd_samplers_common, sd_unet, errors, rng, profiling from modules.rng import slerp # noqa: F401 from modules.sd_hijack import model_hijack from modules.sd_samplers_common import images_tensor_to_samples, decode_first_stage, approximation_indexes @@ -843,7 +843,8 @@ def process_images(p: StableDiffusionProcessing) -> Processed: # backwards compatibility, fix sampler and scheduler if invalid sd_samplers.fix_p_invalid_sampler_and_scheduler(p) - res = process_images_inner(p) + with profiling.Profiler(): + res = process_images_inner(p) finally: sd_models.apply_token_merging(p.sd_model, 0) diff --git a/modules/profiling.py b/modules/profiling.py new file mode 100644 index 000000000..95b59f71a --- /dev/null +++ b/modules/profiling.py @@ -0,0 +1,46 @@ +import torch + +from modules import shared, ui_gradio_extensions + + +class Profiler: + def __init__(self): + if not shared.opts.profiling_enable: + self.profiler = None + return + + activities = [] + if "CPU" in shared.opts.profiling_activities: + activities.append(torch.profiler.ProfilerActivity.CPU) + if "CUDA" in shared.opts.profiling_activities: + activities.append(torch.profiler.ProfilerActivity.CUDA) + + if not activities: + self.profiler = None + return + + self.profiler = torch.profiler.profile( + activities=activities, + record_shapes=shared.opts.profiling_record_shapes, + profile_memory=shared.opts.profiling_profile_memory, + with_stack=shared.opts.profiling_with_stack + ) + + def __enter__(self): + if self.profiler: + self.profiler.__enter__() + + return self + + def __exit__(self, exc_type, exc, exc_tb): + if self.profiler: + shared.state.textinfo = "Finishing profile..." + + self.profiler.__exit__(exc_type, exc, exc_tb) + + self.profiler.export_chrome_trace(shared.opts.profiling_filename) + + +def webpath(): + return ui_gradio_extensions.webpath(shared.opts.profiling_filename) + diff --git a/modules/shared_options.py b/modules/shared_options.py index e2e02094f..104d8a544 100644 --- a/modules/shared_options.py +++ b/modules/shared_options.py @@ -129,6 +129,22 @@ options_templates.update(options_section(('system', "System", "system"), { "dump_stacks_on_signal": OptionInfo(False, "Print stack traces before exiting the program with ctrl+c."), })) +options_templates.update(options_section(('profiler', "Profiler", "system"), { + "profiling_explanation": OptionHTML(""" +Those settings allow you to enable torch profiler when generating pictures. +Profiling allows you to see which code uses how much of computer's resources during generation. +Each generation writes its own profile to one file, overwriting previous. +The file can be viewed in Chrome, or on a Perfetto web site. +Warning: writing profile can take a lot of time, up to 30 seconds, and the file itelf can be around 500MB in size. 
+"""), + "profiling_enable": OptionInfo(False, "Enable profiling"), + "profiling_activities": OptionInfo(["CPU"], "Activities", gr.CheckboxGroup, {"choices": ["CPU", "CUDA"]}), + "profiling_record_shapes": OptionInfo(True, "Record shapes"), + "profiling_profile_memory": OptionInfo(True, "Profile memory"), + "profiling_with_stack": OptionInfo(True, "Include python stack"), + "profiling_filename": OptionInfo("trace.json", "Profile filename"), +})) + options_templates.update(options_section(('API', "API", "system"), { "api_enable_requests": OptionInfo(True, "Allow http:// and https:// URLs for input images in API", restrict_api=True), "api_forbid_local_requests": OptionInfo(True, "Forbid URLs to local resources", restrict_api=True), diff --git a/style.css b/style.css index 467c29cdf..64ef61bad 100644 --- a/style.css +++ b/style.css @@ -279,7 +279,7 @@ input[type="checkbox"].input-accordion-checkbox{ display: inline-block; } -.html-log .performance p.time, .performance p.vram, .performance p.time abbr, .performance p.vram abbr { +.html-log .performance p.time, .performance p.vram, .performance p.profile, .performance p.time abbr, .performance p.vram abbr { margin-bottom: 0; color: var(--block-title-text-color); } @@ -291,6 +291,10 @@ input[type="checkbox"].input-accordion-checkbox{ margin-left: auto; } +.html-log .performance p.profile { + margin-left: 0.5em; +} + .html-log .performance .measurement{ color: var(--body-text-color); font-weight: bold;