run basic torch calculation at startup in parallel to reduce the performance impact of first generation

2024-11-27 06:40:10 +08:00 · 2023-05-21 21:55:14 +03:00 · 2023-05-21 21:55:14 +03:00 · 8faac8b963
commit 8faac8b963
parent 1f3182924b
2 changed files with 21 additions and 1 deletions
--- a/modules/devices.py
+++ b/modules/devices.py
@ -1,5 +1,7 @@
 import sys
 import contextlib
+from functools import lru_cache
+
 import torch
 from modules import errors

@ -154,3 +156,19 @@ def test_for_nans(x, where):
    message += " Use --disable-nan-check commandline argument to disable this check."

    raise NansException(message)
+
+
+@lru_cache
+def first_time_calculation():
+    """
+    just do any calculation with pytorch layers - the first time this is done it allocaltes about 700MB of memory and
+    spends about 2.7 seconds doing that, at least wih NVidia.
+    """
+
+    x = torch.zeros((1, 1)).to(device, dtype)
+    linear = torch.nn.Linear(1, 1).to(device, dtype)
+    linear(x)
+
+    x = torch.zeros((1, 1, 3, 3)).to(device, dtype)
+    conv2d = torch.nn.Conv2d(1, 1, (3, 3)).to(device, dtype)
+    conv2d(x)
--- a/webui.py
+++ b/webui.py
@ -20,7 +20,7 @@ import logging

 logging.getLogger("xformers").addFilter(lambda record: 'A matching Triton is not available' not in record.getMessage())

-from modules import paths, timer, import_hook, errors  # noqa: F401
+from modules import paths, timer, import_hook, errors, devices  # noqa: F401

 startup_timer = timer.Timer()

@ -295,6 +295,8 @@ def initialize_rest(*, reload_script_modules=False):
    # (when reloading, this does nothing)
    Thread(target=lambda: shared.sd_model).start()

+    Thread(target=devices.first_time_calculation).start()
+
    shared.reload_hypernetworks()
    startup_timer.record("reload hypernetworks")