From f501292d787a43569bc17526be89a0e87e21cc64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Leonardo=20Eug=C3=AAnio?= Date: Sun, 11 Dec 2022 20:13:35 -0300 Subject: [PATCH] Patch kernel to not shut down on GPU overheating; the detection is wonky --- flake.nix | 1 + ...dgpu-disable-shutdown-on-overtheating.diff | 44 +++++++++++++++++++ system/amdgpu-kernel-patches.nix | 10 +++++ 3 files changed, 55 insertions(+) create mode 100644 patches/kernel/amdgpu-disable-shutdown-on-overtheating.diff create mode 100644 system/amdgpu-kernel-patches.nix diff --git a/flake.nix b/flake.nix index 4689792..84166de 100644 --- a/flake.nix +++ b/flake.nix @@ -83,6 +83,7 @@ ./hosts/monolith.nix ./system/gitlab-runner.nix ./system/btusb-kernel-patches.nix + ./system/amdgpu-kernel-patches.nix ] ++ common_modules; }; rainbow = lib.nixosSystem { diff --git a/patches/kernel/amdgpu-disable-shutdown-on-overtheating.diff b/patches/kernel/amdgpu-disable-shutdown-on-overtheating.diff new file mode 100644 index 0000000..05908c3 --- /dev/null +++ b/patches/kernel/amdgpu-disable-shutdown-on-overtheating.diff @@ -0,0 +1,44 @@ +diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c +index bfe80ac0a..5343b8b86 100644 +--- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c ++++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c +@@ -614,7 +614,6 @@ int phm_irq_process(struct amdgpu_device *adev, + * Try to do a graceful shutdown to prevent further damage. + */ + dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n"); +- orderly_poweroff(true); + } else if (src_id == VISLANDS30_IV_SRCID_CG_TSS_THERMAL_HIGH_TO_LOW) + dev_emerg(adev->dev, "ERROR: GPU under temperature range detected!\n"); + else if (src_id == VISLANDS30_IV_SRCID_GPIO_19) { +@@ -633,7 +632,6 @@ int phm_irq_process(struct amdgpu_device *adev, + * Try to do a graceful shutdown to prevent further damage. 
+ */ + dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n"); +- orderly_poweroff(true); + } else + dev_emerg(adev->dev, "ERROR: GPU under temperature range detected!\n"); + } else if (client_id == SOC15_IH_CLIENTID_ROM_SMUIO) { +diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c +index 70b560737..11373a474 100644 +--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c ++++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c +@@ -1444,7 +1444,6 @@ static int smu_v11_0_irq_process(struct amdgpu_device *adev, + * Try to do a graceful shutdown to prevent further damage. + */ + dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n"); +- orderly_poweroff(true); + break; + case THM_11_0__SRCID__THM_DIG_THERM_H2L: + dev_emerg(adev->dev, "ERROR: GPU under temperature range detected\n"); +diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c +index 89f0f6eb1..99024cfec 100644 +--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c ++++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c +@@ -1386,7 +1386,6 @@ static int smu_v13_0_irq_process(struct amdgpu_device *adev, + * Try to do a graceful shutdown to prevent further damage. + */ + dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n"); +- orderly_poweroff(true); + break; + case THM_11_0__SRCID__THM_DIG_THERM_H2L: + dev_emerg(adev->dev, "ERROR: GPU under temperature range detected\n"); diff --git a/system/amdgpu-kernel-patches.nix b/system/amdgpu-kernel-patches.nix new file mode 100644 index 0000000..22e26a3 --- /dev/null +++ b/system/amdgpu-kernel-patches.nix @@ -0,0 +1,10 @@ +{ config, pkgs, lib, inputs, ... }: { + # boot.kernelPackages = lib.mkDefault pkgs.linuxPackages_latest; + boot.kernelPatches = [ + { + name = "amdgpu-disable-shutdown-on-overtheating"; + patch = + ../patches/kernel/amdgpu-disable-shutdown-on-overtheating.diff; + } + ]; +}