Patch kernel to not shut down on GPU overheating; the detection is wonky
This commit is contained in:
		
							parent
							
								
									c20dd2afd1
								
							
						
					
					
						commit
						f501292d78
					
				
					 3 changed files with 55 additions and 0 deletions
				
			
		| 
						 | 
					@ -83,6 +83,7 @@
 | 
				
			||||||
            ./hosts/monolith.nix
 | 
					            ./hosts/monolith.nix
 | 
				
			||||||
            ./system/gitlab-runner.nix
 | 
					            ./system/gitlab-runner.nix
 | 
				
			||||||
            ./system/btusb-kernel-patches.nix
 | 
					            ./system/btusb-kernel-patches.nix
 | 
				
			||||||
 | 
					            ./system/amdgpu-kernel-patches.nix
 | 
				
			||||||
          ] ++ common_modules;
 | 
					          ] ++ common_modules;
 | 
				
			||||||
        };
 | 
					        };
 | 
				
			||||||
        rainbow = lib.nixosSystem {
 | 
					        rainbow = lib.nixosSystem {
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										44
									
								
								patches/kernel/amdgpu-disable-shutdown-on-overtheating.diff
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										44
									
								
								patches/kernel/amdgpu-disable-shutdown-on-overtheating.diff
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,44 @@
 | 
				
			||||||
 | 
					diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c
 | 
				
			||||||
 | 
					index bfe80ac0a..5343b8b86 100644
 | 
				
			||||||
 | 
					--- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c
 | 
				
			||||||
 | 
					+++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c
 | 
				
			||||||
 | 
					@@ -614,7 +614,6 @@ int phm_irq_process(struct amdgpu_device *adev,
 | 
				
			||||||
 | 
					 			 * Try to do a graceful shutdown to prevent further damage.
 | 
				
			||||||
 | 
					 			 */
 | 
				
			||||||
 | 
					 			dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
 | 
				
			||||||
 | 
					-			orderly_poweroff(true);
 | 
				
			||||||
 | 
					 		} else if (src_id == VISLANDS30_IV_SRCID_CG_TSS_THERMAL_HIGH_TO_LOW)
 | 
				
			||||||
 | 
					 			dev_emerg(adev->dev, "ERROR: GPU under temperature range detected!\n");
 | 
				
			||||||
 | 
					 		else if (src_id == VISLANDS30_IV_SRCID_GPIO_19) {
 | 
				
			||||||
 | 
					@@ -633,7 +632,6 @@ int phm_irq_process(struct amdgpu_device *adev,
 | 
				
			||||||
 | 
					 			 * Try to do a graceful shutdown to prevent further damage.
 | 
				
			||||||
 | 
					 			 */
 | 
				
			||||||
 | 
					 			dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
 | 
				
			||||||
 | 
					-			orderly_poweroff(true);
 | 
				
			||||||
 | 
					 		} else
 | 
				
			||||||
 | 
					 			dev_emerg(adev->dev, "ERROR: GPU under temperature range detected!\n");
 | 
				
			||||||
 | 
					 	} else if (client_id == SOC15_IH_CLIENTID_ROM_SMUIO) {
 | 
				
			||||||
 | 
					diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
 | 
				
			||||||
 | 
					index 70b560737..11373a474 100644
 | 
				
			||||||
 | 
					--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
 | 
				
			||||||
 | 
					+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
 | 
				
			||||||
 | 
					@@ -1444,7 +1444,6 @@ static int smu_v11_0_irq_process(struct amdgpu_device *adev,
 | 
				
			||||||
 | 
					 			 * Try to do a graceful shutdown to prevent further damage.
 | 
				
			||||||
 | 
					 			 */
 | 
				
			||||||
 | 
					 			dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
 | 
				
			||||||
 | 
					-			orderly_poweroff(true);
 | 
				
			||||||
 | 
					 		break;
 | 
				
			||||||
 | 
					 		case THM_11_0__SRCID__THM_DIG_THERM_H2L:
 | 
				
			||||||
 | 
					 			dev_emerg(adev->dev, "ERROR: GPU under temperature range detected\n");
 | 
				
			||||||
 | 
					diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
 | 
				
			||||||
 | 
					index 89f0f6eb1..99024cfec 100644
 | 
				
			||||||
 | 
					--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
 | 
				
			||||||
 | 
					+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
 | 
				
			||||||
 | 
					@@ -1386,7 +1386,6 @@ static int smu_v13_0_irq_process(struct amdgpu_device *adev,
 | 
				
			||||||
 | 
					 			 * Try to do a graceful shutdown to prevent further damage.
 | 
				
			||||||
 | 
					 			 */
 | 
				
			||||||
 | 
					 			dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
 | 
				
			||||||
 | 
					-			orderly_poweroff(true);
 | 
				
			||||||
 | 
					 			break;
 | 
				
			||||||
 | 
					 		case THM_11_0__SRCID__THM_DIG_THERM_H2L:
 | 
				
			||||||
 | 
					 			dev_emerg(adev->dev, "ERROR: GPU under temperature range detected\n");
 | 
				
			||||||
							
								
								
									
										10
									
								
								system/amdgpu-kernel-patches.nix
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										10
									
								
								system/amdgpu-kernel-patches.nix
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,10 @@
 | 
				
			||||||
 | 
					{ config, pkgs, lib, inputs, ... }: {
 | 
				
			||||||
 | 
					  # boot.kernelPackages = lib.mkDefault pkgs.linuxPackages_latest;
 | 
				
			||||||
 | 
					  boot.kernelPatches = [
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					      name = "amdgpu-disable-shutdown-on-overtheating";
 | 
				
			||||||
 | 
					      patch =
 | 
				
			||||||
 | 
					        ../patches/kernel/amdgpu-disable-shutdown-on-overtheating.diff;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  ];
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue