drm/amdgpu: trigger flr_work if reading pf2vf data failed
[ Upstream commit ab66c832847fcdffc97d4591ba5547e3990d9d33 ] If reading the pf2vf data fails 30 times in a row, something is wrong, so trigger flr_work to recover from the issue. Also use dev_err to print the error messages so that the log shows which device has the issue, and add a warning message if waiting for IDH_FLR_NOTIFICATION_CMPL times out. Signed-off-by: Zhigang Luo <Zhigang.Luo@amd.com> Acked-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> Stable-dep-of: d0ce1aaa8531 ("Revert "drm/amd: Stop evicting resources on APUs in suspend"") Signed-off-by: Sasha Levin <sashal@kernel.org>
This commit is contained in:
committed by
Greg Kroah-Hartman
parent
9a6d2e1944
commit
d59f455951
@@ -141,6 +141,8 @@ const char *amdgpu_asic_name[] = {
|
|||||||
"LAST",
|
"LAST",
|
||||||
};
|
};
|
||||||
|
|
||||||
|
static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* DOC: pcie_replay_count
|
* DOC: pcie_replay_count
|
||||||
*
|
*
|
||||||
@@ -4558,6 +4560,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
|
|||||||
retry:
|
retry:
|
||||||
amdgpu_amdkfd_pre_reset(adev);
|
amdgpu_amdkfd_pre_reset(adev);
|
||||||
|
|
||||||
|
amdgpu_device_stop_pending_resets(adev);
|
||||||
|
|
||||||
if (from_hypervisor)
|
if (from_hypervisor)
|
||||||
r = amdgpu_virt_request_full_gpu(adev, true);
|
r = amdgpu_virt_request_full_gpu(adev, true);
|
||||||
else
|
else
|
||||||
@@ -5354,11 +5358,12 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */
|
|||||||
tmp_adev->asic_reset_res = r;
|
tmp_adev->asic_reset_res = r;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
if (!amdgpu_sriov_vf(tmp_adev))
|
||||||
* Drop all pending non scheduler resets. Scheduler resets
|
/*
|
||||||
* were already dropped during drm_sched_stop
|
* Drop all pending non scheduler resets. Scheduler resets
|
||||||
*/
|
* were already dropped during drm_sched_stop
|
||||||
amdgpu_device_stop_pending_resets(tmp_adev);
|
*/
|
||||||
|
amdgpu_device_stop_pending_resets(tmp_adev);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Actual ASIC resets if needed.*/
|
/* Actual ASIC resets if needed.*/
|
||||||
|
@@ -32,6 +32,7 @@
|
|||||||
|
|
||||||
#include "amdgpu.h"
|
#include "amdgpu.h"
|
||||||
#include "amdgpu_ras.h"
|
#include "amdgpu_ras.h"
|
||||||
|
#include "amdgpu_reset.h"
|
||||||
#include "vi.h"
|
#include "vi.h"
|
||||||
#include "soc15.h"
|
#include "soc15.h"
|
||||||
#include "nv.h"
|
#include "nv.h"
|
||||||
@@ -468,7 +469,7 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)
|
|||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
|
|
||||||
if (pf2vf_info->size > 1024) {
|
if (pf2vf_info->size > 1024) {
|
||||||
DRM_ERROR("invalid pf2vf message size\n");
|
dev_err(adev->dev, "invalid pf2vf message size: 0x%x\n", pf2vf_info->size);
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -479,7 +480,9 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)
|
|||||||
adev->virt.fw_reserve.p_pf2vf, pf2vf_info->size,
|
adev->virt.fw_reserve.p_pf2vf, pf2vf_info->size,
|
||||||
adev->virt.fw_reserve.checksum_key, checksum);
|
adev->virt.fw_reserve.checksum_key, checksum);
|
||||||
if (checksum != checkval) {
|
if (checksum != checkval) {
|
||||||
DRM_ERROR("invalid pf2vf message\n");
|
dev_err(adev->dev,
|
||||||
|
"invalid pf2vf message: header checksum=0x%x calculated checksum=0x%x\n",
|
||||||
|
checksum, checkval);
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -493,7 +496,9 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)
|
|||||||
adev->virt.fw_reserve.p_pf2vf, pf2vf_info->size,
|
adev->virt.fw_reserve.p_pf2vf, pf2vf_info->size,
|
||||||
0, checksum);
|
0, checksum);
|
||||||
if (checksum != checkval) {
|
if (checksum != checkval) {
|
||||||
DRM_ERROR("invalid pf2vf message\n");
|
dev_err(adev->dev,
|
||||||
|
"invalid pf2vf message: header checksum=0x%x calculated checksum=0x%x\n",
|
||||||
|
checksum, checkval);
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -529,7 +534,7 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)
|
|||||||
((struct amd_sriov_msg_pf2vf_info *)pf2vf_info)->uuid;
|
((struct amd_sriov_msg_pf2vf_info *)pf2vf_info)->uuid;
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
DRM_ERROR("invalid pf2vf version\n");
|
dev_err(adev->dev, "invalid pf2vf version: 0x%x\n", pf2vf_info->version);
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -628,8 +633,21 @@ static void amdgpu_virt_update_vf2pf_work_item(struct work_struct *work)
|
|||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
ret = amdgpu_virt_read_pf2vf_data(adev);
|
ret = amdgpu_virt_read_pf2vf_data(adev);
|
||||||
if (ret)
|
if (ret) {
|
||||||
|
adev->virt.vf2pf_update_retry_cnt++;
|
||||||
|
if ((adev->virt.vf2pf_update_retry_cnt >= AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT) &&
|
||||||
|
amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev)) {
|
||||||
|
if (amdgpu_reset_domain_schedule(adev->reset_domain,
|
||||||
|
&adev->virt.flr_work))
|
||||||
|
return;
|
||||||
|
else
|
||||||
|
dev_err(adev->dev, "Failed to queue work! at %s", __func__);
|
||||||
|
}
|
||||||
|
|
||||||
goto out;
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
adev->virt.vf2pf_update_retry_cnt = 0;
|
||||||
amdgpu_virt_write_vf2pf_data(adev);
|
amdgpu_virt_write_vf2pf_data(adev);
|
||||||
|
|
||||||
out:
|
out:
|
||||||
@@ -650,6 +668,7 @@ void amdgpu_virt_init_data_exchange(struct amdgpu_device *adev)
|
|||||||
adev->virt.fw_reserve.p_pf2vf = NULL;
|
adev->virt.fw_reserve.p_pf2vf = NULL;
|
||||||
adev->virt.fw_reserve.p_vf2pf = NULL;
|
adev->virt.fw_reserve.p_vf2pf = NULL;
|
||||||
adev->virt.vf2pf_update_interval_ms = 0;
|
adev->virt.vf2pf_update_interval_ms = 0;
|
||||||
|
adev->virt.vf2pf_update_retry_cnt = 0;
|
||||||
|
|
||||||
if (adev->mman.fw_vram_usage_va && adev->mman.drv_vram_usage_va) {
|
if (adev->mman.fw_vram_usage_va && adev->mman.drv_vram_usage_va) {
|
||||||
DRM_WARN("Currently fw_vram and drv_vram should not have values at the same time!");
|
DRM_WARN("Currently fw_vram and drv_vram should not have values at the same time!");
|
||||||
|
@@ -51,6 +51,8 @@
|
|||||||
/* tonga/fiji use this offset */
|
/* tonga/fiji use this offset */
|
||||||
#define mmBIF_IOV_FUNC_IDENTIFIER 0x1503
|
#define mmBIF_IOV_FUNC_IDENTIFIER 0x1503
|
||||||
|
|
||||||
|
#define AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT 30
|
||||||
|
|
||||||
enum amdgpu_sriov_vf_mode {
|
enum amdgpu_sriov_vf_mode {
|
||||||
SRIOV_VF_MODE_BARE_METAL = 0,
|
SRIOV_VF_MODE_BARE_METAL = 0,
|
||||||
SRIOV_VF_MODE_ONE_VF,
|
SRIOV_VF_MODE_ONE_VF,
|
||||||
@@ -253,6 +255,7 @@ struct amdgpu_virt {
|
|||||||
/* vf2pf message */
|
/* vf2pf message */
|
||||||
struct delayed_work vf2pf_work;
|
struct delayed_work vf2pf_work;
|
||||||
uint32_t vf2pf_update_interval_ms;
|
uint32_t vf2pf_update_interval_ms;
|
||||||
|
int vf2pf_update_retry_cnt;
|
||||||
|
|
||||||
/* multimedia bandwidth config */
|
/* multimedia bandwidth config */
|
||||||
bool is_mm_bw_enabled;
|
bool is_mm_bw_enabled;
|
||||||
|
@@ -276,6 +276,8 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
|
|||||||
timeout -= 10;
|
timeout -= 10;
|
||||||
} while (timeout > 1);
|
} while (timeout > 1);
|
||||||
|
|
||||||
|
dev_warn(adev->dev, "waiting IDH_FLR_NOTIFICATION_CMPL timeout\n");
|
||||||
|
|
||||||
flr_done:
|
flr_done:
|
||||||
atomic_set(&adev->reset_domain->in_gpu_reset, 0);
|
atomic_set(&adev->reset_domain->in_gpu_reset, 0);
|
||||||
up_write(&adev->reset_domain->sem);
|
up_write(&adev->reset_domain->sem);
|
||||||
|
@@ -298,6 +298,8 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
|
|||||||
timeout -= 10;
|
timeout -= 10;
|
||||||
} while (timeout > 1);
|
} while (timeout > 1);
|
||||||
|
|
||||||
|
dev_warn(adev->dev, "waiting IDH_FLR_NOTIFICATION_CMPL timeout\n");
|
||||||
|
|
||||||
flr_done:
|
flr_done:
|
||||||
atomic_set(&adev->reset_domain->in_gpu_reset, 0);
|
atomic_set(&adev->reset_domain->in_gpu_reset, 0);
|
||||||
up_write(&adev->reset_domain->sem);
|
up_write(&adev->reset_domain->sem);
|
||||||
|
Reference in New Issue
Block a user