使用vulkan进行推理时结果不正确

## error log | 日志或报错信息 | ログ

## context | 编译/运行环境 | バックグラウンド
windows11 
## how to reproduce | 复现步骤 | 再現方法
使用 CPU 进行推理时结果正常，但使用 GPU（Vulkan）进行推理时返回的结果是错误的。我把运行时所有的 blob 都导出后逐一对比，发现从第一个卷积层的输出开始，两者就不一致了。


## more | 其他 | その他
主要代码如下:

```c++

    // Repro switches: useGpu gates the Vulkan option setup later in this function;
    // useDebugParam selects the hand-edited .param file and the matching input-building branch.
    bool useGpu = true;
    bool useDebugParam = true;

    // Switch the Windows console output code page to UTF-8 before anything is logged.
    SetConsoleOutputCP(CP_UTF8);
    LOG_I("开始 face detection test...");

    // File path configuration
    std::string param_path;
    if (useDebugParam) {
        param_path = R"(D:\tmp\ncnn_pytorch\face_detector.ncnn_debug.param)";
    } else {
        param_path = R"(D:\tmp\ncnn_pytorch\face_detector.ncnn.param)";
    }
    // The same .bin weights file is used with either .param variant.
    std::string bin_path = R"(D:\tmp\ncnn_pytorch\face_detector.ncnn.bin)";

    std::string original_img_path = R"(D:\tmp\image\o\face_image_1080_1920.png)";
    std::string padded_image_save_path = R"(D:\tmp\image\face_detector_ncnn_padded.png)"; // change this to whatever path you need

    std::string output_img_path = R"(D:\tmp\image\face_detector_ncnn.png)";
    std::string original_with_detection_output_img_path = R"(D:\tmp\image\face_detector_ncnn_with_original.png)";

    // Load the image. IMREAD_UNCHANGED is requested, so the channel count is checked below.
    cv::Mat originalImg = cv::imread(original_img_path, cv::IMREAD_UNCHANGED);
    if (originalImg.empty()) {
        LOG_E("图片未找到: %s", original_img_path.c_str());
        return -1;
    }

    // Channel conversion: a 4-channel image is converted BGRA -> RGB; anything else is converted BGR -> RGB.
    // NOTE(review): the else branch is taken for every channel count other than 4, so it
    // presumably assumes a 3-channel image — confirm that other inputs cannot reach here.
    if (originalImg.channels() == 4) {
        LOG_D("COLOR_BGRA2RGB");
        cv::cvtColor(originalImg, originalImg, cv::COLOR_BGRA2RGB);
    } else {
        LOG_D("COLOR_BGR2RGB");
        cv::cvtColor(originalImg, originalImg, cv::COLOR_BGR2RGB);
    }

    // 1. Letterbox the image to get `padded`: size 128x128, RGB format.
    PaddingParams padding_params{};
    cv::Mat padded = letterbox_padding(originalImg, cv::Size(128, 128), padding_params);

    ncnn::Mat mat_in;
    cv::Mat padded_float;

    if (useDebugParam) {
        // Hand-edited param: build the input from the 8-bit RGB pixels, then scale each
        // of the three channels by 1/255 (0 is passed for the mean values).
        mat_in = ncnn::Mat::from_pixels(padded.data, ncnn::Mat::PIXEL_RGB, padded.cols, padded.rows);
        const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
        mat_in.substract_mean_normalize(0, norm_vals);
        // NOTE(review): only the dims field is overwritten; the other shape fields stay as
        // from_pixels left them. Presumably this is meant to satisfy the edited Permute
        // layer — confirm the Vulkan path accepts a Mat patched this way.
        mat_in.dims = 4;
    }else {
        // Original pnnx param: convert the pixels to float scaled by 1/255 and pass the
        // cv::Mat buffer to an ncnn::Mat constructed with shape arguments (3, 128, 128, 1).
        // NOTE(review): mat_in appears to reference padded_float's buffer rather than copy it,
        // so padded_float would have to outlive every use of mat_in — confirm.
        padded.convertTo(padded_float, CV_32FC3, 1.0 / 255.0);
        mat_in = ncnn::Mat(3, 128, 128, 1, padded_float.data);
    }
    print_ncnn_mat_shape(mat_in, "mat_in");

    // Create the network; all options are set before load_param/load_model are called below.
    ncnn::Net net;
    if (useGpu) {
        // Bail out early when no Vulkan-capable GPU is reported.
        int gpu_count = ncnn::get_gpu_count();
        LOG_D("gpu_count:%d", gpu_count);
        if (gpu_count <= 0) {
            LOG_E("gpu_count<=0");
            return -1;
        }

        LOG_D("use_vulkan_compute");
        net.opt.use_vulkan_compute = true;

        // set specified vulkan device before loading param and model
        // net.set_vulkan_device(0); // use device-0

        // All fp16 and int8 options are switched off for the GPU run.
        // NOTE(review): presumably to rule out reduced precision as the cause of the
        // CPU/GPU mismatch — confirm with the author.
        net.opt.use_fp16_packed = false;
        net.opt.use_fp16_storage = false;
        net.opt.use_fp16_arithmetic = false;
        net.opt.use_int8_storage = false;
        net.opt.use_int8_arithmetic = false;
    }

    // Load the network structure first, then the weights; either failure aborts with -1.
    LOG_I("load_param: %s", param_path.c_str());
    if (net.load_param(param_path.c_str()) != 0) {
        LOG_E("加载 param 文件失败");
        return -1;
    }
    LOG_I("load_model: %s", bin_path.c_str());
    if (net.load_model(bin_path.c_str()) != 0) {
        LOG_E("加载 bin 文件失败");
        return -1;
    }

    ncnn::Extractor ex = net.create_extractor();
    // Feed the input blob named "in0".
    // NOTE(review): the return values of ex.input and ex.extract below are not checked.
    LOG_D("ex.input");
    ex.input("in0", mat_in);

    // Run inference and fetch the output blobs "out0" and "out1".
    LOG_D("ex.extract");
    ncnn::Mat regressors, scores;
    ex.extract("out0", regressors);
    ex.extract("out1", scores);
    print_ncnn_mat_shape(regressors, "regressors");
    print_ncnn_mat_shape(scores, "scores");

    // Element counts are computed as w * h * c; the trailing numbers are the author's expected sizes.
    int num_regressors = regressors.w * regressors.h * regressors.c; // 896*16
    int num_scores = scores.w * scores.h * scores.c; // 896

    // Copy the outputs into flat float vectors.
    // NOTE(review): this reads w*h*c consecutive floats starting at .data, which assumes the
    // output is float and stored without gaps between channels — confirm for these outputs.
    std::vector<float> reg_vec((float *) regressors.data, (float *) regressors.data + num_regressors);
    std::vector<float> score_vec((float *) scores.data, (float *) scores.data + num_scores);

    // Clamp every score to [-100, 100], then replace it in place with its sigmoid.
    for (auto &s: score_vec) {
        if (s < -100.0f) s = -100.0f;
        if (s > 100.0f) s = 100.0f;
        s = 1.0f / (1.0f + std::exp(-s));
    }
    // Find the index of the highest score.
    // NOTE(review): score_vec is indexed without an emptiness check; if num_scores were 0,
    // max_index would equal the vector size — confirm the output can never be empty.
    int max_index = std::distance(score_vec.begin(), std::max_element(score_vec.begin(), score_vec.end()));
    float max_score = score_vec[max_index];
    LOG_I("最大分数: %.4f, 索引: %d", max_score, max_index);

```

通过 `useGpu` 标志切换使用 CPU/GPU 推理:
`bool useGpu = true;`
通过 `useDebugParam` 标志切换是否使用手动调整过的 param 文件:
`bool useDebugParam = true;`

模型是使用 pnnx 将 onnx 转换得到的 ncnn 模型。
pnnx 输出的 param 文件中，输入部分如下:
```
Input                    in0                      0 1 in0
Permute                  permute_56               1 1 in0 1 0=4
```

手动调整成下面这样之后，就可以传入常规 shape 的 tensor:
```
Input                    in0                      0 1 in0
Permute                  permute_56               1 1 in0 1 0=6
```

两者的区别仅在于 Permute 层的参数 0（order_type）由 4 改成了 6。


现在的现象是:
当 useGpu = false 时, useDebugParam 为 true/false 都可以正常输出
当 useGpu = true 时, useDebugParam 为 true/false 都可以输出, 但是数值是错误的

完整的项目见附件
[ncnn-test.zip](https://github.com/user-attachments/files/18862036/ncnn-test.zip)

导出的部分 blob 见下方附件：前 2 个 blob 在使用 CPU 和 GPU 时完全一致，从第 3 个 blob 开始出现差异。
[blob.zip](https://github.com/user-attachments/files/18828276/blob.zip)



Provide feedback

Saved searches

Use saved searches to filter your results more quickly

使用vulkan进行推理时结果不正确 #5909

error log | 日志或报错信息 | ログ

context | 编译/运行环境 | バックグラウンド

how to reproduce | 复现步骤 | 再現方法

more | 其他 | その他

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development