Hi everyone, I'd like to ask for some advice.
I want to use onnxruntime-node to run vit-gpt2-image-captioning, but I don't know how to turn the decoder output back into text in the decoding step; I'm not very familiar with text models.
Here is my test code:
// imageHelper.js
const { Tensor } = require("onnxruntime-node");
const sharp = require("sharp");
async function getImageTensorFromPath(
  imagePath,
  args = { width: 224, height: 224 }
) {
  // Get raw RGB buffer data from the image (HWC layout, alpha removed)
  const imageData = await sharp(imagePath)
    .resize(args.width, args.height)
    .raw()
    .removeAlpha()
    .toBuffer({ resolveWithObject: true })
    .then(({ data }) => data);
  // Create R, G, and B arrays
  const redArray = [];
  const greenArray = [];
  const blueArray = [];
  // Loop through the image buffer and extract the R, G, and B channels
  for (let i = 0; i < imageData.length; i += 3) {
    redArray.push(imageData[i + 0]);
    greenArray.push(imageData[i + 1]);
    blueArray.push(imageData[i + 2]);
  }
  // Concatenate the channel planes to transpose [height, width, channels] -> [channels, height, width]
  const transposedData = redArray.concat(greenArray).concat(blueArray);
  // Convert to float32 and normalize each channel plane with its own mean/std
  const float32Data = new Float32Array(3 * args.width * args.height);
  const mean = [0.485, 0.456, 0.406];
  const std = [0.229, 0.224, 0.225];
  const planeSize = args.width * args.height;
  for (let i = 0; i < float32Data.length; i++) {
    const c = Math.floor(i / planeSize); // 0 = R plane, 1 = G plane, 2 = B plane
    float32Data[i] = (transposedData[i] / 255.0 - mean[c]) / std[c];
  }
  // Create the NCHW tensor object
  const inputTensor = new Tensor("float32", float32Data, [
    1,
    3,
    args.height,
    args.width,
  ]);
  return inputTensor;
}
module.exports = {
  getImageTensorFromPath,
};
// main.js
const { InferenceSession, Tensor } = require("onnxruntime-node");
const path = require("path");
const fs = require("fs");
const { getImageTensorFromPath } = require("./imageHelper");
const encode_model_path = path.join(__dirname, "encoder_model.onnx");
const decode_model_path = path.join(__dirname, "decoder_model.onnx");
const image_path = path.join(__dirname, "test.png");
(async () => {
  try {
    const encoderSession = await InferenceSession.create(encode_model_path);
    const decoderSession = await InferenceSession.create(decode_model_path);
    const image_tensor = await getImageTensorFromPath(image_path);
    const encoder_result = await encoderSession.run({
      pixel_values: image_tensor,
    });
    const encoder_hidden_states = encoder_result.last_hidden_state;
    const generated_ids = [50256];
    const decoder_input = new Tensor(
      "int64",
      BigInt64Array.from(generated_ids.map((id) => BigInt(id))),
      [1, generated_ids.length]
    );
    const decoder_result = await decoderSession.run({
      input_ids: decoder_input,
      encoder_hidden_states: encoder_hidden_states,
    });
    // I've tested up to this point; I can see the decoder's outputs, but I don't know how to turn them into text
    debugger;
  } catch (error) {
    console.error("Error:", error);
  }
})();
To run the test:
npm install
node main.js
Or you can download it directly from here.
I've read about how the decoding loop of text models works, but I still don't know how to actually implement it in JS.
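From what I've read, the generation loop should be: feed the current generated_ids to the decoder, take the logits for the last position, argmax over the vocabulary to get the next token id, append it, and repeat until the end token (50256 for GPT-2) or a length limit. Below is a rough, untested sketch of how I imagine that would look in main.js (where Tensor is already required). The greedyDecode name and maxLength parameter are just mine, and I'm assuming the decoder output is called logits with shape [1, seq_len, vocab_size], which I haven't confirmed:
// Rough greedy decoding sketch (untested)
async function greedyDecode(decoderSession, encoder_hidden_states, maxLength = 50) {
  const eosTokenId = 50256; // GPT-2 uses 50256 as both the start and end token
  const generated_ids = [eosTokenId]; // start with the same BOS token as above
  for (let step = 0; step < maxLength; step++) {
    const decoder_input = new Tensor(
      "int64",
      BigInt64Array.from(generated_ids.map((id) => BigInt(id))),
      [1, generated_ids.length]
    );
    const decoder_result = await decoderSession.run({
      input_ids: decoder_input,
      encoder_hidden_states: encoder_hidden_states,
    });
    // Assumption: the decoder output is named "logits" with dims [1, seq_len, vocab_size]
    const logits = decoder_result.logits;
    const [, seqLen, vocabSize] = logits.dims;
    const lastOffset = (seqLen - 1) * vocabSize;
    // Pick the highest-scoring token for the last position (greedy / argmax)
    let nextId = 0;
    let best = -Infinity;
    for (let v = 0; v < vocabSize; v++) {
      if (logits.data[lastOffset + v] > best) {
        best = logits.data[lastOffset + v];
        nextId = v;
      }
    }
    if (nextId === eosTokenId) break; // stop when the model emits the end token
    generated_ids.push(nextId);
  }
  return generated_ids.slice(1); // drop the start token, keep only the generated ids
}
Even if that's roughly right, it only gives me token ids, not the caption string.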
I'd appreciate any pointers from you all, thanks!
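P.S. For the last step (ids back to a string), my guess is that I need the GPT-2 byte-level BPE tokenizer for this model, something like the sketch below. The @xenova/transformers package and the Xenova/vit-gpt2-image-captioning repo id are just my assumption; I haven't actually tried this:
// ids -> text sketch (untested; package and repo id are my assumption)
async function idsToText(generated_ids) {
  // @xenova/transformers is an ESM package, so use a dynamic import from CommonJS
  const { AutoTokenizer } = await import("@xenova/transformers");
  const tokenizer = await AutoTokenizer.from_pretrained(
    "Xenova/vit-gpt2-image-captioning"
  );
  return tokenizer.decode(generated_ids, { skip_special_tokens: true });
}
Is that the usual way to do it in Node, or is there a lighter-weight tokenizer option?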