Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for visualizing self-attention heatmaps + sequence classifier outputs w/ attentions #1117

Merged
merged 14 commits into from
Dec 28, 2024
Merged
46 changes: 31 additions & 15 deletions src/models.js
Original file line number Diff line number Diff line change
Expand Up @@ -4463,6 +4463,7 @@ export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
const image_nums = vision_tokens.filter(x => x == image_token_id).length;
const video_nums = vision_tokens.filter(x => x == video_token_id).length;

/** @type {number[][]} */
let llm_pos_ids_list = [];
let st = 0;
let remain_images = image_nums;
Expand Down Expand Up @@ -4532,6 +4533,7 @@ export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
// NOTE: Each item in llm_pos_ids_list is an array of shape (3, text_len),
// meaning to perform concatenation along dim=1, we can do the following:
const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
/** @type {number[]} */
const llm_positions = new Array(num_items);
let index = 0;
for (let x = 0; x < 3; ++x) {
Expand Down Expand Up @@ -4572,9 +4574,10 @@ export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
{ length: 3 * data.length },
(_, i) => data[i % data.length]
);
/** @type {bigint[]} */
const mrope_position_deltas = Array.from(
{ length: dims[0] },
(_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1 + dims[1]
(_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
);

return [
Expand Down Expand Up @@ -5145,7 +5148,7 @@ export class DPTModel extends DPTPreTrainedModel { }
*
* **Example:** Depth estimation w/ `Xenova/dpt-hybrid-midas`.
* ```javascript
* import { DPTForDepthEstimation, AutoProcessor, RawImage, interpolate, max } from '@huggingface/transformers';
* import { DPTForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers';
*
* // Load model and processor
* const model_id = 'Xenova/dpt-hybrid-midas';
Expand All @@ -5154,7 +5157,7 @@ export class DPTModel extends DPTPreTrainedModel { }
*
* // Load image from URL
* const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
* const image = await RawImage.fromURL(url);
* const image = await RawImage.read(url);
*
* // Prepare image for the model
* const inputs = await processor(image);
Expand All @@ -5163,10 +5166,15 @@ export class DPTModel extends DPTPreTrainedModel { }
* const { predicted_depth } = await model(inputs);
*
* // Interpolate to original size
* const prediction = interpolate(predicted_depth, image.size.reverse(), 'bilinear', false);
* const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), {
* size: image.size.reverse(),
* mode: 'bilinear',
* })).squeeze(1);
*
* // Visualize the prediction
* const formatted = prediction.mul_(255 / max(prediction.data)[0]).to('uint8');
* const min = prediction.min().item();
* const max = prediction.max().item();
* const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8');
* const depth = RawImage.fromTensor(formatted);
* // RawImage {
* // data: Uint8Array(307200) [ 85, 85, 84, ... ],
Expand Down Expand Up @@ -5216,11 +5224,7 @@ export class GLPNPreTrainedModel extends PreTrainedModel { }
export class GLPNModel extends GLPNPreTrainedModel { }

/**
* GLPN Model transformer with a lightweight depth estimation head on top e.g. for KITTI, NYUv2.
*
* **Example:** Depth estimation w/ `Xenova/glpn-kitti`.
* ```javascript
* import { GLPNForDepthEstimation, AutoProcessor, RawImage, interpolate, max } from '@huggingface/transformers';
* import { GLPNForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers';
*
* // Load model and processor
* const model_id = 'Xenova/glpn-kitti';
Expand All @@ -5229,7 +5233,7 @@ export class GLPNModel extends GLPNPreTrainedModel { }
*
* // Load image from URL
* const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
* const image = await RawImage.fromURL(url);
* const image = await RawImage.read(url);
*
* // Prepare image for the model
* const inputs = await processor(image);
Expand All @@ -5238,13 +5242,18 @@ export class GLPNModel extends GLPNPreTrainedModel { }
* const { predicted_depth } = await model(inputs);
*
* // Interpolate to original size
* const prediction = interpolate(predicted_depth, image.size.reverse(), 'bilinear', false);
* const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), {
* size: image.size.reverse(),
* mode: 'bilinear',
* })).squeeze(1);
*
* // Visualize the prediction
* const formatted = prediction.mul_(255 / max(prediction.data)[0]).to('uint8');
* const min = prediction.min().item();
* const max = prediction.max().item();
* const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8');
* const depth = RawImage.fromTensor(formatted);
* // RawImage {
* // data: Uint8Array(307200) [ 207, 169, 154, ... ],
* // data: Uint8Array(307200) [ 85, 85, 84, ... ],
* // width: 640,
* // height: 480,
* // channels: 1
Expand Down Expand Up @@ -7747,10 +7756,17 @@ export class SequenceClassifierOutput extends ModelOutput {
/**
* @param {Object} output The output of the model.
* @param {Tensor} output.logits classification (or regression if config.num_labels==1) scores (before SoftMax).
* @param {Record<string, Tensor>} [output.attentions] Object of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
* Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
*/
constructor({ logits }) {
constructor({ logits, ...attentions }) {
super();
this.logits = logits;
const attentions_list = Object.values(attentions);
if (attentions_list.length > 0) {
// Only set attentions if they are not empty
this.attentions = attentions_list;
}
}
}

Expand Down
2 changes: 2 additions & 0 deletions src/models/idefics3/image_processing_idefics3.js
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,8 @@ export class Idefics3ImageProcessor extends ImageProcessor {

const start_offset = i * pixel_attention_mask_stride + num_patches * h * w;
const end_offset = (i + 1) * pixel_attention_mask_stride;

// @ts-expect-error
pixel_attention_mask_data.fill(false, start_offset, end_offset);
}
}
Expand Down
1 change: 1 addition & 0 deletions src/models/pyannote/feature_extraction_pyannote.js
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ export class PyAnnoteFeatureExtractor extends FeatureExtractor {

let current_speaker = -1;
for (let i = 0; i < scores.length; ++i) {
/** @type {number[]} */
const probabilities = softmax(scores[i]);
const [score, id] = max(probabilities);
const [start, end] = [i, i + 1];
Expand Down
4 changes: 2 additions & 2 deletions src/models/seamless_m4t/feature_extraction_seamless_m4t.js
Original file line number Diff line number Diff line change
Expand Up @@ -133,8 +133,8 @@ export class SeamlessM4TFeatureExtractor extends FeatureExtractor {
'int64',
new BigInt64Array(numPaddedFrames),
[1, numPaddedFrames],
)
padded_attention_mask.data.fill(1n, 0, num_frames);
);
/** @type {BigInt64Array} */ (padded_attention_mask.data).fill(1n, 0, num_frames);
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/models/whisper/feature_extraction_whisper.js
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ export class WhisperFeatureExtractor extends FeatureExtractor {
)

const data = features.data;
const maxValue = max(data)[0];
const maxValue = max(/** @type {Float32Array} */(data))[0];

for (let i = 0; i < data.length; ++i) {
data[i] = (Math.max(data[i], maxValue - 8.0) + 4.0) / 4.0;
Expand Down
10 changes: 10 additions & 0 deletions src/ops/registry.js
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,16 @@ export class TensorOpRegistry {
// executionProviders: ['webgpu'],
};

static get nearest_interpolate_4d() {
if (!this._nearest_interpolate_4d) {
this._nearest_interpolate_4d = wrap(
[8, 10, 18, 0, 58, 129, 1, 10, 41, 10, 1, 120, 10, 0, 10, 0, 10, 1, 115, 18, 1, 121, 34, 6, 82, 101, 115, 105, 122, 101, 42, 18, 10, 4, 109, 111, 100, 101, 34, 7, 110, 101, 97, 114, 101, 115, 116, 160, 1, 3, 18, 1, 114, 90, 31, 10, 1, 120, 18, 26, 10, 24, 8, 1, 18, 20, 10, 3, 18, 1, 98, 10, 3, 18, 1, 99, 10, 3, 18, 1, 104, 10, 3, 18, 1, 119, 90, 15, 10, 1, 115, 18, 10, 10, 8, 8, 7, 18, 4, 10, 2, 8, 4, 98, 31, 10, 1, 121, 18, 26, 10, 24, 8, 1, 18, 20, 10, 3, 18, 1, 98, 10, 3, 18, 1, 99, 10, 3, 18, 1, 104, 10, 3, 18, 1, 119, 66, 2, 16, 21],
this.session_options,
'y',
);
}
return this._nearest_interpolate_4d;
}
static get bilinear_interpolate_4d() {
if (!this._bilinear_interpolate_4d) {
this._bilinear_interpolate_4d = wrap(
Expand Down
22 changes: 17 additions & 5 deletions src/pipelines.js
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ import {
import {
Tensor,
mean_pooling,
interpolate,
interpolate_4d,
quantize_embeddings,
topk,
} from './utils/tensor.js';
Expand Down Expand Up @@ -2901,11 +2901,23 @@ export class DepthEstimationPipeline extends (/** @type {new (options: ImagePipe

const toReturn = [];
for (let i = 0; i < preparedImages.length; ++i) {
const prediction = interpolate(predicted_depth[i], preparedImages[i].size.reverse(), 'bilinear', false);
const formatted = prediction.mul_(255 / max(prediction.data)[0]).to('uint8');
const batch = predicted_depth[i];
const [height, width] = batch.dims.slice(-2);
const [new_width, new_height] = preparedImages[i].size;

// Interpolate to original size
const prediction = (await interpolate_4d(batch.view(1, 1, height, width), {
size: [new_height, new_width],
mode: 'bilinear',
})).view(new_height, new_width);

const minval = /** @type {number} */(prediction.min().item());
const maxval = /** @type {number} */(prediction.max().item());
const formatted = prediction.sub(minval).div_(maxval - minval).mul_(255).to('uint8').unsqueeze(0);
const depth = RawImage.fromTensor(formatted);
toReturn.push({
predicted_depth: predicted_depth[i],
depth: RawImage.fromTensor(formatted),
predicted_depth: prediction,
depth,
});
}

Expand Down
7 changes: 3 additions & 4 deletions src/tokenizers.js
Original file line number Diff line number Diff line change
Expand Up @@ -533,19 +533,18 @@ class Unigram extends TokenizerModel {
* Create a new Unigram tokenizer model.
* @param {Object} config The configuration object for the Unigram model.
* @param {number} config.unk_id The ID of the unknown token
* @param {any[][]} config.vocab A 2D array representing a mapping of tokens to scores.
* @param {[string, number][]} config.vocab A 2D array representing a mapping of tokens to scores.
* @param {Object} moreConfig Additional configuration object for the Unigram model.
*/
constructor(config, moreConfig) {
super(config);

const vocabSize = config.vocab.length;
this.vocab = new Array(vocabSize);
/** @type {number[]} */
this.scores = new Array(vocabSize);
for (let i = 0; i < vocabSize; ++i) {
const piece = config.vocab[i];
this.vocab[i] = piece[0];
this.scores[i] = piece[1];
[this.vocab[i], this.scores[i]] = config.vocab[i];
}

this.unk_token_id = config.unk_id;
Expand Down
14 changes: 8 additions & 6 deletions src/utils/maths.js
Original file line number Diff line number Diff line change
Expand Up @@ -225,8 +225,9 @@ export function magnitude(arr) {

/**
* Returns the value and index of the minimum element in an array.
* @param {number[]|TypedArray} arr array of numbers.
* @returns {[number, number]} the value and index of the minimum element, of the form: [valueOfMin, indexOfMin]
* @template {number[]|bigint[]|AnyTypedArray} T
* @param {T} arr array of numbers.
* @returns {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} the value and index of the minimum element, of the form: [valueOfMin, indexOfMin]
* @throws {Error} If array is empty.
*/
export function min(arr) {
Expand All @@ -239,14 +240,15 @@ export function min(arr) {
indexOfMin = i;
}
}
return [min, indexOfMin];
return /** @type {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} */([min, indexOfMin]);
}


/**
* Returns the value and index of the maximum element in an array.
* @param {number[]|AnyTypedArray} arr array of numbers.
* @returns {[number, number]} the value and index of the maximum element, of the form: [valueOfMax, indexOfMax]
* @template {number[]|bigint[]|AnyTypedArray} T
* @param {T} arr array of numbers.
* @returns {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} the value and index of the maximum element, of the form: [valueOfMax, indexOfMax]
* @throws {Error} If array is empty.
*/
export function max(arr) {
Expand All @@ -259,7 +261,7 @@ export function max(arr) {
indexOfMax = i;
}
}
return [Number(max), indexOfMax];
return /** @type {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} */([max, indexOfMax]);
}

function isPowerOfTwo(number) {
Expand Down
52 changes: 42 additions & 10 deletions src/utils/tensor.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@

import {
interpolate_data,
max,
min,
permute_data
} from './maths.js';

Expand Down Expand Up @@ -464,8 +466,6 @@ export class Tensor {
return this.permute(...dims);
}

// TODO add .max() and .min() methods

/**
* Returns the sum of each row of the input tensor in the given dimension dim.
*
Expand Down Expand Up @@ -759,6 +759,36 @@ export class Tensor {
return mean(this, dim, keepdim);
}

min(dim = null, keepdim = false) {
if (dim !== null) {
throw new Error("`dim !== null` not yet implemented.");
}
const value = min(this.data)[0];
return new Tensor(this.type, [value], []);
}
max(dim = null, keepdim = false) {
if (dim !== null) {
throw new Error("`dim !== null` not yet implemented.");
}
const value = max(this.data)[0];
return new Tensor(this.type, [value], []);
}

argmin(dim = null, keepdim = false) {
if (dim !== null) {
throw new Error("`dim !== null` not yet implemented.");
}
const index = min(this.data)[1];
return new Tensor('int64', [BigInt(index)], []);
}
argmax(dim = null, keepdim = false) {
if (dim !== null) {
throw new Error("`dim !== null` not yet implemented.");
}
const index = max(this.data)[1];
return new Tensor('int64', [BigInt(index)], []);
}

/**
* Performs Tensor dtype conversion.
* @param {DataType} type The desired data type.
Expand Down Expand Up @@ -892,7 +922,7 @@ export function interpolate(input, [out_height, out_width], mode = 'bilinear', a
* @param {Tensor} input the input tensor
* @param {Object} options the options for the interpolation
* @param {[number, number]|[number, number, number]|[number, number, number, number]} [options.size=null] output spatial size.
* @param {"bilinear"|"bicubic"} [options.mode='bilinear'] algorithm used for upsampling
* @param {"nearest"|"bilinear"|"bicubic"} [options.mode='bilinear'] algorithm used for upsampling
* @returns {Promise<Tensor>} The interpolated tensor.
*/
export async function interpolate_4d(input, {
Expand Down Expand Up @@ -922,7 +952,9 @@ export async function interpolate_4d(input, {
}

let op;
if (mode === 'bilinear') {
if (mode === 'nearest') {
op = await TensorOpRegistry.nearest_interpolate_4d;
} else if (mode === 'bilinear') {
op = await TensorOpRegistry.bilinear_interpolate_4d;
} else if (mode === 'bicubic') {
op = await TensorOpRegistry.bicubic_interpolate_4d;
Expand Down Expand Up @@ -963,13 +995,13 @@ export async function rfft(x, a) {
* Returns the k largest elements of the given input tensor.
* Inspired by https://pytorch.org/docs/stable/generated/torch.topk.html
* @param {Tensor} x the input tensor
* @param {number} k the k in "top-k"
* @param {number} [k] the k in "top-k"
* @returns {Promise<[Tensor, Tensor]>} the output tuple of (Tensor, LongTensor) of top-k elements and their indices.
*/
export async function topk(x, k) {
const op = await TensorOpRegistry.top_k;

if (k === null) {
if (k == null) {
k = x.dims.at(-1);
} else {
k = Math.min(k, x.dims.at(-1));
Expand Down Expand Up @@ -998,10 +1030,10 @@ const arrayToIndexTensor = (array) => new Tensor('int64', array, [array.length])
export async function slice(data, starts, ends, axes, steps) {
    // The op is obtained from the registry and awaited before it is invoked.
    const op = await TensorOpRegistry.slice;

    // Each key is listed exactly once: repeating `x`/`s`/`e`/`a` in the literal
    // would build every index tensor twice and discard the first copy.
    return await op({
        x: data,
        s: arrayToIndexTensor(starts),
        e: arrayToIndexTensor(ends),
        a: arrayToIndexTensor(axes),
        // When `steps` is null/undefined, use a step of 1 for every listed axis.
        t: arrayToIndexTensor(steps ?? new Array(axes.length).fill(1)),
    });
}
Expand Down
Loading
Loading