Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add AVG Pooling cpu implementation #2296

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 103 additions & 1 deletion candle-nn/src/ops.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use candle::{CpuStorage, DType, Layout, Module, Result, Shape, Tensor, D};
use num_traits::ToPrimitive;
use candle::{CpuStorage, DType, Layout, Module, Result, Shape, Tensor, D, CustomOp1};
use rayon::prelude::*;

/// Applies the softmax function to the input tensor, rescaling the element so that elements on
Expand Down Expand Up @@ -947,3 +948,104 @@ impl Module for Identity {
Ok(xs.clone())
}
}

/// First input index contributing to output slot `curr_index`:
/// `floor(curr * input / output)`, matching PyTorch's adaptive-pooling windows.
fn start_index(curr_index: f32, output_size: f32, input_size: f32) -> usize {
    let pos = curr_index * input_size / output_size;
    pos.floor() as usize
}

/// One past the last input index contributing to output slot `curr_index`:
/// `ceil((curr + 1) * input / output)`, matching PyTorch's adaptive-pooling windows.
fn end_index(curr_index: f32, output_size: f32, input_size: f32) -> usize {
    let pos = (curr_index + 1.0f32) * input_size / output_size;
    pos.ceil() as usize
}

/// Adaptive average pooling over the last dimension: each (batch, channel) row
/// of length `L` is reduced to exactly `output_size` elements, averaging over
/// PyTorch-style windows `[start_index(i), end_index(i))`.
///
/// `xs` is the raw element buffer described by `layout`, which must be
/// `(batch, length)` or `(batch, channels, length)`.
/// NOTE(review): `layout`'s start offset and strides are ignored, so this
/// assumes contiguous, zero-offset storage — confirm callers guarantee that.
/// NOTE(review): for integer `T` the mean uses integer division and the running
/// sum can wrap for narrow types (e.g. `u8`) — confirm that is acceptable.
pub fn adaptive_avg_pool1d<T>(xs: &[T], layout: &Layout, output_size: usize) -> Vec<T>
where
    T: Send
        + Sync
        + ToPrimitive
        + num_traits::NumCast
        + Copy
        + std::ops::Add<Output = T>
        + std::ops::Div<Output = T>
        + Default,
{
    let dims = layout.dims();
    let ndim = dims.len();
    // A 2D input is treated as (batch, length) with a single implicit channel.
    let channels = if ndim == 3 { dims[1] } else { 1 };
    let batch = dims[0];
    let len = dims[ndim - 1];
    // Hoisted out of the per-output loop: these do not depend on `i`.
    let len_f32 = len as f32;
    let output_size_f32 = output_size as f32;
    let zero = T::from(0).unwrap();
    let mut ys: Vec<T> = vec![zero; batch * channels * output_size];
    // One output chunk per (batch, channel) row; rows are independent, so
    // they can be filled in parallel.
    ys.par_chunks_mut(output_size)
        .enumerate()
        .for_each(|(idx, chunk)| {
            let b = idx / channels;
            let c = idx % channels;
            let row_start = (b * channels + c) * len;
            let row = &xs[row_start..row_start + len];
            for (i, out) in chunk.iter_mut().enumerate() {
                let i_f32 = i as f32;
                let start = start_index(i_f32, output_size_f32, len_f32);
                let end = end_index(i_f32, output_size_f32, len_f32);
                let mut sum = zero;
                for &v in &row[start..end] {
                    sum = sum + v;
                }
                let count: T = T::from(end - start).unwrap();
                *out = sum / count;
            }
        });
    ys
}

/// Adaptive 1D average pooling module: maps an input whose last dimension has
/// length `L` to the same shape with the last dimension replaced by
/// `output_size` (CPU-only forward via `CustomOp1`, no backward pass).
#[derive(Clone, Debug)]
pub struct AdaptiveAvgPool1d {
    // Target length of the last dimension after pooling.
    output_size: usize,
}

impl AdaptiveAvgPool1d {
pub fn new(output_size: usize) -> AdaptiveAvgPool1d {
Self { output_size }
}
}

impl CustomOp1 for AdaptiveAvgPool1d {
    fn name(&self) -> &'static str {
        "adaptive_avg_pool1d"
    }

    /// CPU forward pass: dispatches on the storage dtype and pools the last
    /// dimension down to `self.output_size`, leaving all other dims unchanged.
    fn cpu_fwd(&self, storage: &CpuStorage, layout: &Layout) -> Result<(CpuStorage, Shape)> {
        // Output shape: identical to the input except for the last dimension.
        let mut out_dims = layout.dims().to_vec();
        let last = out_dims.len() - 1;
        out_dims[last] = self.output_size;
        let out_shape = Shape::from_dims(&out_dims);

        let pooled = match storage {
            CpuStorage::U8(s) => CpuStorage::U8(adaptive_avg_pool1d(s, layout, self.output_size)),
            CpuStorage::U32(s) => CpuStorage::U32(adaptive_avg_pool1d(s, layout, self.output_size)),
            CpuStorage::I64(s) => CpuStorage::I64(adaptive_avg_pool1d(s, layout, self.output_size)),
            CpuStorage::BF16(s) => {
                CpuStorage::BF16(adaptive_avg_pool1d(s, layout, self.output_size))
            }
            CpuStorage::F16(s) => CpuStorage::F16(adaptive_avg_pool1d(s, layout, self.output_size)),
            CpuStorage::F32(s) => CpuStorage::F32(adaptive_avg_pool1d(s, layout, self.output_size)),
            CpuStorage::F64(s) => CpuStorage::F64(adaptive_avg_pool1d(s, layout, self.output_size)),
        };

        Ok((pooled, out_shape))
    }
}

impl Module for AdaptiveAvgPool1d {
    /// Applies the pooling op to `xs` (forward only; no backward is registered).
    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
        // Pass `self` directly instead of constructing a second, identical struct.
        xs.apply_op1_no_bwd(self)
    }
}
26 changes: 25 additions & 1 deletion candle-nn/tests/ops.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ extern crate intel_mkl_src;
#[cfg(feature = "accelerate")]
extern crate accelerate_src;

use candle::{test_device, test_utils::to_vec3_round, Device, Result, Tensor};
use candle::{test_device, test_utils::to_vec3_round, Device, Result, Tensor, Module};
use candle::test_utils::to_vec2_round;

fn softmax(device: &Device) -> Result<()> {
let data = &[[[3f32, 1., 4.], [1., 5., 9.]], [[2., 1., 7.], [8., 2., 8.]]];
Expand Down Expand Up @@ -206,10 +207,33 @@ fn sigmoid(device: &Device) -> Result<()> {
Ok(())
}

/// Checks adaptive average pooling to 2 output slots on both a
/// (batch, channels, length) tensor and a (batch, length) tensor.
fn adaptive_avg_1d(device: &Device) -> Result<()> {
    let pool = candle_nn::ops::AdaptiveAvgPool1d::new(2);

    // 3D case: (2, 2, 3) -> (2, 2, 2).
    let input_3d = &[[[3f32, 1., 4.], [1., 5., 9.]], [[2., 1., 7.], [8., 2., 8.]]];
    let out_3d = pool.forward(&Tensor::new(input_3d, device)?)?;
    assert!(out_3d.dims() == &[2, 2, 2]);
    assert_eq!(
        to_vec3_round(&out_3d, 4)?,
        &[[[2.0, 2.5], [3.0, 7.0]], [[1.5, 4.0], [5.0, 5.0]]]
    );

    // 2D case: (2, 3) -> (2, 2), single implicit channel.
    let input_2d = &[[3f32, 1., 4.], [1., 5., 9.]];
    let out_2d = pool.forward(&Tensor::new(input_2d, device)?)?;
    assert!(out_2d.dims() == &[2, 2]);
    assert_eq!(to_vec2_round(&out_2d, 4)?, &[[2.0, 2.5], [3.0, 7.0]]);

    Ok(())
}

test_device!(ropei, ropei_cpu, ropei_gpu, ropei_metal);
test_device!(rope, rope_cpu, rope_gpu, rope_metal);
test_device!(rope_thd, rope_thd_cpu, rope_thd_gpu, rope_thd_metal);
test_device!(softmax, softmax_cpu, softmax_gpu, softmax_metal);
test_device!(rms_norm, rms_norm_cpu, rms_norm_gpu, rms_norm_metal);
test_device!(layer_norm, ln_cpu, ln_gpu, ln_metal);
test_device!(sigmoid, sigmoid_cpu, sigmoid_gpu, sigmoid_metal);
// NOTE(review): AdaptiveAvgPool1d only implements cpu_fwd in this change, so the
// gpu/metal variants registered here will presumably fail on those devices until
// cuda/metal kernels are added — confirm before enabling non-CPU CI for this test.
test_device!(adaptive_avg_1d, adaptive_avg_1d_cpu, adaptive_avg_1d_gpu, adaptive_avg_1d_metal);