Resizing NV12 image using Nearest Neighbor Interpolation and Bilinear Interpolation algorithms
NV12 Nearest Neighbor Interpolation scaling and Bilinear Interpolation scaling.
Introduction
This article presents an optimized NV12 image scaling program.
There are many kinds of image scaling algorithms, and they trade image quality against speed: the more complex the algorithm, the less quality is lost but the slower it runs. I decided to use the two simplest ones, nearest neighbor interpolation and bilinear interpolation, to resize NV12 images.
Background
NV12 is one of the YUV family of formats. Before you read this tip, you need a basic understanding of the format and of what interpolation scaling algorithms are.
If you have tried scaling RGBA images before, it will be easier for you to understand how my program works.
NV12 format
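An NV12 image is laid out in memory like this: YYYYYYYY... UVUV... NV12 is a semi-planar format, also called YUV420sp: the Y samples form one plane, and the U and V samples are stored interleaved directly after it. Counting U and V separately, there are three logical planes: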
- The length of Y plane in memory is 'width * height'.
- The length of U or V plane in memory is 'width * height / 4'.
- U and V are interleaved.
- The Y plane on its own is the grey-scale image, if the U and V planes are discarded.
So 'width * height * 3 / 2' is the total memory length of the image. Here is a more clear 8*4 resolution sample:
Logical view:
Obviously, width = 8, height = 4
- ylen = 8*4, ulen = 8*4/4, vlen= 8*4/4.
- total_length = ylen + ulen + vlen = ylen * 3 / 2
and every four Y values share the same U value and V value.
For example:
- Y00 Y01 Y10 Y11 share U00 and V00
- Y20 Y21 Y30 Y31 share U10 and V10
Algorithms
- Nearest interpolation
- Bilinear interpolation
srcX = dstX * (srcWidth / dstWidth), srcY = dstY * (srcHeight / dstHeight)
The scale ratio usually has a fractional part. The nearest neighbor algorithm simply discards the fractional part of the computed source coordinate (the code truncates it) and stores the value of that source pixel in the destination array. The result is therefore not great and usually shows a noticeable mosaic (blocky) effect.
Bilinear interpolation uses both the integer part and the fractional part of the source coordinate to calculate the final pixel value from the four surrounding pixels. The fractional part is used as the weight. This removes the sharp edges and the mosaic effect.
input: src_nv12_array, src_width, src_height,dest_width,dest_height
output: dst_nv12_array
Code
This is a C version optimized program.
- Restrict and register keyword
- Use shift operation to optimize float division
- Move irrelevant code out of inner loop
- Prefer logical (bitwise) operations over arithmetic operations inside loops
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/stat.h>
typedef unsigned char uint8_t;
/**
 * Scale an NV12 image with nearest-neighbour sampling.
 *
 * Every destination luma pixel takes the value of one source luma pixel, and
 * every destination UV pair (one per 2x2 block of luma pixels) takes the
 * value of one source UV pair. Source coordinates are computed in 16.16
 * fixed point so that no floating-point division is needed.
 *
 * Both images are addressed with a row stride equal to their width, and the
 * interleaved UV plane is taken to start right after the width * height luma
 * bytes. NV12 subsamples chroma 2x2, so widths and heights are expected to be
 * even; for odd sizes only the complete 2x2 chroma blocks are read/written.
 *
 * The call does nothing when a pointer is NULL or any dimension is <= 0.
 *
 * @param src       input nv12 raw data array (must not overlap dst)
 * @param dst       output nv12 raw data result,
 *                  the memory needs to be allocated outside of the function
 * @param srcWidth  width of the input nv12 image
 * @param srcHeight height of the input nv12 image
 * @param dstWidth  width of the output nv12 image
 * @param dstHeight height of the output nv12 image
 */
void nv12_nearest_scale(uint8_t* __restrict src, uint8_t* __restrict dst,
                        int srcWidth, int srcHeight, int dstWidth, int dstHeight)
{
    if (src == NULL || dst == NULL ||
        srcWidth <= 0 || srcHeight <= 0 || dstWidth <= 0 || dstHeight <= 0) {
        return; /* also protects the divisions below from a zero divisor */
    }

    const size_t sw = (size_t)srcWidth;
    const size_t sh = (size_t)srcHeight;
    const size_t dw = (size_t)dstWidth;
    const size_t dh = (size_t)dstHeight;

    /* 16.16 fixed-point source step per destination pixel. The shift is done
     * in 64 bits so large widths cannot overflow; the +1 compensates for the
     * truncating division. */
    const unsigned long long x_step = (((unsigned long long)sw) << 16) / dw + 1;
    const unsigned long long y_step = (((unsigned long long)sh) << 16) / dh + 1;

    /* ---- Y plane: one source sample per destination pixel ---- */
    for (size_t y = 0; y < dh; ++y) {
        size_t sy = (size_t)((y * y_step) >> 16);
        if (sy >= sh) {
            sy = sh - 1; /* the +1 in the step can overshoot on big upscales */
        }
        const uint8_t *src_row = src + sy * sw;
        uint8_t *dst_row = dst + y * dw;

        for (size_t x = 0; x < dw; ++x) {
            size_t sx = (size_t)((x * x_step) >> 16);
            if (sx >= sw) {
                sx = sw - 1;
            }
            dst_row[x] = src_row[sx];
        }
    }

    /* ---- UV plane: one interleaved (U, V) pair per 2x2 luma block ---- */
    const size_t src_cw = sw / 2; /* UV pairs per source row */
    const size_t src_ch = sh / 2; /* UV rows in the source */
    const size_t dst_cw = dw / 2;
    const size_t dst_ch = dh / 2;
    if (src_cw == 0 || src_ch == 0) {
        return; /* source carries no complete chroma sample */
    }

    const uint8_t *src_uv = src + sh * sw; /* start of source UV plane */
    uint8_t *dst_uv = dst + dh * dw;       /* start of destination UV plane */

    for (size_t cy = 0; cy < dst_ch; ++cy) {
        /* Map the even luma row 2*cy, then halve to get the chroma row. */
        size_t scy = (size_t)(((2 * cy) * y_step) >> 16) / 2;
        if (scy >= src_ch) {
            scy = src_ch - 1;
        }
        const uint8_t *src_pairs = src_uv + scy * sw;
        uint8_t *dst_pairs = dst_uv + cy * dw;

        for (size_t cx = 0; cx < dst_cw; ++cx) {
            size_t scx = (size_t)(((2 * cx) * x_step) >> 16) / 2;
            if (scx >= src_cw) {
                scx = src_cw - 1;
            }
            dst_pairs[2 * cx]     = src_pairs[2 * scx];     /* U */
            dst_pairs[2 * cx + 1] = src_pairs[2 * scx + 1]; /* V */
        }
    }
}
/**
 * Blend four neighbouring samples with bilinear weights.
 *
 * fx and fy are the 8-bit fractional offsets (0..255) of the sampling point
 * from the top-left sample. The four weights always add up to 0x10000, so the
 * value after the final >> 16 lies between the smallest and largest input and
 * needs no clamping.
 */
static uint8_t nv12_blend4(uint8_t tl, uint8_t tr, uint8_t bl, uint8_t br,
                           unsigned int fx, unsigned int fy)
{
    const unsigned int ix = 0x100u - fx;
    const unsigned int iy = 0x100u - fy;
    const unsigned int acc = ix * iy * tl
                           + fx * iy * tr
                           + ix * fy * bl
                           + fx * fy * br;
    return (uint8_t)(acc >> 16);
}

/**
 * Scale an NV12 image with bilinear interpolation.
 *
 * The Y plane is interpolated from the four source pixels surrounding each
 * sampling point. The interleaved UV plane is handled as a half-resolution
 * image of (U, V) pairs and interpolated the same way, U and V separately.
 * Coordinates use 8 fractional bits; neighbour indices are clamped at the
 * right and bottom edges so no sample outside the image is read.
 *
 * Both images are addressed with a row stride equal to their width, and the
 * UV plane is taken to start right after the width * height luma bytes.
 * Widths and heights are expected to be even; for odd sizes only the complete
 * 2x2 chroma blocks are read/written.
 *
 * The call does nothing when a pointer is NULL or any dimension is <= 0.
 *
 * @param src       input nv12 raw data array
 * @param dst       output nv12 raw data result, allocated by the caller
 * @param srcWidth  width of the input nv12 image
 * @param srcHeight height of the input nv12 image
 * @param dstWidth  width of the output nv12 image
 * @param dstHeight height of the output nv12 image
 */
void nv12_bilinear_scale (uint8_t* src, uint8_t* dst,
                          int srcWidth, int srcHeight, int dstWidth,int dstHeight)
{
    if (src == NULL || dst == NULL ||
        srcWidth <= 0 || srcHeight <= 0 || dstWidth <= 0 || dstHeight <= 0) {
        return; /* also protects the divisions below from a zero divisor */
    }

    const size_t sw = (size_t)srcWidth;
    const size_t sh = (size_t)srcHeight;
    const size_t dw = (size_t)dstWidth;
    const size_t dh = (size_t)dstHeight;

    /* ---- Y plane ---- */
    /* Source step per destination pixel with 8 fractional bits. The division
     * rounds down, so the accumulated position never reaches sw (or sh). */
    const unsigned long long x_step = (((unsigned long long)sw) << 8) / dw;
    const unsigned long long y_step = (((unsigned long long)sh) << 8) / dh;

    unsigned long long y_acc = 0;
    for (size_t j = 0; j < dh; ++j, y_acc += y_step) {
        const size_t y0 = (size_t)(y_acc >> 8);
        const size_t y1 = (y0 + 1 < sh) ? y0 + 1 : y0; /* clamp at bottom edge */
        const unsigned int wy = (unsigned int)(y_acc & 0xFF);
        const uint8_t *row0 = src + y0 * sw;
        const uint8_t *row1 = src + y1 * sw;
        uint8_t *out = dst + j * dw;

        unsigned long long x_acc = 0;
        for (size_t i = 0; i < dw; ++i, x_acc += x_step) {
            const size_t x0 = (size_t)(x_acc >> 8);
            const size_t x1 = (x0 + 1 < sw) ? x0 + 1 : x0; /* clamp at right edge */
            const unsigned int wx = (unsigned int)(x_acc & 0xFF);

            out[i] = nv12_blend4(row0[x0], row0[x1], row1[x0], row1[x1], wx, wy);
        }
    }

    /* ---- UV plane: a (sw/2) x (sh/2) grid of interleaved (U, V) pairs ---- */
    const size_t src_cw = sw / 2;
    const size_t src_ch = sh / 2;
    const size_t dst_cw = dw / 2;
    const size_t dst_ch = dh / 2;
    if (src_cw == 0 || src_ch == 0 || dst_cw == 0 || dst_ch == 0) {
        return; /* no complete chroma sample to read or write */
    }

    const uint8_t *src_uv = src + sh * sw; /* start of source UV plane */
    uint8_t *dst_uv = dst + dh * dw;       /* start of destination UV plane */
    const unsigned long long cx_step = (((unsigned long long)src_cw) << 8) / dst_cw;
    const unsigned long long cy_step = (((unsigned long long)src_ch) << 8) / dst_ch;

    unsigned long long cy_acc = 0;
    for (size_t cj = 0; cj < dst_ch; ++cj, cy_acc += cy_step) {
        const size_t y0 = (size_t)(cy_acc >> 8);
        const size_t y1 = (y0 + 1 < src_ch) ? y0 + 1 : y0;
        const unsigned int wy = (unsigned int)(cy_acc & 0xFF);
        const uint8_t *row0 = src_uv + y0 * sw; /* a UV row is sw bytes long */
        const uint8_t *row1 = src_uv + y1 * sw;
        uint8_t *out = dst_uv + cj * dw;

        unsigned long long cx_acc = 0;
        for (size_t ci = 0; ci < dst_cw; ++ci, cx_acc += cx_step) {
            const size_t x0 = (size_t)(cx_acc >> 8);
            const size_t x1 = (x0 + 1 < src_cw) ? x0 + 1 : x0;
            const unsigned int wx = (unsigned int)(cx_acc & 0xFF);
            const size_t u0 = 2 * x0; /* byte offset of the left pair's U */
            const size_t u1 = 2 * x1; /* byte offset of the right pair's U */

            out[2 * ci]     = nv12_blend4(row0[u0], row0[u1],
                                          row1[u0], row1[u1], wx, wy);
            out[2 * ci + 1] = nv12_blend4(row0[u0 + 1], row0[u1 + 1],
                                          row1[u0 + 1], row1[u1 + 1], wx, wy);
        }
    }
}
/**
 * Validate the arguments and resize an NV12 image.
 *
 * @param src input nv12 raw data array
 * @param dst output nv12 raw data array, allocated by the caller
 * @param sw  width of the input image
 * @param sh  height of the input image
 * @param dw  width of the output image
 * @param dh  height of the output image
 * @return 0 on success; -1 (after printing "params error" to stdout) when a
 *         pointer is NULL or any dimension is zero or negative
 */
int ImageResize(uint8_t * src, uint8_t* dst, int sw,
                int sh,int dw,int dh)
{
    /* Negative sizes are rejected as well as zero: they can never describe
     * an image and would otherwise be reported as a successful resize. */
    if (src == NULL || dst == NULL ||
        sw <= 0 || sh <= 0 || dw <= 0 || dh <= 0)
    {
        printf("params error\n");
        return -1;
    }

    /* Nearest-neighbour is the scaler in use; nv12_bilinear_scale takes the
     * same arguments and can be called here instead. */
    nv12_nearest_scale(src, dst, sw, sh, dw, dh);
    return 0;
}
/**
 * Command-line driver.
 *
 * Usage: <Input NV12file> <Output NV12file> <sw> <sh> <dw> <dh>
 *
 * Reads one sw x sh NV12 frame from the input file, resizes it to dw x dh,
 * times repeated resizes of that same frame to report frames per second, and
 * writes the resized frame to the output file.
 *
 * @return 0 on success, -1 on a usage, parameter, allocation or I/O error
 */
int main(int argc,char**argv)
{
    enum { BENCH_ITERATIONS = 1000 };

    int rc = -1;
    int i;
    FILE *inputfp = NULL;
    FILE *outputfp = NULL;
    uint8_t *pInBuffer = NULL;
    uint8_t *pOutBuffer = NULL;
    size_t inBytes;
    size_t outBytes;
    clock_t start;
    clock_t finish;
    float duration;
    float fps;

    if (argc != 7)
    {
        printf("Input Error!\n");
        printf("Usage : <Input NV12file> <Output NV12file> <sw> <sh> <dw> <dh>\n");
        return -1;
    }

    /* Check the sizes before touching any file, so that a bad command line
     * does not truncate an existing output file. */
    int sw = atoi(argv[3]);
    int sh = atoi(argv[4]);
    int dw = atoi(argv[5]);
    int dh = atoi(argv[6]);
    if (sw <= 0 || sh <= 0 || dw <= 0 || dh <= 0)
    {
        fprintf(stderr, "parameter error [sw= %d,sh= %d,dw= %d,dh= %d]\n",sw,sh,dw,dh);
        return -1;
    }

    /* An NV12 frame is width * height luma bytes plus half as many UV bytes. */
    inBytes = (size_t)sw * (size_t)sh * 3 / 2;
    outBytes = (size_t)dw * (size_t)dh * 3 / 2;

    inputfp = fopen(argv[1], "rb");
    if (!inputfp)
    {
        fprintf(stderr, "fopen failed for input file[%s]\n",argv[1]);
        goto cleanup;
    }
    outputfp = fopen(argv[2], "wb");
    if (!outputfp)
    {
        fprintf(stderr, "fopen failed for output file[%s]\n",argv[2]);
        goto cleanup;
    }

    pInBuffer = malloc(inBytes);
    pOutBuffer = malloc(outBytes);
    if (!pInBuffer || !pOutBuffer)
    {
        fprintf(stderr, "malloc failed\n");
        goto cleanup;
    }

    if (fread(pInBuffer, 1, inBytes, inputfp) != inBytes)
    {
        fprintf(stderr, "short read from input file[%s]\n", argv[1]);
        goto cleanup;
    }

    /* First resize produces the frame that is written out below. */
    if (ImageResize(pInBuffer, pOutBuffer, sw, sh, dw, dh) != 0)
    {
        goto cleanup;
    }

    /* compute frame per second; the benchmark uses the same sizes as the
     * buffers that were allocated, so it can never run past them */
    start = clock();
    for (i = 0; i < BENCH_ITERATIONS; ++i)
    {
        ImageResize(pInBuffer, pOutBuffer, sw, sh, dw, dh);
    }
    finish = clock();
    duration = (float)(finish - start) / CLOCKS_PER_SEC;
    /* clock() may not advance for very small frames; avoid dividing by 0 */
    fps = (duration > 0.0f) ? BENCH_ITERATIONS / duration : 0.0f;
    printf("nv12Scaling:%d*%d-->%d*%d,time cost:%6.2ffps\n",sw,sh,dw,dh,fps);

    if (fwrite(pOutBuffer, 1, outBytes, outputfp) != outBytes)
    {
        fprintf(stderr, "fwrite failed for output file[%s]\n", argv[2]);
        goto cleanup;
    }
    rc = 0;

cleanup:
    free(pInBuffer);
    free(pOutBuffer);
    if (inputfp)
    {
        fclose(inputfp);
    }
    /* fclose flushes buffered output, so a failure here means data loss. */
    if (outputfp && fclose(outputfp) != 0)
    {
        fprintf(stderr, "fclose failed for output file[%s]\n", argv[2]);
        rc = -1;
    }
    return rc;
}
Furthermore, I would like to use ARM assembly language to optimize my program, perhaps NEON vectorized assembly for an Android project, or simply the intrinsics of different CPUs instead. However, translating C code to assembly sometimes requires huge changes (including redesigning the algorithm), depending on the features of the CPU.