/*
 * STORE(x, SATD): record the current candidate in slot x of the best-mode list.
 *
 * Expands in a scope that provides mode_scores[], modes[], i_mode and idx, and
 * copies SATD[i_mode][idx] and i_mode into entry x.  SATD is parenthesized so
 * that an expression argument (e.g. a pointer sum) is indexed as a whole.
 *
 * The expansion is a plain brace block rather than do { } while(0) because
 * existing call sites invoke it both with and without a trailing semicolon.
 */
#define STORE(x,SATD)\
{\
	mode_scores[(x)] = (SATD)[i_mode][idx];\
	modes[(x)] = i_mode;\
}
/*
 * ACCUMULATE(x, y): copy entry x of the best-mode list into entry y
 * (both the score in mode_scores[] and the mode in modes[]).
 * Used to shift an entry one rank down before a better candidate is stored.
 * Expects mode_scores[] and modes[] to be in scope at the expansion site.
 *
 * NOTE: this macro is defined a second time further down in this file with the
 * same replacement list; the two definitions must stay token-identical.
 */
#define ACCUMULATE(x,y)\
{\
	mode_scores[y] = mode_scores[x];\
	modes[y] = modes[x];\
}
/*
 * SETUP_BEST_MODES(SATD)
 *
 * Declares, in the enclosing scope, two three-entry arrays:
 *   modes[]       - the candidate modes with the lowest SATD[mode][idx],
 *                   best first;
 *   mode_scores[] - the matching scores, in ascending order.
 * Slots that never receive a candidate keep score COST_MAX and mode 0.
 *
 * Requirements at the expansion site: `i`, `i_mode`, `i_max`, `idx` and
 * `predict_mode[]` are in scope (i and i_mode are overwritten), COST_MAX is
 * defined, and SATD[mode][idx] yields an int-comparable score.
 *
 * The list is maintained by insertion: a candidate that beats rank k pushes
 * ranks k.. one place down before being stored.  In particular a candidate
 * that lands in the middle slot demotes the previous runner-up to the third
 * slot instead of discarding it.
 */
#define SETUP_BEST_MODES(SATD)\
	int modes[3];\
	int mode_scores[3];\
	for( i = 0; i < 3; i++ )\
	{\
		mode_scores[i] = COST_MAX;\
		modes[i] = 0;\
	}\
	for( i = 0; i < i_max; i++ )\
	{\
		int i_cand_score;\
		i_mode = predict_mode[i];\
		i_cand_score = (SATD)[i_mode][idx];\
		if( i_cand_score < mode_scores[0] )\
		{\
			/* new best: shift ranks 0 and 1 down */\
			mode_scores[2] = mode_scores[1];\
			modes[2] = modes[1];\
			mode_scores[1] = mode_scores[0];\
			modes[1] = modes[0];\
			mode_scores[0] = i_cand_score;\
			modes[0] = i_mode;\
		}\
		else if( i_cand_score < mode_scores[1] )\
		{\
			/* new runner-up: the old one drops to rank 2 */\
			mode_scores[2] = mode_scores[1];\
			modes[2] = modes[1];\
			mode_scores[1] = i_cand_score;\
			modes[1] = i_mode;\
		}\
		else if( i_cand_score < mode_scores[2] )\
		{\
			mode_scores[2] = i_cand_score;\
			modes[2] = i_mode;\
		}\
	}
/*
 * STORE16(x, SATD): record the current candidate in slot x of the best-mode
 * list, for score tables indexed by mode only (SATD[i_mode]).
 *
 * Expands in a scope that provides mode_scores[], modes[] and i_mode.  SATD is
 * parenthesized so that an expression argument is indexed as a whole.
 *
 * The expansion is a plain brace block rather than do { } while(0) because
 * existing call sites invoke it both with and without a trailing semicolon.
 */
#define STORE16(x,SATD)\
{\
	mode_scores[(x)] = (SATD)[i_mode];\
	modes[(x)] = i_mode;\
}
/*
 * ACCUMULATE(x, y): copy entry x of the best-mode list into entry y
 * (score and mode).
 *
 * NOTE: this repeats an earlier definition of ACCUMULATE in this file with the
 * same replacement list.  C accepts a redefinition only when the token
 * sequences match, so the two copies must be kept identical (or this one
 * removed).
 */
#define ACCUMULATE(x,y)\
{\
	mode_scores[y] = mode_scores[x];\
	modes[y] = modes[x];\
}
/*
 * SETUP_BEST_MODES16(SATD)
 *
 * Variant of the best-mode list setup for score tables indexed by mode only.
 * Declares, in the enclosing scope, two three-entry arrays:
 *   modes[]       - the candidate modes with the lowest SATD[mode], best first;
 *   mode_scores[] - the matching scores, in ascending order.
 * Slots that never receive a candidate keep score COST_MAX and mode 0.
 *
 * Requirements at the expansion site: `i`, `i_mode`, `i_max` and
 * `predict_mode[]` are in scope (i and i_mode are overwritten) and COST_MAX is
 * defined.
 *
 * The list is maintained by insertion: a candidate that lands in the middle
 * slot demotes the previous runner-up to the third slot instead of
 * discarding it.
 */
#define SETUP_BEST_MODES16(SATD)\
	int modes[3];\
	int mode_scores[3];\
	for( i = 0; i < 3; i++ )\
	{\
		mode_scores[i] = COST_MAX;\
		modes[i] = 0;\
	}\
	for( i = 0; i < i_max; i++ )\
	{\
		int i_cand_score;\
		i_mode = predict_mode[i];\
		i_cand_score = (SATD)[i_mode];\
		if( i_cand_score < mode_scores[0] )\
		{\
			/* new best: shift ranks 0 and 1 down */\
			mode_scores[2] = mode_scores[1];\
			modes[2] = modes[1];\
			mode_scores[1] = mode_scores[0];\
			modes[1] = modes[0];\
			mode_scores[0] = i_cand_score;\
			modes[0] = i_mode;\
		}\
		else if( i_cand_score < mode_scores[1] )\
		{\
			/* new runner-up: the old one drops to rank 2 */\
			mode_scores[2] = mode_scores[1];\
			modes[2] = modes[1];\
			mode_scores[1] = i_cand_score;\
			modes[1] = i_mode;\
		}\
		else if( i_cand_score < mode_scores[2] )\
		{\
			mode_scores[2] = i_cand_score;\
			modes[2] = i_mode;\
		}\
	}

/*
 * Re-select the intra prediction modes of the current macroblock using the
 * x264_rd_cost_*() functions.
 *
 * Depending on h->mb.i_type one of three luma paths runs (I_16x16, I_4x4 or
 * I_8x8); the chroma path at the end runs for every type.  In each luma path
 * a candidate mode is costed unless it is both absent from the three-entry
 * modes[] list built by SETUP_BEST_MODES*() and has a stored SATD above a
 * threshold derived from the SATD of the currently selected mode.
 *
 * h: encoder context.  Prediction-mode and non_zero_count cache entries in
 *    h->mb are updated, and the p_fdec planes are written.
 * a: analysis state.  The i_satd_*_dir tables and i_lambda2 are read;
 *    i_predict16x16, i_predict4x4[], i_predict8x8[] and i_predict8x8chroma
 *    are updated with the winning modes.
 */
static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
{
    uint8_t  *p_src = h->mb.pic.p_fenc[0];
    uint8_t  *p_dst = h->mb.pic.p_fdec[0];

    int i, j, idx, x, y;
    int i_max, i_satd, i_best, i_mode, i_thresh;
    /* NOTE(review): i_pred_mode (and p_src_by below) are assigned but never
     * read in this function. */
    int i_pred_mode;
    int predict_mode[9];

    if( h->mb.i_type == I_16x16 )
    {
        int old_pred_mode = a->i_predict16x16;
        /* threshold: 33/32 of the SATD of the currently selected mode */
        i_thresh = a->i_satd_i16x16_dir[old_pred_mode] * 33/32;

        /* NOTE(review): i_best is seeded from a->i_satd_i16x16 and then compared
         * against x264_rd_cost_mb() results; presumably that field already holds
         * an RD cost for old_pred_mode at this point -- confirm against caller. */
        i_best = a->i_satd_i16x16;
        predict_16x16_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
		
		/* declares modes[] / mode_scores[] in this scope (best-SATD list);
		 * clobbers i and the function-scope i_mode */
		SETUP_BEST_MODES16(a->i_satd_i16x16_dir);
		
        for( i = 0; i < i_max; i++ )
        {
            /* this declaration shadows the function-scope i_mode */
            int i_mode = predict_mode[i];
            /* skip the current mode, and any mode that is neither in modes[]
             * nor within the SATD threshold */
            if( i_mode == old_pred_mode || ((i_mode != modes[0]) && (i_mode != modes[1]) 
			&& (i_mode != modes[2]) && a->i_satd_i16x16_dir[i_mode] > i_thresh ))
                continue;
            h->mb.i_intra16x16_pred_mode = i_mode;
            i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
            COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
        }
    }
    else if( h->mb.i_type == I_4x4 )
    {
        uint32_t pels[4] = {0}; // doesn't need initting, just shuts up a gcc warning
        int i_nnz = 0;
        for( idx = 0; idx < 16; idx++ )
        {
            uint8_t *p_src_by;
            uint8_t *p_dst_by;
            i_best = COST_MAX;
			/* threshold: 10/8 of the SATD of the mode currently chosen for this block */
			i_thresh = a->i_satd_i4x4_dir[a->i_predict4x4[idx]][idx] * 10/8;

            i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
            x = block_idx_x[idx];
            y = block_idx_y[idx];

            /* top-left sample of this 4x4 block in the source / decoded planes */
            p_src_by = p_src + 4*x + 4*y*FENC_STRIDE;
            p_dst_by = p_dst + 4*x + 4*y*FDEC_STRIDE;
            predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );

            if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                /* emulate missing topright samples */
                /* replicates the byte at [3 - FDEC_STRIDE] into the four bytes
                 * starting at [4 - FDEC_STRIDE] */
                *(uint32_t*) &p_dst_by[4 - FDEC_STRIDE] = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
				
			/* declares modes[] / mode_scores[] for this block (best-SATD list) */
			SETUP_BEST_MODES(a->i_satd_i4x4_dir);

            for( i = 0; i < i_max; i++ )
            {
                i_mode = predict_mode[i];
				/* skip modes that are neither in modes[] nor within the SATD threshold */
				if( (i_mode != modes[0]) && (i_mode != modes[1]) && (i_mode != modes[2]) && a->i_satd_i4x4_dir[i_mode][idx] > i_thresh)
                    continue;
                h->predict_4x4[i_mode]( p_dst_by );
                i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode, i_best );

                if( i_best > i_satd )
                {
                    a->i_predict4x4[idx] = i_mode;
                    i_best = i_satd;
                    /* snapshot the four rows at p_dst_by and the nnz cache entry
                     * for the winning mode; later candidates reuse p_dst_by */
                    pels[0] = *(uint32_t*)(p_dst_by+0*FDEC_STRIDE);
                    pels[1] = *(uint32_t*)(p_dst_by+1*FDEC_STRIDE);
                    pels[2] = *(uint32_t*)(p_dst_by+2*FDEC_STRIDE);
                    pels[3] = *(uint32_t*)(p_dst_by+3*FDEC_STRIDE);
                    i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];
                }
            }

            /* write the snapshot of the winning mode back.
             * NOTE(review): pels/i_nnz live outside the idx loop, so if no
             * candidate wins for a block the previous block's values are
             * written here -- confirm this cannot happen. */
            *(uint32_t*)(p_dst_by+0*FDEC_STRIDE) = pels[0];
            *(uint32_t*)(p_dst_by+1*FDEC_STRIDE) = pels[1];
            *(uint32_t*)(p_dst_by+2*FDEC_STRIDE) = pels[2];
            *(uint32_t*)(p_dst_by+3*FDEC_STRIDE) = pels[3];
            h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;

            h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
        }
    }
    else if( h->mb.i_type == I_8x8 )
    {
        DECLARE_ALIGNED( uint8_t, edge[33], 8 );
        for( idx = 0; idx < 4; idx++ )
        {
            uint64_t pels_h = 0;
            uint8_t pels_v[7];
            int i_nnz[3];
            uint8_t *p_src_by;
            uint8_t *p_dst_by;
            /* shadows the function-scope j */
            int j;
            /* threshold: 9/8 of the SATD of the mode currently chosen for this block */
            i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 9/8;

            i_best = COST_MAX;
            i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
            x = idx&1;
            y = idx>>1;

            /* top-left sample of this 8x8 block in the source / decoded planes */
            p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
            p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
            predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
            x264_predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
			
			/* declares modes[] / mode_scores[] for this block (best-SATD list) */
			SETUP_BEST_MODES(a->i_satd_i8x8_dir);
			
            for( i = 0; i < i_max; i++ )
            {
                i_mode = predict_mode[i];
                /* skip modes that are neither in modes[] nor within the SATD threshold */
                if( (i_mode != modes[0]) && (i_mode != modes[1]) && (i_mode != modes[2]) && a->i_satd_i8x8_dir[i_mode][idx] > i_thresh)
                    continue;
                h->predict_8x8[i_mode]( p_dst_by, edge );
                i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode, i_best );

                if( i_best > i_satd )
                {
                    a->i_predict8x8[idx] = i_mode;
                    i_best = i_satd;

                    /* snapshot only row 7 (8 bytes), column 7 of rows 0..6 for
                     * the left-hand blocks, and three nnz cache entries
                     * (4*idx+1 .. 4*idx+3) */
                    pels_h = *(uint64_t*)(p_dst_by+7*FDEC_STRIDE);
                    if( !(idx&1) )
                        for( j=0; j<7; j++ )
                            pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
                    for( j=0; j<3; j++ )
                        i_nnz[j] = h->mb.cache.non_zero_count[x264_scan8[4*idx+j+1]];
                }
            }

            /* write the snapshot of the winning mode back.
             * NOTE(review): pels_v and i_nnz are uninitialized if no candidate
             * wins -- confirm this cannot happen. */
            *(uint64_t*)(p_dst_by+7*FDEC_STRIDE) = pels_h;
            if( !(idx&1) )
                for( j=0; j<7; j++ )
                    p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
            for( j=0; j<3; j++ )
                h->mb.cache.non_zero_count[x264_scan8[4*idx+j+1]] = i_nnz[j];

            x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
        }
    }

    /* RD selection for chroma prediction */
    predict_8x8chroma_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
    if( i_max > 1 )
    {
        /* threshold: 5/4 of the stored chroma SATD */
        i_thresh = a->i_satd_i8x8chroma * 5/4; 

        /* compact predict_mode[] in place, keeping modes below the threshold
         * that differ from the current chroma mode.
         * NOTE(review): i_satd_i8x8chroma_dir is indexed by list position i
         * here, not by predict_mode[i] -- confirm this matches how it is filled. */
        for( i = j = 0; i < i_max; i++ )
            if( a->i_satd_i8x8chroma_dir[i] < i_thresh &&
                predict_mode[i] != a->i_predict8x8chroma )
            {
                predict_mode[j++] = predict_mode[i];
            }
        i_max = j;

        if( i_max > 0 )
        {
            int i_chroma_lambda = i_qp0_cost2_table[h->mb.i_chroma_qp];
            /* the previous thing encoded was x264_intra_rd(), so the pixels and
             * coefs for the current chroma mode are still around, so we only
             * have to recount the bits. */
            i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0, COST_MAX );
            for( i = 0; i < i_max; i++ )
            {
                i_mode = predict_mode[i];
                /* predict both chroma planes with the candidate mode */
                h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
                h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
                /* if we've already found a mode that needs no residual, then
                 * probably any mode with a residual will be worse.
                 * so avoid dct on the remaining modes to improve speed. */
                i_satd = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00, i_best );
                COPY2_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode );
            }
            h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
        }
    }
}
