/*
**************************************************************************
This Source Code Form is subject to the terms of the Mozilla Public
License, v. 2.0. If a copy of the MPL was not distributed with this file,
You can obtain one at http://mozilla.org/MPL/2.0/.

Copyright (c) 2014, Marvell International Ltd.

Alternatively, this software may be distributed under the terms of the GNU
General Public License Version 2, and any use shall comply with the terms and
conditions of the GPL.  A copy of the GPL is available at
http://www.gnu.org/licenses/old-licenses/gpl-2.0.html

THE FILE IS DISTRIBUTED AS-IS, WITHOUT WARRANTY OF ANY KIND, AND THE
IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE
ARE EXPRESSLY DISCLAIMED.  The GPL license provides additional details about
this warranty disclaimer.
******************************************************************************
*/


/**
 *
 * \file rotate_api.c
 *
 * \brief Driver routines for Rotate HW block of ASIC.
 *
 * See the rotate_api.h file for details on usage.
 *
 */

#include "rotate_api.h"

#define _ALLOW_ROTATE_PRIV_INCLUSION_H
#include "rotate-setup-private.h"
#include "map_mem.h"
#include "logger.h"

#define DO_CHAIN 1

#define MAX_LINES_PROCESSED 128

#define DBG_PRFX "ROTATE: "
#define LOGGER_MODULE_MASK DEBUG_LOGGER_MODULE_DEVICES | LOGGER_SUBMODULE_BIT( 16 )

/**
 * \brief State of one rotate-and-place session.
 *
 * Allocated by rotate_and_place_start(), which fills in the output geometry,
 * bpp, strip list and hardware handle.  The per-image fields (rotate, flip_*,
 * image_*, offset_*, dst_hw_addr_list) are written by
 * rotate_and_place_add_image().
 */
struct rotate_and_place_s
{
    int out_total_width;  ///< in pixels of the output image, aka page
    int out_total_height; ///< in pixels
    int bpp;          ///< 1, 2, 4, 8, 24, 32 are accepted by rotate_and_place_start()
    int out_strip_height; ///< in pixels, a square, 128 is common
    int rotate;       ///< 0, 90, 180, 270  Clockwise
    int flip_x;       ///< 0 or 1; flip_x and flip_y may not both be set
    int flip_y;       ///< 0 or 1; flip_x and flip_y may not both be set
    int image_width;  ///< input in pixels
    int image_height; ///< input in pixels
    int offset_x;     ///< destination start position offset in pixels
    int offset_y;     ///< destination start position offset in pixels
    uint32_t mask, or_value;    ///< written to the rotate core MASK and REPLACE registers
    uint32_t *dst_hw_addr_list; ///< DMA address of each out_buf_strip[] entry, num_strips long
    int num_strips;   ///< output number of strips out_buf_strip[num_strips]
    struct BigBuffer_s **out_buf_strip; ///< caller-supplied output strips (pointer is stored, not copied)
    struct rotate_device rotate_device; ///< hardware handle set up by init_rotate_hardware()
};

/**
 * \brief One UDMA descriptor as used by the rotate input and output DMAs.
 *
 * NOTE(review): the field order presumably mirrors the layout the UDMA
 * hardware reads from memory -- confirm against the ASIC documentation
 * before reordering or resizing anything here.
 */
struct rotate_udma_desc_s
{
    uint32_t control;
    uint32_t length;
    uint32_t src;
    uint32_t next;
    uint32_t source_lo0;
    uint32_t source_hi0;
    uint32_t source_lo1;
    uint32_t source_hi1;
    void *this_hw_addr; ///< address written to the UDMA UDR register to start this descriptor
};

// The one active session; non-NULL means rotate_and_place_start() reports busy.
static struct rotate_and_place_s *rotate_place_singleton = NULL;
// Descriptor pointer tables, allocated on first use in rotate_and_place_add_image().
static struct rotate_udma_desc_s **iudma_desc = NULL;
static struct rotate_udma_desc_s **oudma_desc = NULL;
// Descriptor counters, reset to 0 by rotate_and_place_start().
static uint32_t num_idma_desc, num_odma_desc;
// Per-image progress, both reset at the top of rotate_and_place_add_image().
static uint32_t current_output_line_cnt;
static int32_t strip_width_count;
// Posted by notify_oudma_interrupt(), waited on in execute_the_chain().
static sem_t interrupt_sema;

// Saved uncached descriptors: one for the input UDMA and one for the output
// UDMA.  They are allocated once in rotate_and_place_start() and handed out by
// allocate_new_desc().
#define	ROTATE_MAX_DESC_SAVE	2
#define	ROTATE_DESC_SAVE_IUDMA	0
#define	ROTATE_DESC_SAVE_OUDMA	1
// Address filled in by MEM_MALLOC_UNCACHED for each saved descriptor; it is
// later stored in the descriptor's this_hw_addr.
static void					*rotate_save_addr[ROTATE_MAX_DESC_SAVE];
// NOTE(review): not declared static, unlike the rest of the file state --
// confirm whether another translation unit references it.
struct rotate_udma_desc_s	*rotate_save_desc[ROTATE_MAX_DESC_SAVE];

// Non-zero once the saved descriptors above have been allocated.
static int	rotate_init_flag = 0;

/**
 * \brief IDMA interrupt callback installed in the rotate device.
 *
 * Only traces that it was called.
 *
 * \param cause Interrupt cause supplied by the caller (unused).
 * \return Always 0.
 */
static int notify_idma_interrupt(int cause)
{
    (void)cause;

    DBG_PRINTF_INFO("%s %d: Entry\n", __func__, __LINE__);

    return 0;
}

/**
 * \brief Input UDMA interrupt callback installed in the rotate device.
 *
 * Only traces the interrupt cause.
 *
 * \param cause Interrupt cause supplied by the caller.
 * \return Always 0.
 */
static int notify_iudma_interrupt(int cause)
{
    DBG_PRINTF_INFO("%s %d: Entry cause %d\n", __func__, __LINE__, cause);

    return 0;
}

/**
 * \brief ODMA interrupt callback installed in the rotate device.
 *
 * Only traces that it was called.
 *
 * \param cause Interrupt cause supplied by the caller (unused).
 * \return Always 0.
 */
static int notify_odma_interrupt(int cause)
{
    (void)cause;

    DBG_PRINTF_INFO("%s %d: Entry\n", __func__, __LINE__);

    return 0;
}
// Last non-INT_COMPLETE cause seen by notify_oudma_interrupt(); cleared by
// execute_the_chain() before each transfer and checked after the wait.
static int got_interrupt = 0;

/**
 * \brief Output UDMA interrupt callback installed in the rotate device.
 *
 * Records any cause other than INT_COMPLETE in got_interrupt, then wakes the
 * waiter by posting interrupt_sema.  The semaphore is posted for every call,
 * whatever the cause.
 *
 * \param cause Interrupt cause supplied by the caller.
 * \return Always 0.
 */
static int notify_oudma_interrupt(int cause)
{
    DBG_PRINTF_INFO("%s %d: Entry cause %d\n", __func__, __LINE__, cause);

    if (INT_COMPLETE != cause)
    {
        got_interrupt = cause;
    }

    sem_post(&interrupt_sema);

    return 0;
}

/**
 * \brief Begin a rotate-and-place session.
 *
 * Validates the output geometry, allocates and zero-fills the session state,
 * brings up the rotate hardware, installs the interrupt callbacks and, on the
 * first successful call only, allocates the saved uncached DMA descriptors.
 * Only one session may exist at a time.
 *
 * \param output_width  Output page width in pixels, must be > 0.
 * \param output_height Output page height in pixels, must be > 0.
 * \param bpp           Bits per pixel: 1, 2, 4, 8, 24 or 32.
 * \param strip_height  Output strip height in pixels, must be > 0.
 * \param num_strips    Number of entries in out_buf_strip, must be > 0.
 * \param out_buf_strip Output strips; every entry must be non-NULL.  The
 *                      array pointer itself is stored in the session, so it
 *                      must stay valid for the life of the session.
 *
 * \return The new session, or NULL on invalid arguments, allocation failure,
 *         hardware init failure, or when a session is already active.
 */
struct rotate_and_place_s* rotate_and_place_start(int output_width,
                                                  int output_height, int bpp, int strip_height, int num_strips,
                                                  struct BigBuffer_s *out_buf_strip[])
{
    int i;
    DBG_PRINTF_INFO("%s Entry\n", __func__);

    if (rotate_place_singleton)
    {
        DBG_PRINTF_ERR("error, busy\n");
        // NOTE(review): errno values are conventionally positive (EBUSY, not
        // -EBUSY).  Left unchanged because callers may test for this value.
        errno = -EBUSY;
        return NULL;
    }

    // Reject zero AND negative geometry; a negative count or size can never
    // describe a valid page.
    if (output_width <= 0 || output_height <= 0 ||
        strip_height <= 0 || num_strips <= 0 || !bpp)
    {
        DBG_PRINTF_ERR("error out_width %d out_height %d str_hei %d num_str %d bpp %d\n",
                       output_width, output_height, strip_height, num_strips, bpp);
        return NULL;
    }
    // see if bpp is valid, fail if not
    switch (bpp)
    {
        case 1:
        case 2:
        case 4:
        case 8:
        case 24:
        case 32:
            break;
        default:
            DBG_PRINTF_ERR("Invalid bpp; bpp = %d\n", bpp);
            return NULL;
    }
    DBG_PRINTF_INFO("%s numstrips=%d\n", __func__, num_strips);

    if (!out_buf_strip)
    {
        DBG_PRINTF_ERR("%s %d: out_buf_strip is NULL\n", __func__, __LINE__);
        return NULL;
    }
    for (i = 0; i < num_strips; i++)
    {
        if (!out_buf_strip[i])
        {
            DBG_PRINTF_ERR("%s %d: out_buf_strip[%d] is NULL\n", __func__, __LINE__, i);
            return NULL;
        }
    }

    rotate_place_singleton = (struct rotate_and_place_s *)
        MEM_MALLOC(sizeof(struct rotate_and_place_s));

    DBG_PRINTF_INFO("%s after singleton\n", __func__);
    num_idma_desc = num_odma_desc = 0;
    if (!rotate_place_singleton)
    {
        DBG_PRINTF_ERR("%s %d: Memory allocation failed", __func__, __LINE__);
        return NULL;
    }

    // Start from a known state: mask/or_value are written to hardware later
    // and dst_hw_addr_list is a pointer, so none of them may hold garbage.
    memset(rotate_place_singleton, 0, sizeof(*rotate_place_singleton));

    rotate_place_singleton->out_total_width = output_width;
    rotate_place_singleton->out_total_height = output_height;
    rotate_place_singleton->bpp = bpp;
    rotate_place_singleton->out_strip_height = strip_height;
    rotate_place_singleton->num_strips = num_strips;
    rotate_place_singleton->out_buf_strip = out_buf_strip;

    if (SYS_OK != init_rotate_hardware(
            &rotate_place_singleton->rotate_device)) goto err_out;

    rotate_place_singleton->rotate_device.notify_idma_interrupt =
        notify_idma_interrupt;
    rotate_place_singleton->rotate_device.notify_iudma_interrupt =
        notify_iudma_interrupt;
    rotate_place_singleton->rotate_device.notify_odma_interrupt =
        notify_odma_interrupt;
    rotate_place_singleton->rotate_device.notify_oudma_interrupt =
        notify_oudma_interrupt;

    // Report, but do not fail on, a sem_init error: the previous behaviour
    // was to ignore it, and a broken semaphore surfaces as an error from
    // the wait in execute_the_chain().
    if (sem_init(&interrupt_sema, 0, 0) != 0)
    {
        DBG_PRINTF_ERR("%s %d: sem_init failed, errno %d\n", __func__, __LINE__, errno);
    }

    // Allocate the saved descriptor areas once; they are reused by every
    // later session.
    if (rotate_init_flag == 0)
    {
        for (i = 0; i < ROTATE_MAX_DESC_SAVE; i++)
        {
            // Skip entries that survived an earlier, partially failed attempt.
            if (rotate_save_desc[i] == NULL)
            {
                rotate_save_desc[i] = MEM_MALLOC_UNCACHED(&rotate_save_addr[i], sizeof(struct rotate_udma_desc_s), e_32_byte);
            }
            if (rotate_save_desc[i] == NULL)
            {
                DBG_PRINTF_ERR("%s %d: descriptor %d allocation failed\n", __func__, __LINE__, i);
                // NOTE(review): the hardware brought up by
                // init_rotate_hardware() above is not torn down on this path.
                goto err_out;
            }
            // Arguments are cast to match %x / %d exactly (sizeof yields size_t).
            printf("ROTATE: [%d] descp:%x addr:%x size:%d\n", i,
                   (unsigned int)(uintptr_t)rotate_save_desc[i],
                   (unsigned int)(uintptr_t)rotate_save_addr[i],
                   (int)sizeof(struct rotate_udma_desc_s));
        }
        // Latch only after every descriptor exists, so a failed attempt is retried.
        rotate_init_flag = 1;
    }

    DBG_PRINTF_INFO("%s\n", __func__);
    return rotate_place_singleton;

err_out:
    MEM_FREE_AND_NULL(rotate_place_singleton);
    return NULL;
}
void init_hw(struct rotate_and_place_s *rotate)
{

 //   uninitialize_rotate_hardware(
 //       &rotate->rotate_device);
    rotate->rotate_device.rotate_iudma_regs->UCR = 0;
    rotate->rotate_device.rotate_oudma_regs->UCR = 0;
    init_rotate_hardware(
            &rotate->rotate_device);

    rotate->rotate_device.notify_idma_interrupt =
        notify_idma_interrupt;
    rotate->rotate_device.notify_iudma_interrupt =
        notify_iudma_interrupt;
    rotate->rotate_device.notify_odma_interrupt =
        notify_odma_interrupt;
    rotate->rotate_device.notify_oudma_interrupt =
        notify_oudma_interrupt;
}

/**
 * \brief Check that a destination offset is not negative.
 * \param offset Offset in pixels.
 * \return SYS_OK when offset >= 0, otherwise -EINVAL.
 */
static int is_offset_valid(int offset)
{
    return (offset >= 0) ? SYS_OK : -EINVAL;
}


/**
 * \brief Check that a flip flag is exactly 0 or 1.
 * \param flip Flag value to check.
 * \return SYS_OK for 0 or 1, otherwise -EINVAL.
 */
static int is_flip_valid(int flip)
{
    switch (flip)
    {
        case 0:
        case 1:
            return SYS_OK;
        default:
            return -EINVAL;
    }
}

/**
 * \brief Setup the odma core and oudma registers.
 *
 * First writes row-mode defaults for ITS, ITOFFST and ICFG and programs the
 * output UDMA control/interrupt registers, then overrides the geometry
 * registers according to rotate_place->rotate.  A rotation other than
 * 0/90/180/270 leaves only the defaults in place.  The flip handling negates
 * a register in place (register read, multiply by -1, write back), so the
 * order of the writes below matters.
 *
 * \param rotate_place The structure with the info on the rotate
 *        requirements; rotate, flip_x, flip_y, bpp, out_total_width,
 *        image_width, image_height and out_strip_height are read.
 * \param tile_height The height of the tile.
 * \param tile_width The width of the tile in pixels (converted to bytes with
 *        bpp / 8).
 * \param total_width Value written to IIH for the 90 and 270 rotations.
 */
static void config_odma_core_hw(const struct rotate_and_place_s *const rotate_place, uint32_t tile_height, uint32_t tile_width, uint32_t total_width)
{
    // Defaults; ITS and ITOFFST are rewritten below for 90 and 270.
    rotate_place->rotate_device.rotate_odma_core_regs->ITS = (tile_height << 16) + tile_width * rotate_place->bpp / 8; // tile size
    // Tile offset register
    rotate_place->rotate_device.rotate_odma_core_regs->ITOFFST = (tile_width * rotate_place->bpp / 8);
    rotate_place->rotate_device.rotate_odma_core_regs->ICFG = 0;    // put down in row mode
    rotate_place->rotate_device.rotate_oudma_regs->UCR = 5;   // 2 beats, 1 enable
    // NOTE(review): presumably enables the UDMA interrupt sources and clears
    // anything pending -- confirm the UIER/UICR/IICR bit meanings.
    rotate_place->rotate_device.rotate_oudma_regs->UIER = 0x1f;
    rotate_place->rotate_device.rotate_oudma_regs->UICR = 0xffffffff;
    rotate_place->rotate_device.rotate_odma_core_regs->IICR = 1;
    switch (rotate_place->rotate)
    {
        case 90:

            //
            // for 90 or 270, the tile width is the same as the height of the output tile.
            rotate_place->rotate_device.rotate_odma_core_regs->ITS = (tile_width << 16) + tile_height * rotate_place->bpp / 8; // tile size
            rotate_place->rotate_device.rotate_odma_core_regs->ITOFFST = tile_height * rotate_place->bpp / 8;
            // upper 16 bits: output page width in bytes, lower 16 bits: input image height in bytes
            rotate_place->rotate_device.rotate_odma_core_regs->ILW = ((rotate_place->out_total_width * rotate_place->bpp / 8) << 16) + (rotate_place->image_height * rotate_place->bpp / 8);
            rotate_place->rotate_device.rotate_odma_core_regs->IIH = total_width;    // height register
            // swath offset register
            rotate_place->rotate_device.rotate_odma_core_regs->ISOFFST = tile_width * rotate_place->out_total_width * rotate_place->bpp / 8;
            //
            // if flip_x set take the tiles from right to left.
            if (rotate_place->flip_x)
            {
                rotate_place->rotate_device.rotate_odma_core_regs->ITOFFST *= -1;
            }
            // if flip_y set take the swaths from bottom to top.
            if (rotate_place->flip_y)
            {
                rotate_place->rotate_device.rotate_odma_core_regs->ISOFFST *= -1;
            }
            break; 
        case 270:

            //
            // for 90 or 270, the tile width is the same as the height of the output tile.
            rotate_place->rotate_device.rotate_odma_core_regs->ITS = (tile_width << 16) + tile_height * rotate_place->bpp / 8; // tile size
            rotate_place->rotate_device.rotate_odma_core_regs->ITOFFST = tile_height * rotate_place->bpp / 8;
            // upper 16 bits: output page width in bytes, lower 16 bits: input image height in bytes
            rotate_place->rotate_device.rotate_odma_core_regs->ILW = ((rotate_place->out_total_width * rotate_place->bpp / 8) << 16) + 
                                                (rotate_place->image_height * rotate_place->bpp / 8);
            // swath offset register
            rotate_place->rotate_device.rotate_odma_core_regs->ISOFFST = tile_width * rotate_place->out_total_width * rotate_place->bpp / 8;
            rotate_place->rotate_device.rotate_odma_core_regs->IIH = total_width;    // height register
            // if flip_x set take the tiles  from right to left
            if (rotate_place->flip_x)
            {
                rotate_place->rotate_device.rotate_odma_core_regs->ITOFFST *= -1;
            }
            // if not flip_y take the swaths from bottom to top.
            if (!rotate_place->flip_y)
            {
                rotate_place->rotate_device.rotate_odma_core_regs->ISOFFST *= -1;
            }
            break; 
        case 0:
            // swath offset register
            rotate_place->rotate_device.rotate_odma_core_regs->ISOFFST = tile_height * rotate_place->out_total_width * rotate_place->bpp / 8; // size of a swath
            // line length and line stride register
            rotate_place->rotate_device.rotate_odma_core_regs->ILW = ((rotate_place->out_total_width * rotate_place->bpp / 8) << 16) + 
                                            (rotate_place->image_width * rotate_place->bpp / 8);

            // height register
            rotate_place->rotate_device.rotate_odma_core_regs->IIH = rotate_place->out_strip_height;
            // if flip_x set take the swaths from bottom to top
            if (rotate_place->flip_x)
            {
                rotate_place->rotate_device.rotate_odma_core_regs->ISOFFST *= -1;
            }
            // if flip_y set take the tiles from right to left.
            if (rotate_place->flip_y)
            {
                rotate_place->rotate_device.rotate_odma_core_regs->ITOFFST *= -1;
            }
            break; 
        case 180:
            // swath offset register
            rotate_place->rotate_device.rotate_odma_core_regs->ISOFFST = tile_height * rotate_place->out_total_width * rotate_place->bpp / 8; // size of a swath
            // line width and line stride register
            rotate_place->rotate_device.rotate_odma_core_regs->ILW = ((rotate_place->out_total_width * rotate_place->bpp / 8) << 16) + 
                                            (rotate_place->image_width * rotate_place->bpp / 8);
            // height register
            rotate_place->rotate_device.rotate_odma_core_regs->IIH = rotate_place->out_strip_height;
            // if flip_x is NOT set, negate the swath offset (swaths taken in
            // the opposite order to the 0 degree case)
            if (!rotate_place->flip_x)
            {
                rotate_place->rotate_device.rotate_odma_core_regs->ISOFFST *= -1;
            }
            // if flip_y is NOT set, negate the tile offset (tiles taken in
            // the opposite order to the 0 degree case)
            if (!rotate_place->flip_y)
            {
                rotate_place->rotate_device.rotate_odma_core_regs->ITOFFST *= -1;
            }
            break; 
        default:
            // unsupported rotation: only the defaults written above apply
            break;

    }
    if (rotate_place->bpp == 24)
    {
        rotate_place->rotate_device.rotate_odma_core_regs->ICFG |= 4;       // set the bit to do 24 bpp xfers
    }
}
/**
 * \brief Setup the idma and iudma registers 
 *
 * Writes ILW, ITS and IIH from the input image geometry, programs the input
 * UDMA control/interrupt registers, then sets the read direction (ICFG) and
 * the tile/swath offsets according to rotate_place->rotate.  A rotation other
 * than 0/90/180/270 leaves ICFG, ITOFFST and ISOFFST unwritten by this
 * function.
 *
 * \param rotate_place The structure with the info on the rotate 
 *        requirements
 * \param tile_height The height of the tile 
 * \param tile_width The width of the tile in pixels 
 * \param strip_height Value written to IIH for the 90 and 270 rotations;
 *        for 0 and 180 IIH keeps rotate_place->image_height.
 * 
 */
static void config_idma_core_hw(const struct rotate_and_place_s *const rotate_place, uint32_t tile_height, uint32_t tile_width, uint32_t strip_height)
{
    // both halves of ILW carry the input image width in bytes
    rotate_place->rotate_device.rotate_idma_core_regs->ILW = ((rotate_place->image_width * rotate_place->bpp / 8) << 16) + (rotate_place->image_width * rotate_place->bpp / 8); //
    rotate_place->rotate_device.rotate_idma_core_regs->ITS = (tile_height << 16) + tile_width * rotate_place->bpp / 8; // tile size

    // default height; overridden with strip_height for 90 and 270 below
    rotate_place->rotate_device.rotate_idma_core_regs->IIH = rotate_place->image_height;

    rotate_place->rotate_device.rotate_iudma_regs->UCR = 5;   // 2 beats, 1 enable
    // NOTE(review): presumably enables the UDMA interrupt sources and clears
    // anything pending -- confirm the UIER/UICR/IICR bit meanings.
    rotate_place->rotate_device.rotate_iudma_regs->UIER = 0x1f;
    rotate_place->rotate_device.rotate_iudma_regs->UICR = 0xffffffff;
    rotate_place->rotate_device.rotate_idma_core_regs->IICR = 1;
    switch (rotate_place->rotate)
    {
        case 90:
            //
            // For 90 we are going to read from the bottom to the top in column mode.  
            //
            // (same ILW value as written above)
            rotate_place->rotate_device.rotate_idma_core_regs->ILW = ((rotate_place->image_width * rotate_place->bpp / 8) << 16) + (rotate_place->image_width * rotate_place->bpp / 8); //
            rotate_place->rotate_device.rotate_idma_core_regs->ICFG = 3;    // bottom to top, column mode
            rotate_place->rotate_device.rotate_idma_core_regs->ITOFFST =  ( tile_width * tile_height * rotate_place->bpp) / 8;
            rotate_place->rotate_device.rotate_idma_core_regs->ITOFFST *= -1;  // since we are reading from bottom to top. the tile offset is negative.
            rotate_place->rotate_device.rotate_idma_core_regs->ISOFFST = tile_width *  rotate_place->bpp / 8;  // swath positive, swaths going left to right
            rotate_place->rotate_device.rotate_idma_core_regs->IIH = strip_height;
            break;
        case 180:
        case 0:
            //
            // for 0, 180 rotate read input from top to bottom in row mode.
            rotate_place->rotate_device.rotate_idma_core_regs->ICFG = 0;    // top to bottom
            // swath offset: one tile-height of full input lines, in bytes
            rotate_place->rotate_device.rotate_idma_core_regs->ISOFFST =  ( rotate_place->image_width * tile_height * rotate_place->bpp) / 8;
            // tile offset: one tile width, in bytes
            rotate_place->rotate_device.rotate_idma_core_regs->ITOFFST = tile_width *  rotate_place->bpp / 8;
            break;

        case 270:
            //
            // for 270 read the data from top to bottom in column mode.
            // (same ILW value as written above)
            rotate_place->rotate_device.rotate_idma_core_regs->ILW = ((rotate_place->image_width * rotate_place->bpp / 8) << 16) + (rotate_place->image_width * rotate_place->bpp / 8); //
            rotate_place->rotate_device.rotate_idma_core_regs->ICFG = 1;    // top to bottom, column mode.
            rotate_place->rotate_device.rotate_idma_core_regs->ITOFFST =  ( tile_width * tile_height * rotate_place->bpp) / 8;
            rotate_place->rotate_device.rotate_idma_core_regs->ISOFFST = tile_width *  rotate_place->bpp / 8;
            rotate_place->rotate_device.rotate_idma_core_regs->IIH = strip_height;
            break;
        default:
            // unsupported rotation: direction and offsets are left untouched
            break;

    }
    if (rotate_place->bpp == 24)
    {
        rotate_place->rotate_device.rotate_idma_core_regs->ICFG |= 4;       // set the bit to do 24 bpp xfers
    }
}

/**
 * \brief Reset the rotate core and program its CTRL, MASK and REPLACE registers.
 *
 * The CTRL value is the OR of three independent parts: the bits-per-pixel
 * code, the rotation code (plus the literal 0x2 for 90 and 180, described in
 * the original source as "go from bottom to top") and one mirror bit (0x20
 * for flip_y, otherwise 0x10 for flip_x).  An unsupported bpp or rotation
 * contributes no bits.
 *
 * NOTE(review): the MIRROR_X_* / MIRROR_Y_* macros below are built from
 * ROTATION_BITS_START_POS rather than MIRROR_BITS_START_POS and are not used
 * by this function, which uses the literals 0x10 / 0x20 -- confirm against
 * the register definition before relying on them.
 * NOTE(review): nothing here waits for the soft reset to complete.
 *
 * \param rotate_place Session state; bpp, rotate, flip_x, flip_y, mask and
 *        or_value are read.
 * \return 0 on success, -1 when both flip_x and flip_y are set (the soft
 *         reset has been written by then, but CTRL/MASK/REPLACE have not).
 */
static int config_rotate_core_hw(const struct rotate_and_place_s *const rotate_place)
{
#define	BPP_BITS_START_POS	ROTATE_CORE_CTRL_BPP_SHIFT
#define	ROTATION_BITS_START_POS	ROTATE_CORE_CTRL_ROT_SHIFT
#define	MIRROR_BITS_START_POS	ROTATE_CORE_CTRL_MIRROR_SHIFT
#define	DUAL_IN_BUF_START_POS	ROTATE_CORE_CTRL_DUAL_IN_BUF_SHIFT
#define	IDMA_BOT_TOP_START_POS	ROTATE_CORE_CTRL_BOTTOM_TO_TOP_SHIFT
#define	ROTATE_BYPASS_START_POS	ROTATE_CORE_CTRL_BYPASS_SHIFT

#define	BPP_NONE	((~(0x0<<BPP_BITS_START_POS)))
#define	BPP_BITS_1	((0x00<<BPP_BITS_START_POS))
#define	BPP_BITS_2	((0x01<<BPP_BITS_START_POS))
#define	BPP_BITS_4	((0x02<<BPP_BITS_START_POS))
#define	BPP_BITS_8	((0x03<<BPP_BITS_START_POS))
#define	BPP_BITS_24	((0x04<<BPP_BITS_START_POS))
#define	BPP_BITS_32	((0x05<<BPP_BITS_START_POS))
#define	SET_BPP_BITS(x)	(BPP_BITS_##x)

#define	ROTATION_NONE	((~(0x00<<ROTATION_BITS_START_POS)))
#define	ROTATION_0	((0x00<<ROTATION_BITS_START_POS))
#define	ROTATION_90	((0x01<<ROTATION_BITS_START_POS))
#define	ROTATION_180	((0x02<<ROTATION_BITS_START_POS))
#define	ROTATION_270	((0x03<<ROTATION_BITS_START_POS))
#define	SET_ROTATION_BITS(x)	(ROTATION_##x)

#define	MIRRORING_NONE	((~(0x00<<MIRROR_BITS_START_POS)))
#define	MIRROR_X_1	((0x01<<ROTATION_BITS_START_POS))
#define	MIRROR_X_0	((~(0x01<<ROTATION_BITS_START_POS)))
#define	MIRROR_Y_1	((0x02<<ROTATION_BITS_START_POS))
#define	MIRROR_Y_0	((~(0x02<<ROTATION_BITS_START_POS)))
#define	SET_MIRROR_X(x)	(MIRROR_X_##x)
#define	SET_MIRROR_Y(x)	(MIRROR_Y_##x)

    int bpp_field = 0x00;
    int rotation_field = 0x00;
    int mirror_field = 0x00;
    int rotate_core;

    //
    // Reset the rotate core first.  An odd length image can leave the core in
    // an odd state; the reset cleans that up.
    //
    rotate_place->rotate_device.rotate_core_regs->SOFT_RESET = 1;

    // Mirroring in both directions at once is not supported; bail out before
    // any control register is written.
    if (rotate_place->flip_y && rotate_place->flip_x)
    {
        DBG_PRINTF_ERR("flip y and x not allowed at same time\n");
        return -1;
    }

    // Bits-per-pixel code.
    switch (rotate_place->bpp)
    {
        case 1:  bpp_field = SET_BPP_BITS(1);  break;
        case 2:  bpp_field = SET_BPP_BITS(2);  break;
        case 4:  bpp_field = SET_BPP_BITS(4);  break;
        case 8:  bpp_field = SET_BPP_BITS(8);  break;
        case 24: bpp_field = SET_BPP_BITS(24); break;
        case 32: bpp_field = SET_BPP_BITS(32); break;
        default: break;
    }

    // Rotation code; 90 and 180 additionally set the literal 0x2.
    switch (rotate_place->rotate)
    {
        case 0:
            rotation_field = SET_ROTATION_BITS(0);
            break;
        case 90:
            rotation_field = SET_ROTATION_BITS(90) | (1 << 1);
            break;
        case 180:
            rotation_field = SET_ROTATION_BITS(180) | 2;
            break;
        case 270:
            rotation_field = SET_ROTATION_BITS(270);
            break;
        default:
            break;
    }

    // At most one mirror bit; flip_y takes precedence over flip_x.
    if (rotate_place->flip_y)
    {
        mirror_field = 0x20;
    }
    else if (rotate_place->flip_x)
    {
        mirror_field = 0x10;
    }

    rotate_core = bpp_field | rotation_field | mirror_field;

    DBG_PRINTF_INFO("ctrl register %x rotate %d\n", rotate_core, rotate_place->rotate);
    rotate_place->rotate_device.rotate_core_regs->CTRL = rotate_core;
    rotate_place->rotate_device.rotate_core_regs->MASK = rotate_place->mask;
    rotate_place->rotate_device.rotate_core_regs->REPLACE = rotate_place->or_value;
    return 0;
}

/**
 * \brief Describe the next input image and where it lands on the output page.
 *
 * Allocates the input/output descriptor pointer tables on first use,
 * validates the placement, records the image parameters in the session,
 * resets the per-image counters and maps every output strip for DMA, storing
 * the resulting addresses in rotate_place->dst_hw_addr_list.
 *
 * For 90 and 270 the rotated image occupies page_height pixels horizontally
 * and page_width pixels vertically, and both rotations are bounds-checked
 * that way.
 *
 * NOTE(review): a fresh dst_hw_addr_list is allocated on every successful
 * call; releasing the previous list is not visible in this part of the file --
 * confirm it is freed elsewhere.
 *
 * \param rotate_place Session returned by rotate_and_place_start().
 * \param offset_x     Destination x offset in pixels, >= 0.
 * \param offset_y     Destination y offset in pixels, >= 0.
 * \param rotate       0, 90, 180 or 270.
 * \param flip_x       0 or 1.
 * \param flip_y       0 or 1; flip_x and flip_y may not both be 1.
 * \param page_width   Input image width in pixels, > 0.
 * \param page_height  Input image height in pixels, > 0.
 *
 * \return 0 on success, -ENOMEM on allocation failure, -EINVAL on bad
 *         arguments, or INVALID_DST_WIDTH / INVALID_DST_HEIGHT /
 *         INVALID_ROTATION when the placement does not fit or the rotation
 *         is unsupported.
 */
int rotate_and_place_add_image(struct rotate_and_place_s *rotate_place,
                               int offset_x, int offset_y, int rotate, int flip_x, int flip_y,
                               int page_width, int page_height)
{
    enum { DESC_TABLE_ENTRIES = 100 };  // capacity of each descriptor pointer table
    int ret = 0x00, i;
    void *hw_addr;

    if (!rotate_place) return -EINVAL;

    // get the input dma descriptors
    if (!iudma_desc)
    {
        iudma_desc = MEM_MALLOC(sizeof(uintptr_t *) * DESC_TABLE_ENTRIES);
        if (!iudma_desc) return -ENOMEM;
    }
    //
    // get the output dma descriptors

    if (!oudma_desc)
    {
        oudma_desc = MEM_MALLOC_ALIGN(sizeof(uintptr_t *) * DESC_TABLE_ENTRIES,
                                      e_32_byte);
        if (!oudma_desc) return -ENOMEM;
    }

    // Zero and negative image dimensions are both meaningless.
    if (page_width <= 0 || page_height <= 0 ||
        (-EINVAL == is_flip_valid(flip_x)) ||
        (-EINVAL == is_flip_valid(flip_y)) ||
        (flip_x && flip_y) ||
        (-EINVAL == is_offset_valid(offset_x)) ||
        (-EINVAL == is_offset_valid(offset_y)))
    {
        ret = -EINVAL;
        goto out;
    }

    DBG_PRINTF_INFO("%s %d: rotate %d flip_x %d flip_y %d\n", __func__, __LINE__, rotate, flip_x, flip_y);
    rotate_place->rotate = rotate;
    rotate_place->flip_x = flip_x;
    rotate_place->flip_y = flip_y;
    rotate_place->offset_x = offset_x;
    rotate_place->offset_y = offset_y;

    rotate_place->image_width = page_width;
    rotate_place->image_height = page_height;

    // starting a new image, clear out the global image settings
    //
    strip_width_count = 0;

    current_output_line_cnt = 0;
    switch (rotate)
    {
        case 0:
            if (offset_x + page_width > rotate_place->out_total_width)
            {
                ret = INVALID_DST_WIDTH;
                goto out;
            }

            if (offset_y + page_height > rotate_place->out_total_height)
            {
                ret = INVALID_DST_HEIGHT;
                goto out;
            }
            break;

        case 90:
            // setup where the image will start in the output buffers
            if (flip_x)
            {
                strip_width_count = 0;
            } else
            {
                strip_width_count = rotate_place->image_height;
            }

            // The rotated image is page_height pixels wide on the page; this
            // mirrors the check already made for 270 and keeps the output
            // line from exceeding the page width.
            if (offset_x + page_height > rotate_place->out_total_width)
            {
                ret = INVALID_DST_HEIGHT;
                DBG_PRINTF_ERR("%s %d: offset_x %d page_height %d width %d\n", __func__, __LINE__, offset_x, page_height, rotate_place->out_total_width);
                goto out;
            }

            if (offset_y + page_width > rotate_place->out_total_height)
            {
                ret = INVALID_DST_WIDTH;
                DBG_PRINTF_ERR("%s %d: offset_y %d page_width %d height %d\n", __func__, __LINE__, offset_y, page_width, rotate_place->out_total_height);
                goto out;
            }
            break;
        case 180:

            if (offset_x + page_width > rotate_place->out_total_width)
            {
                ret = INVALID_DST_WIDTH;
                goto out;
            }

            if (offset_y + page_height > rotate_place->out_total_height)
            {
                ret = INVALID_DST_HEIGHT;
                goto out;
            }
            if (!flip_x)
            {
                strip_width_count = rotate_place->image_width;
            } else
            {
                strip_width_count = 0;
            }
            break;
        case 270:
            if (flip_x)
            {
                strip_width_count = rotate_place->image_height;
            } else
            {
                strip_width_count = 0;
            }

            if (offset_x + page_height > rotate_place->out_total_width)
            {
                ret = INVALID_DST_HEIGHT;
                goto out;
            }

            if (offset_y + page_width > rotate_place->out_total_height)
            {
                ret = INVALID_DST_WIDTH;
                goto out;
            }

            break;
        default:
            ret = INVALID_ROTATION;
            goto out;
    }

    rotate_place->dst_hw_addr_list = (uint32_t *)
        MEM_MALLOC(sizeof(uint32_t) * rotate_place->num_strips);
    if (!rotate_place->dst_hw_addr_list)
    {
        // Without this check the loop below would write through NULL.
        DBG_PRINTF_ERR("%s %d: Memory allocation failed\n", __func__, __LINE__);
        ret = -ENOMEM;
        goto out;
    }

    for (i = 0; i < rotate_place->num_strips; i++)
    {
        hw_addr = dma_buffer_map_single(
            rotate_place->out_buf_strip[i],
            DMA_FROM_DEVICE);
        // The list stores 32-bit addresses, so the mapped address is narrowed here.
        rotate_place->dst_hw_addr_list[i] = (uint32_t)hw_addr;
        DBG_PRINTF_INFO("%s %d: rotate_place->dst_hw_addr_list[k]=0x%x strip# %d\n", __func__, __LINE__, rotate_place->dst_hw_addr_list[i], i);
    }
    DBG_PRINTF_INFO("%s %d: offset_x = %d\n", __func__, __LINE__, offset_x);
    DBG_PRINTF_INFO("%s %d: offset_y = %d\n", __func__, __LINE__, offset_y);
    DBG_PRINTF_INFO("%s %d: page_width = %d\n",
                    __func__, __LINE__, page_width);
    DBG_PRINTF_INFO("%s %d: page_height = %d\n",
                    __func__, __LINE__, page_height);
    DBG_PRINTF_INFO("%s %d: rotate_place->out_total_width = %d\n",
                    __func__, __LINE__, rotate_place->out_total_width);
    DBG_PRINTF_INFO("%s %d: rotate_place->out_total_height = %d\n",
                    __func__, __LINE__, rotate_place->out_total_height);


out:
    DBG_PRINTF_INFO("%s %d: Entry\n", __func__, __LINE__);
    return ret;
}
/**
 * \brief Make desc[cur_desc_cnt] usable and bump the descriptor count.
 *
 * When cur_desc_cnt is at or beyond *num_desc, the slot is pointed at the
 * saved descriptor rotate_save_desc[desc_num], which is zeroed and given its
 * saved address as this_hw_addr.  No new memory is allocated here: every
 * slot filled for the same desc_num refers to the same saved descriptor.
 * *num_desc is incremented on every call, whether or not the slot was filled.
 *
 * \param num_desc     In/out count of descriptors.
 * \param cur_desc_cnt Index of the slot being requested.
 * \param desc         Descriptor pointer table to fill.
 * \param desc_num     Index into rotate_save_desc / rotate_save_addr
 *                     (ROTATE_DESC_SAVE_IUDMA or ROTATE_DESC_SAVE_OUDMA).
 */
static void allocate_new_desc(uint32_t *num_desc, uint32_t cur_desc_cnt, struct rotate_udma_desc_s *desc[], uint32_t desc_num)
{
    if (cur_desc_cnt >= *num_desc)
    {
        struct rotate_udma_desc_s *saved = rotate_save_desc[desc_num];

        desc[cur_desc_cnt] = saved;
        ASSERT(desc[cur_desc_cnt] != NULL);

        memset(saved, 0, sizeof(*saved));
        saved->this_hw_addr = rotate_save_addr[desc_num];
    }

    (*num_desc)++;
}

/**
 * \brief Program the rotate hardware, start both UDMAs and wait for completion.
 *
 * Configures the rotate core, the input DMA and the output DMA, starts each
 * UDMA by writing the hardware address of its first descriptor to UDR, then
 * blocks on interrupt_sema (posted by notify_oudma_interrupt()) for at most
 * 4 seconds.
 *
 * \param rotate_place          Session state.
 * \param internal_strip_height Tile height passed to both DMA configurations.
 * \param input_column_width    Tile width in pixels passed to both DMA configurations.
 * \param total_columns         Passed to config_odma_core_hw() as total_width.
 * \param strip_height          Passed to config_idma_core_hw() as strip_height.
 *
 * \return 0 when the completion semaphore was posted, including when the
 *         interrupt reported a cause other than INT_COMPLETE (that is only
 *         logged); -1 when the core configuration is rejected, the clock
 *         cannot be read, or the wait fails or times out.
 */
int execute_the_chain(struct rotate_and_place_s *rotate_place, uint32_t internal_strip_height, uint32_t input_column_width, uint32_t total_columns , uint32_t strip_height)
{
    struct timespec ts;
    int err;
    //
    // configure the hardware.
    //
    DBG_PRINTF_INFO("execute - strip_height %d input_column_width %d\n", internal_strip_height, input_column_width);
    if (config_rotate_core_hw(rotate_place) != 0)
    {
        DBG_PRINTF_INFO("Error return\n");
        return -1;      // fail setting bad.
    }

    config_idma_core_hw(rotate_place, internal_strip_height, input_column_width, strip_height);

    config_odma_core_hw(rotate_place, internal_strip_height, input_column_width, total_columns);


    if (DBG_WOULD_PRINTF(LOG_DEBUG)) 
    {
        reg32_dump_named((char *)rotate_place->rotate_device.rotate_core_regs, sizeof(ROTATE_CORE_REGS_t), "rotate core regs");

        reg32_dump_named((char *)rotate_place->rotate_device.rotate_iudma_regs,
                         sizeof(ROTATE_IDMA_ROTATE_IDMA_CORE_REGS_t), "rotate_iudma_core_reg1");

        reg32_dump_named((char *)rotate_place->rotate_device.rotate_idma_core_regs,
                         sizeof(ROTATE_IDMA_ROTATE_IDMA_CORE_REGS_t), "rotate_idma_core_reg");

        reg32_dump_named((char *)rotate_place->rotate_device.rotate_oudma_regs,
                         sizeof(ROTATE_IDMA_ROTATE_IDMA_CORE_REGS_t), "rotate_oudma_core_reg1");
        reg32_dump_named((char *)rotate_place->rotate_device.rotate_odma_core_regs,
                         sizeof(ROTATE_ODMA_ROTATE_ODMA_CORE_REGS_t), "rotate_odma_core_reg");
    }

    // Cleared before the DMAs start so that only this transfer's cause is seen.
    got_interrupt = 0;
    // start the idma
    rotate_place->rotate_device.rotate_iudma_regs->UDR = (uint32_t)iudma_desc[0]->this_hw_addr;
    // start the odma
    rotate_place->rotate_device.rotate_oudma_regs->UDR = (uint32_t)oudma_desc[0]->this_hw_addr;
    // wait for the interrupt
    // The #if below is good for debugging.  Set to 0 and if the dma does not finish, it will hang.
    #if 1   
    if (clock_gettime(CLOCK_REALTIME, &ts) == -1)
    {
        perror("clock_gettime");
        DBG_PRINTF_INFO("Error1 return\n");
        return -1;
    }
    ts.tv_sec += 4;         // 4 second timeout, as an absolute deadline

    //
    // Wait for the interrupt.  A signal can interrupt the wait (EINTR); that
    // is not a timeout, so retry.  The deadline is absolute, so retrying
    // never extends the total wait.
    //
    do
    {
        err = sem_timedwait(&interrupt_sema, &ts);
    } while (err == -1 && errno == EINTR);

    if (err == -1)
    {
        DBG_PRINTF_ERR("Error, semaphore timed out\n");
        DBG_PRINTF_INFO("Error2 return %d\n", err);

        reg32_dump_named((char *)rotate_place->rotate_device.rotate_iudma_regs,
                         sizeof(ROTATE_IDMA_ROTATE_IDMA_CORE_REGS_t), "rotate_iudma_core_reg1");

        reg32_dump_named((char *)rotate_place->rotate_device.rotate_idma_core_regs,
                         sizeof(ROTATE_IDMA_ROTATE_IDMA_CORE_REGS_t), "rotate_idma_core_reg");

        reg32_dump_named((char *)rotate_place->rotate_device.rotate_oudma_regs,
                         sizeof(ROTATE_IDMA_ROTATE_IDMA_CORE_REGS_t), "rotate_oudma_core_reg1");
        reg32_dump_named((char *)rotate_place->rotate_device.rotate_odma_core_regs,
                         sizeof(ROTATE_ODMA_ROTATE_ODMA_CORE_REGS_t), "rotate_odma_core_reg");

        DBG_PRINTF_INFO("&&&&&&&&&&&&&&&&&&&&&end of error\n");
        return -1;
    }
    #else
    err = sem_wait(&interrupt_sema);
    #endif
    //
    // if got_interrupt is non-zero, we had an error that needs to be reported.
    // Currently only reports length errors.
    // NOTE(review): the error is logged but the function still returns 0.
    //
    if (got_interrupt != 0)
    {
        DBG_PRINTF_ERR("odma interrupt length error\n");
    }
    return 0;
}
/**
 * @brief take an input strip and rotate and place it in the 
 *        output.
 * This takes the input strip and, if the strip has too many
 * lines, breaks it into smaller strips.  Then, for each of the
 * smaller strips, descriptors are made and run for each
 * tile in the strip.  Repeat until the input strip is complete.
 * 
 * 
 * @return int 
 * @retval 0 successful 
 * @retval -1 failure. 
 */
int rotate_and_place_add_strip(struct rotate_and_place_s *rotate_place, uint32_t strip_height,
                               struct BigBuffer_s *in_buf_strip)
{
    uint32_t input_tile_width, input_column_width;
    uint32_t calc_input_tile_width;  // this is the tile width we calculated
    uint32_t cur_idma_desc_cnt;
    uint32_t cur_odma_desc_cnt;
    uint32_t line_within_strip;     // where within the output strip the line starts for output dma
    uint32_t start_strip;           // the output strip where output dma starts.
    uint32_t processed_columns;
    bool odma_strip_direction;    // true top to bottom, false bottom to top
    bool odma_row_direction;  //  true right to left, false left to right
    uint32_t internal_strip_height; // the number of input lines being processed
    uint32_t processed_input_lines;
    uint32_t tile_height;           // this is the size of our tile.
    void *bb_hw_addr;
    bool strip_complete;        // when true the current strip is complete
    int i;
    

    //
    // Take this input strip and break it up into strips, and process each strip in turn.
    // For 0 and 180 rotates this loop is run as many times as necessary to process all 
    // the input lines.  For 90 and 270, much fewer times, usually 1 or 2 times because lines 
    // fill strips.
    processed_input_lines = 0;

    while (processed_input_lines < strip_height)
    {
        DBG_PRINTF_INFO("processed %d, strip_height %d\n", processed_input_lines, strip_height);
        //
        // get the physical address of the in buffer
        //
        bb_hw_addr = dma_buffer_map_single(in_buf_strip, DMA_TO_DEVICE);
        // space to where this strip starts.
        bb_hw_addr += processed_input_lines * rotate_place->image_width * rotate_place->bpp / 8;
        //
        // now figure out how many lines to do on this pass.
        // This is the strip height we will process on this pass.
        //
        if (strip_height - processed_input_lines > rotate_place->out_strip_height)
        {
            internal_strip_height = rotate_place->out_strip_height;

        } else
        {
            internal_strip_height = strip_height - processed_input_lines;

        }
        //
        // figure the number of colums and the row and strip direction for each rotation.
        //
        DBG_PRINTF_INFO("%s %d: rotate degree %d internal_strip_height %d\n", __func__, __LINE__, rotate_place->rotate, internal_strip_height);
        switch (rotate_place->rotate)
        {
            case 0:
                // the 0 rotate input runs from top to bottom and left to right.
                // The output runs from top to bottom and left to right
                // figure out where this starts
                input_tile_width = 128;
                tile_height = MAX_LINES_PROCESSED;
                if (input_tile_width > rotate_place->image_width)
                {
                    // this is very small, use the smaller size
                    input_tile_width = rotate_place->image_width;
                }
                odma_strip_direction = true;   // top to bottom
                odma_row_direction = true;     // left to right
                if (rotate_place->flip_y)
                {
                    odma_row_direction = false;
                    tile_height = 1;        // only process a single line at a time
                }
                //
               
                if (rotate_place->flip_x)
                {
                    odma_strip_direction = false;
                    // Set up for bottom to top placement
                    internal_strip_height = (rotate_place->offset_y + rotate_place->image_height - current_output_line_cnt) % 
                        rotate_place->out_strip_height;
                } else
                {
                    // figure the number of lines of the input strip to do.
                    // for top to bottom placement

                    internal_strip_height = rotate_place->out_strip_height - (rotate_place->offset_y + current_output_line_cnt) % rotate_place->out_strip_height;
                }
                // 
                // If the internal_strip_height is 0, because of mod this is actually the out_strip_height.
                if (internal_strip_height == 0)
                {
                    internal_strip_height = rotate_place->out_strip_height;
                }
                // if the number of lines to do in the output strip is less than the 
                // number of lines left in the input strip, limit the # of lines.
                if (internal_strip_height > strip_height - processed_input_lines)
                {
                    internal_strip_height = strip_height - processed_input_lines;
                }
                break;
            case 180:
                //
                // 180 the input runs from the bottom to the top, output goes from
                // top to bottom.  When flip_x is set the output goes from bottom to top.
                // When flip_y is set the output goes from left to right.
                //
                input_tile_width = 128;
                tile_height = MAX_LINES_PROCESSED;
                if (input_tile_width > rotate_place->image_width)
                {
                    input_tile_width = rotate_place->image_width;
                }
                odma_strip_direction = false;   // bottom to top
                odma_row_direction = false;     // right to left
                                                //
                                                // this can be adjusted to do more lines, the src output address must be adjusted to go to
                                                // the last tile in the strip.
                tile_height = 1;        // only process a single line at a time for 180.  Since this is done by the hw there is no 
                                        // performance issue.
                if (rotate_place->flip_y)
                {              
                    odma_row_direction = true;
                }
                if (rotate_place->flip_x)
                {
                    odma_strip_direction = true;
                    internal_strip_height = rotate_place->out_strip_height - (rotate_place->offset_y + current_output_line_cnt) % rotate_place->out_strip_height;

                } else
                {
                    // Set up for bottom to top placement
                    internal_strip_height = (rotate_place->offset_y + rotate_place->image_height - current_output_line_cnt) % 
                        rotate_place->out_strip_height;
                    DBG_PRINTF_INFO("flip_x false internal_strip_height %d\n", internal_strip_height);
                }
                // 
                // If the internal_strip_height is 0, because of mod this is actually the out_strip_height.
                if (internal_strip_height == 0)
                {
                    internal_strip_height = rotate_place->out_strip_height;
                }
                // if the number of lines to do in the output strip is less than the 
                // number of lines left in the input strip, limit the # of lines.
                if (internal_strip_height > strip_height - processed_input_lines)
                {
                    internal_strip_height = strip_height - processed_input_lines;
                }
                if (!rotate_place->flip_x)
                {

                    // for this direction must back up by a strip so we start the dma in the correct place.
                    strip_width_count -= input_tile_width;
                }
                break;
            case 90:
                //
                // For 90 the input is read bottom to top in column mode.  So it reads a tile width, then steps up
                // to the next tile and reads till a tile is complete, it is then rotated and sent out.
                // The output is processed from left to right and top to bottom.  Flip_x and flip_y 
                // swap the directions.
                DBG_PRINTF_INFO("%s %d: test\n", __func__, __LINE__);
                //
                // the rotate block has only a limited number of columns it can do, 64 bytes total.
                // limit the size of the column strip here to fit in that restriction.
                internal_strip_height = strip_height;       // the entire height must fit into the output buffer, or error.
                tile_height = MAX_LINES_PROCESSED;
                switch (rotate_place->bpp)
                {
                    case 24:
                    case 32:
                        input_tile_width = 16;
                        break;
                    case 8:
                    case 4: 
                        input_tile_width = 64;
                        break;
                    case 2:
                        input_tile_width = 128;
                        break;
                    case 1:
                        input_tile_width = 256;
                        break;
                }
                DBG_PRINTF_INFO("%s %d: input_tile_width %d\n", __func__, __LINE__, input_tile_width);
                if (input_tile_width > rotate_place->out_strip_height)
                {
                    input_tile_width = rotate_place->out_strip_height;
                }
                // setup directions based on flip x and y
                odma_strip_direction = true;        // top to bottom
                if (rotate_place->flip_y)
                {
                    odma_strip_direction = false;   // bottom to top
                }
                DBG_PRINTF_INFO("%s %d: test\n", __func__, __LINE__);
                odma_row_direction = false;
                if (rotate_place->flip_x != 0)
                {
                    odma_row_direction = true;
                } else
                {
                    // for this direction must back up by a strip so we start the dma in the correct place.
                    strip_width_count -= internal_strip_height;
                }
                if (strip_width_count < 0)
                {
                    DBG_PRINTF_INFO("%s %d: strip_width_count %d\n", __func__, __LINE__, strip_width_count);
                    DBG_PRINTF_ERR("problem with width count\n");
                    return -1;
                }
                break;
            case 270:
                //
                // the rotate block has only a limited number of colums it can do, 64 bytes total.
                // limit the size of the column strip here to fit in that restriction.
                switch (rotate_place->bpp)
                {
                    case 24:
                    case 32:
                        input_tile_width = 16;
                        break;
                    case 8:
                    case 4:
                        input_tile_width = 64;
                        break;
                    case 2:
                        input_tile_width = 128;
                        break;
                    case 1:
                        input_tile_width = 256;
                        break;
                }
                // make sure the number of columns is within correct range.
                if (input_tile_width > rotate_place->out_strip_height)
                {
                    input_tile_width = rotate_place->out_strip_height;
                }
                internal_strip_height = strip_height;       // the entire height must fit into the output buffer, or error.
                tile_height = MAX_LINES_PROCESSED;
                // now set up the direction stuff
                odma_strip_direction = false;
                if (rotate_place->flip_y)
                {
                    odma_strip_direction = true;
                }
                odma_row_direction = true;
                if (rotate_place->flip_x)
                {
                    odma_row_direction = false;
                    strip_width_count -= internal_strip_height;
                }
                break;
        }
        calc_input_tile_width = input_tile_width;

        DBG_PRINTF_INFO("$$$$$$$$$$$$$$$ internal_strip_height %d \n", internal_strip_height);
        //
        // if the tile height is more than the strip height, only do a tile height amount.
        //
        if (tile_height > internal_strip_height)
        {
            tile_height = internal_strip_height;
        }
        DBG_PRINTF_INFO("%s %d: image_width %d out_strip_height %d\n", __func__, __LINE__, 
                        rotate_place->image_width, rotate_place->out_strip_height);
        DBG_PRINTF_INFO("%s %d: strip_width_count %d \n", __func__, __LINE__, strip_width_count);
        //
        // for each strip, setup the xfer to use the hw tile and swath features.
        // This loop is run a single time for each pass of the outer loop for
        // 0 and 180 rotates.  For 90 and 270 rotations this is run many times
        // for each pass of the outer loop. 
        // One time for each output strip that is touched by an input
        // strip
        //
        strip_complete = false;
        processed_columns = 0;

        while (!strip_complete)
        {
            input_tile_width = calc_input_tile_width;
            // default values for the mask and or values.
            rotate_place->mask = 0xffffffff;  // the mask value for the rotate block
            rotate_place->or_value = 0;       // the or value for the rotate block
            DBG_PRINTF_INFO("strip direction %d row direction %d\n", odma_strip_direction, odma_row_direction);
            DBG_PRINTF_INFO("input hw address 0x%x\n", bb_hw_addr);
            //
            // setting these to zero means we will re-use old descriptors.
            //
            cur_odma_desc_cnt = 0;
            cur_idma_desc_cnt = 0;

            allocate_new_desc(&num_idma_desc, cur_idma_desc_cnt, iudma_desc, ROTATE_DESC_SAVE_IUDMA);
            //
            // Now setup the idma descriptor.
            // for this descriptor we want to always set it up the same, except in the case of 90 rotate.
            //
            iudma_desc[cur_idma_desc_cnt]->src = (uint32_t)(bb_hw_addr + processed_columns *  rotate_place->bpp / 8);
            iudma_desc[cur_idma_desc_cnt]->source_lo0 = iudma_desc[cur_idma_desc_cnt]->src;

            //
            // scan from bottom to top for 180 rotate.  Make changes here to the address
            if (rotate_place->rotate == 180)
            {
                bb_hw_addr = dma_buffer_map_single(in_buf_strip, DMA_TO_DEVICE);
                iudma_desc[cur_idma_desc_cnt]->source_lo0 = (uint32_t) bb_hw_addr;
                // space down to the start of the last strip to run
                iudma_desc[cur_idma_desc_cnt]->src = (uint32_t)bb_hw_addr + (processed_input_lines) * rotate_place->image_width * rotate_place->bpp / 8;
            }
            //
            // setup the buffer pointers.
            iudma_desc[cur_idma_desc_cnt]->source_hi0 = iudma_desc[cur_idma_desc_cnt]->src +
                rotate_place->image_width * strip_height * rotate_place->bpp / 8;
            // setup the control pointers.
            iudma_desc[cur_idma_desc_cnt]->control =  (1 << 24) | 3;
            if (rotate_place->rotate == 90)
            {
                // space down to the start of the last line in the strip.
                // for 90 we read from the bottom to the top so we need to point
                // to the bottom of the data to xfer.
                DBG_PRINTF_INFO("90 rotate bpp %d height %d width %d\n", rotate_place->bpp, rotate_place->image_height, rotate_place->image_width);
                iudma_desc[cur_idma_desc_cnt]->src += (rotate_place->image_width * rotate_place->bpp * (strip_height - 1)) / 8;
            }
            iudma_desc[cur_idma_desc_cnt]->length = (rotate_place->image_width * rotate_place->bpp * (strip_height)) / 8;
            // we will finish the idma descriptor later.  The length field may change because
            // of the number of columns we may process
            //
            // now do the odma descriptor.  allocate_new_desc, clears out the descriptor we want to use.
            //
            allocate_new_desc(&num_odma_desc, cur_odma_desc_cnt, oudma_desc, ROTATE_DESC_SAVE_OUDMA);
            //
            // 0 and 180 are handled different than 90 and 270, make that switch here.
            //
            if (rotate_place->rotate == 180 || rotate_place->rotate == 0)
            {
                // setup the odma descriptor for 0 and 180
                // figure the number of columns to do this time.
                input_column_width = rotate_place->image_width - processed_columns;  // this should be the entire width of the 
                                // input image.
                strip_complete = true;  // this is the last tile since we will do the entire strip on this loop.
                                        // build the descriptors for 180 or 0. based on the direction of the
                                        // dma xfers.
                                        //
                DBG_PRINTF_INFO("strip_cirection %d\n", odma_strip_direction);
                DBG_PRINTF_INFO("current_output_line_cnt %d\n", current_output_line_cnt);
                if (odma_strip_direction)
                {
                    // this takes strips from the top to bottom of the page
                    start_strip = (current_output_line_cnt + rotate_place->offset_y) / // this is current index
                        rotate_place->out_strip_height;
                    line_within_strip = (current_output_line_cnt + rotate_place->offset_y) % // this is current index
                        rotate_place->out_strip_height;
                } else
                {

                    // setup for strips from the bottom to top of page.
                    start_strip = (rotate_place->offset_y + rotate_place->image_height - // very end of the image 
                                   current_output_line_cnt -tile_height) / // this is current index, back up by a tile height to insure
                                                                           // we start at the correct place.
                        rotate_place->out_strip_height;                    // divided by the strip height.
                    line_within_strip = (rotate_place->offset_y + rotate_place->image_height -
                                         current_output_line_cnt - tile_height) % // this is current index
                        rotate_place->out_strip_height;
                    DBG_PRINTF_INFO("internal_strip_height %d\n", internal_strip_height);
                }
                // get the hw address for the source
                oudma_desc[cur_odma_desc_cnt]->src = (uint32_t)rotate_place->dst_hw_addr_list[start_strip];
                // initialize the low and high address.
                oudma_desc[cur_odma_desc_cnt]->source_lo0 = oudma_desc[cur_odma_desc_cnt]->src;
                oudma_desc[cur_odma_desc_cnt]->source_hi0 = oudma_desc[cur_odma_desc_cnt]->src;
                // now increment to the line we want to use
                oudma_desc[cur_odma_desc_cnt]->src += (rotate_place->out_total_width * rotate_place->bpp * line_within_strip) / 8;
                // adjust for the offset_x setting
                oudma_desc[cur_odma_desc_cnt]->src += (rotate_place->offset_x * rotate_place->bpp) / 8;  // add in the offset x value
                                                                                                         //
                                                                                                         // now based on the direction we want to process the rows, move the source
                                                                                                         //
                if (!odma_row_direction)
                {
                    DBG_PRINTF_INFO("image_width %d processed_col %d input_tile_width %d\n", rotate_place->image_width, processed_columns,
                           input_tile_width);
                    oudma_desc[cur_odma_desc_cnt]->src += ((rotate_place->image_width - processed_columns - input_tile_width) *
                                                           rotate_place->bpp) / 8;  // add in x column offset due to already processed columns.
                } else
                {

                    oudma_desc[cur_odma_desc_cnt]->src += (processed_columns *
                                                           rotate_place->bpp) / 8;  // add in x column offset due to already processed columns.
                }
                // setup the lo0 and hi0 to span the entire buffer we will be accessing in dma.
                oudma_desc[cur_odma_desc_cnt]->source_hi0 += (rotate_place->out_strip_height * rotate_place->out_total_width * rotate_place->bpp) / 8;


                // do the lengths to xfer.
                oudma_desc[cur_odma_desc_cnt]->length = input_column_width * internal_strip_height * rotate_place->bpp / 8;
                iudma_desc[cur_idma_desc_cnt]->length = input_column_width * internal_strip_height * rotate_place->bpp / 8;
                if (rotate_place->flip_x && rotate_place->rotate == 0)
                {
                    // point to the beginning of the last line in the strip
                    oudma_desc[cur_odma_desc_cnt]->src += (internal_strip_height - 1) * rotate_place->out_total_width * rotate_place->bpp / 8;
                    tile_height = 1;        // only do one at a time
                }
            } else
            {
                // this section sets up the output descriptors for 90 and 270 rotations.
                //
                // Figure the start output strip number and the line within the strip to start.
                // depends on the direction we are running.
                // input_column_width is the number of input columns to process.  For these 2 rotations it is also
                // the number of lines in the output buffer.
                //
                if (odma_strip_direction)
                {
                    // figure the number of columns to do.
                    input_column_width = rotate_place->out_strip_height - ((rotate_place->offset_y + processed_columns) % rotate_place->out_strip_height);
                    DBG_PRINTF_INFO("input_column_width %d\n",input_column_width);
                    // check for the last time through and fix
                    if (input_column_width >= (rotate_place->image_width - processed_columns))
                    {
                        input_column_width = rotate_place->image_width - processed_columns;
                        strip_complete = true;      // all done, get out when this is done
                    }

                    start_strip = (processed_columns + rotate_place->offset_y) / rotate_place->out_strip_height;
                    line_within_strip = (processed_columns + rotate_place->offset_y) % // this is current index
                        rotate_place->out_strip_height;
                    DBG_PRINTF_INFO("input_tile_width %d input_column_width %d\n", input_tile_width, input_column_width);
                    //
                    // We want to process input_tile_width of input, but if we only have input_column_width's worth of columns, we must limit the amount we want to do.
                    if (input_tile_width > input_column_width)
                    {
                        input_tile_width = input_column_width;
                    }

                    if (input_column_width % input_tile_width)
                    {
                        input_column_width = (input_column_width / input_tile_width) * input_tile_width;    // limit the amount we do.
                        strip_complete = false;         // since we are cutting down the expected xfer, the strip is not complete.
                    }
                    DBG_PRINTF_INFO("input_tile_width %d input_column_width %d strip_complete %d\n", input_tile_width, input_column_width, strip_complete);

                } else
                {
                    // figure the number of columns to do of the input, which is number of lines in the output.
                    input_column_width = ((rotate_place->offset_y - processed_columns + rotate_place->image_width/*lena fix*/) % rotate_place->out_strip_height);
                    if (!input_column_width)
                    {
                        input_column_width = rotate_place->out_strip_height;
                    }
                    DBG_PRINTF_INFO("my tmp col %d input_tile_width %d\n", input_column_width, input_tile_width); 
                    // on the last column, need to check to see what the excess is
                    if (input_column_width >= (rotate_place->image_width - processed_columns))
                    {
                        input_column_width = rotate_place->image_width - processed_columns;
                        strip_complete = true;      // all done, get out
                    }
                    // figure the start strip, since we are going from bottom to top the start strip and line must be the beginning of the block
                    // thus we take out the height of our strip, which is input_column_width to get the correct starting location.
                    start_strip = (rotate_place->image_width + rotate_place->offset_y - processed_columns - input_column_width) / rotate_place->out_strip_height;
                    line_within_strip = (rotate_place->image_width + rotate_place->offset_y - processed_columns - input_column_width) % // this is current index
                        rotate_place->out_strip_height;


                    //
                    // We want to process input_tile_width of input, but if we only have input_column_width's worth of columns, we must limit the amount we want to do.
                    if (input_tile_width > input_column_width)
                    {
                        input_tile_width = input_column_width;
                    }
                    if (input_column_width % input_tile_width)
                    {
                        // the tiles are not equal size, process the last, small strip before the others.
                        // this is done now since we want to process in order to make the code simpler.
                        //
                        int num_lines_to_skip = (input_column_width / input_tile_width) * input_tile_width;

                        input_column_width = input_column_width - num_lines_to_skip;
                        line_within_strip += num_lines_to_skip;
                        input_tile_width = input_column_width;
                        strip_complete = false;         // since we are cutting down the expected xfer, the strip is not complete.

                    }
                }
                //
                // get the hw address of the start of the buffer we want.
                oudma_desc[cur_odma_desc_cnt]->src = (uint32_t)rotate_place->dst_hw_addr_list[start_strip];

                oudma_desc[cur_odma_desc_cnt]->source_lo0 = oudma_desc[cur_odma_desc_cnt]->src;
                //
                //  now space to the correct line within the strip
                oudma_desc[cur_odma_desc_cnt]->src += (rotate_place->out_total_width * rotate_place->bpp * line_within_strip) / 8;

                //
                // space down based on what has been done before, our new starting point
                oudma_desc[cur_odma_desc_cnt]->src += (strip_width_count * rotate_place->bpp) / 8;
                //
                // adjust for the offset_x setting
                oudma_desc[cur_odma_desc_cnt]->src += (rotate_place->offset_x * rotate_place->bpp) / 8;  // add in the offset x value
                                                                                                         //
                                                                                                         // setup lo0 and hi0 to spac our output buffer.
                                                                                                         //
                oudma_desc[cur_odma_desc_cnt]->source_hi0 = oudma_desc[cur_odma_desc_cnt]->src;
                DBG_PRINTF_INFO("colums %d total width %d bpp %d\n", input_tile_width, rotate_place->out_total_width, rotate_place->bpp);

                oudma_desc[cur_odma_desc_cnt]->source_hi0 += rotate_place->out_strip_height * rotate_place->out_total_width * rotate_place->bpp / 8;


                if ((odma_row_direction && rotate_place->rotate == 90) ||
                    (!odma_row_direction && rotate_place->rotate == 270))
                {
                    // for these 2 cases we are putting data down from the right to the left.  space over
                    // to the first line in the last tile of the strip.
                    oudma_desc[cur_odma_desc_cnt]->src += (internal_strip_height - tile_height) * rotate_place->bpp / 8;
                }
                if (!odma_strip_direction)
                {
                    // for this case we need to space down to the first line of the last tile in this strip.
                    DBG_PRINTF_INFO("before add %x\n", oudma_desc[cur_odma_desc_cnt]->src);
                    oudma_desc[cur_odma_desc_cnt]->src += (input_column_width - input_tile_width) * rotate_place->out_total_width * rotate_place->bpp / 8;
                    DBG_PRINTF_INFO("int_strip_height %d tile_height %d num_col %d\n", internal_strip_height, tile_height, input_tile_width);
                    DBG_PRINTF_INFO("excess added %x\n", input_column_width * rotate_place->out_total_width * rotate_place->bpp / 8);
                }
                iudma_desc[cur_idma_desc_cnt]->length = input_column_width * internal_strip_height * rotate_place->bpp / 8;
                // do the lengths to xfer.
                oudma_desc[cur_odma_desc_cnt]->length = iudma_desc[cur_idma_desc_cnt]->length;
                DBG_PRINTF_INFO("internal_strip_height %d\n", internal_strip_height);
                // check to make sure the length is divisible by 4, if not fix so it works correctly
                if ((input_column_width * rotate_place->bpp / 8) % 4)
                {
                    //
                    // Our xfer will not end on a word boundary, change the number of bytes to make sure we xfer enough
                    // data to allow the dma's to finish.
                    iudma_desc[cur_idma_desc_cnt]->length += (4 - ((input_column_width * rotate_place->bpp / 8) % 4)) * internal_strip_height;
                    //
                    // since we don't end on a word boundary, setup the mask and replace registers to mask the end of the lines.
                    //
                    for (i = 0; i < 4 - ((input_column_width * rotate_place->bpp / 8) % 4); i++)
                    {
                        rotate_place->mask &= ~(0xff << (i * 8));
                        if (rotate_place->bpp > 8)
                        {
                            rotate_place->or_value |= 0xff << (i * 8);
                        }
                    }
                }
            }
            DBG_PRINTF_INFO("processed_columns %d\n", processed_columns);
            DBG_PRINTF_INFO("internal_strip_height %d bpp %d\n", internal_strip_height, rotate_place->bpp);
            DBG_PRINTF_INFO("start_strip %d line_within %d input_column_width %d width count %d\n", start_strip, line_within_strip, input_column_width, strip_width_count);
            if (start_strip > rotate_place->num_strips)
            {
                DBG_PRINTF_INFO("Error start strip too big start_strip %d num_strips %d\n", start_strip, rotate_place->num_strips);
                return -1;
            }
            //
            // error check.  If these values are bogus, get out
            if (oudma_desc[cur_odma_desc_cnt]->length == 0 ||
                (oudma_desc[cur_odma_desc_cnt]->src < (uint32_t)rotate_place->dst_hw_addr_list[start_strip]))
            {
                DBG_PRINTF_ERR("zero length input_column_width %d internal_strip_height %d\n", input_column_width, internal_strip_height);
                DBG_PRINTF_ERR("oudma->length %d hw->len %d\n", oudma_desc[cur_odma_desc_cnt]->length, rotate_place->out_buf_strip[start_strip]->datalen);
                DBG_PRINTF_ERR("iudma_desc->length->src %x hwaddr %x\n", oudma_desc[cur_odma_desc_cnt]->src, (uint32_t)rotate_place->dst_hw_addr_list[start_strip]);
                return -1;
            }
            //
            // If we have more than one out descriptor 
            if (cur_odma_desc_cnt)
            {
                oudma_desc[cur_odma_desc_cnt - 1]->next = (uint32_t) oudma_desc[cur_odma_desc_cnt]->this_hw_addr;
                iudma_desc[cur_idma_desc_cnt - 1]->next = (uint32_t) oudma_desc[cur_idma_desc_cnt]->this_hw_addr;
            }
            oudma_desc[cur_odma_desc_cnt]->control =  (1 << 24) | 3;    // control always the same
            if (DBG_WOULD_PRINTF(LOG_DEBUG))
            {

                reg32_dump_named((char *)iudma_desc[cur_idma_desc_cnt], sizeof(struct rotate_udma_desc_s), "in descriptor");

                reg32_dump_named((char *)oudma_desc[cur_odma_desc_cnt], sizeof(struct rotate_udma_desc_s), "out descriptor");
            }
            DBG_PRINTF_INFO("Tile height %d, num_col %d tmp_col %d\n", tile_height,input_tile_width, input_column_width);
            execute_the_chain(rotate_place, tile_height, input_tile_width, input_column_width, strip_height);

            cur_odma_desc_cnt++;
            cur_idma_desc_cnt++;
            processed_columns += input_column_width;    // finished these columns, go do the next set.
            DBG_PRINTF_INFO("%s %d: current_input_line_cnt %d\n", __func__, __LINE__, current_output_line_cnt);
        }
        current_output_line_cnt += internal_strip_height;   // finished this set of output lines
                                                            //
                                                            // based on direction and rotation, the strip width count must be incremented.
        if ((rotate_place->rotate == 270 && odma_row_direction) ||
            (rotate_place->rotate == 90 && odma_row_direction))
        {
            // if we are doing a 270 or 90 rotate then we need to increment the width count so we progress down the strip
            // strip width count is global, keeps track of where we are running
            strip_width_count += internal_strip_height;
        }
        processed_input_lines += internal_strip_height;     // processed_input_lines is local, re-inited on each input strip
        DBG_PRINTF_INFO("after add processed_input_lines %d\n", processed_input_lines);
    }
    return 0;
}

/**
 * @brief Shut down the rotate hardware and release the rotate context.
 *
 * Calls uninitialize_rotate_hardware(), clears the UCR register of the input
 * and output UDMA blocks, frees the destination address list, unmaps every
 * register window that is still mapped and finally frees the context.
 *
 * NOTE(review): the memory released at the end is rotate_place_singleton, not
 * the rotate_place argument; presumably callers always pass the singleton --
 * confirm.
 *
 * @param rotate_place  context to close; NULL is accepted and ignored.
 *
 * @return always NULL, so the caller can overwrite its handle with the result.
 */
struct rotate_and_place_s* rotate_and_place_close(
    struct rotate_and_place_s *rotate_place)
{
    DBG_PRINTF_INFO("%s %d: Entry\n", __func__, __LINE__);
    if (rotate_place)
    {
        struct rotate_device *device = &rotate_place->rotate_device;

        uninitialize_rotate_hardware(device);

        // The register windows are NULL-checked before unmapping below, so
        // apply the same guard before writing through them.
        if (device->rotate_iudma_regs)
        {
            device->rotate_iudma_regs->UCR = 0;
        }
        if (device->rotate_oudma_regs)
        {
            device->rotate_oudma_regs->UCR = 0;
        }
        MEM_FREE_AND_NULL(rotate_place->dst_hw_addr_list);

        // unmap each register window that is still mapped and forget it.
        if (device->rotate_odma_core_regs)
        {
            unMapMem(device->rotate_odma_core_regs,
                     sizeof(ROTATE_ODMA_ROTATE_ODMA_CORE_REGS_t));
            device->rotate_odma_core_regs = NULL;
        }
        if (device->rotate_oudma_regs)
        {
            unMapMem(device->rotate_oudma_regs,
                     sizeof(ROTATE_ODMA_ROTATE_ODMA_UDMA_REGS_t));
            device->rotate_oudma_regs = NULL;
        }
        if (device->rotate_idma_core_regs)
        {
            unMapMem(device->rotate_idma_core_regs,
                     sizeof(ROTATE_IDMA_ROTATE_IDMA_CORE_REGS_t));
            device->rotate_idma_core_regs = NULL;
        }
        if (device->rotate_iudma_regs)
        {
            unMapMem(device->rotate_iudma_regs,
                     sizeof(ROTATE_IDMA_ROTATE_IDMA_UDMA_REGS_t));
            device->rotate_iudma_regs = NULL;
        }
        if (device->rotate_core_regs)
        {
            unMapMem(device->rotate_core_regs, sizeof(ROTATE_CORE_REGS_t));
            device->rotate_core_regs = NULL;
        }

        MEM_FREE_AND_NULL(rotate_place_singleton);
        DBG_PRINTF_INFO("%s %d: Exit\n", __func__, __LINE__);
    }
    return NULL;
}


