OAG Library (OpenGL) Part 1 - Setting Up the Library for an MFC Application

Eduardo Tucci

Rate me:

4.40/5 (11 votes)

7 Aug 2011CPOL3 min read

56.3K

OAG is a library written in C++. With this library, you can create OpenGL based applications.

OAGLibrary.zip
- OAGLibrary
  - bin
    - VC9
  - Common
    - include
    - src
  - IOSystem
    - include
      - Assert.h
      - Compiler
        
        poppack1.h
        
        pstdint.h
        
        pushpack1.h
      - DefaultIOSystem.h
      - fast_atof.h
      - FileSystemFilter.h
      - IOSystem.h
      - SystemTypes.h
    - src
      - DefaultIOSystem.cpp
  - ObjectMappingTable
    - include
      - FontMappingTable.h
      - TextureMappingTable.h
    - src
      - FontMappingTable.cpp
      - TextureMappingTable.cpp
  - OpenGL
    - include
      - BMP
        
        BMPTexture.h
      - Font
        
        OAGFont.h
        
        OAGFont2D.h
        
        OAGFontLoader.h
      - FontMapping
        
        OAGFontMapping.h
        
        OAGFontMapping2D.h
      - JPG
        
        jconfig.h
        
        jinclude.h
        
        jmorecfg.h
        
        jpeglib.h
        
        JPGTexture.h
      - oag
        
        Geometry.h
        
        OAGObject.h
        
        OAGPrimitives.h
        
        OAGScene.h
        
        OAGTree.h
        
        OAGXMLData.h
      - OAGXML
        
        OAGParser.h
      - PCX
        
        PCXTexture.h
      - Texture
        
        OAGTexture.h
        
        OAGTextureLoader.h
      - TextureMapping
        
        OAGRectangleMapping.h
        
        OAGTextureMapping.h
        
        OAGTriangleMapping.h
      - TGA
        
        TGATexture.h
      - WindowsViewer
        
        WinGraphicContext.h
        
        WinRenderer.h
    - src
      - BMP
        
        BMPTexture.cpp
      - Font
        
        OAGFont.cpp
        
        OAGFont2D.cpp
        
        OAGFontLoader.cpp
      - FontMapping
        
        OAGFontMapping.cpp
        
        OAGFontMapping2D.cpp
      - JPG
        
        JPGTexture.cpp
      - oag
        
        Geometry.cpp
        
        OAGObject.cpp
        
        OAGPrimitives.cpp
        
        OAGScene.cpp
      - OAGXML
        
        OAGParser.cpp
      - PCX
        
        PCXTexture.cpp
      - Texture
        
        OAGTexture.cpp
        
        OAGTextureLoader.cpp
      - TextureMapping
        
        OAGRectangleMapping.cpp
        
        OAGTextureMapping.cpp
        
        OAGTriangleMapping.cpp
      - TGA
        
        TGATexture.cpp
      - WindowsViewer
        
        WinGraphicContext.cpp
        
        WinRenderer.cpp
  - Workspaces
    - VC9
      - Common
        
        oagCommon
        
        Debug
        
        BuildLog.htm
        
        OAGCommon.sln
        
        OAGCommon.vcproj
        
        OAGCommon.vcproj.etucci-PC.etucci.user
        
        OAGCommon.vcproj.NTHOME.Tucci.user
        
        Release
        
        BuildLog.htm
      - IOSystem
        
        Debug
        
        BuildLog.htm
        
        IOSystem.sln
        
        IOSystem.suo
        
        IOSystem.vcproj
        
        IOSystem.vcproj.etucci-PC.etucci.user
        
        IOSystem.vcproj.NTHOME.Tucci.user
        
        Release
        
        BuildLog.htm
      - ObjectMappingTable
        
        Debug
        
        BuildLog.htm
        
        ObjectMappingTable.vcproj
        
        ObjectMappingTable.vcproj.etucci-PC.etucci.user
        
        ObjectMappingTable.vcproj.NTHOME.Tucci.user
        
        Release
        
        BuildLog.htm
      - OpenGL.sln
      - OpenGL.suo
      - OpenGL.vcproj
      - OpenGL
        
        BMP
        
        BMP.vcproj
        
        BMP.vcproj.etucci-PC.etucci.user
        
        BMP.vcproj.NTHOME.Tucci.user
        
        Debug
        
        BuildLog.htm
        
        Release
        
        BuildLog.htm
        
        Font
        
        Debug
        
        BuildLog.htm
        
        Font.sln
        
        Font.vcproj
        
        Font.vcproj.etucci-PC.etucci.user
        
        Font.vcproj.NTHOME.Tucci.user
        
        Release
        
        BuildLog.htm
        
        FontMapping
        
        Debug
        
        BuildLog.htm
        
        FontMapping.vcproj
        
        FontMapping.vcproj.NTHOME.Tucci.user
        
        Release
        
        BuildLog.htm
        
        JPG
        
        Debug
        
        BuildLog.htm
        
        JPG.vcproj
        
        JPG.vcproj.etucci-PC.etucci.user
        
        JPG.vcproj.NTHOME.Tucci.user
        
        Release
        
        BuildLog.htm
        
        oag
        
        Debug
        
        BuildLog.htm
        
        oag.sln
        
        oag.vcproj
        
        oag.vcproj.etucci-PC.etucci.user
        
        oag.vcproj.NTHOME.Tucci.user
        
        Release
        
        BuildLog.htm
        
        OAGXML
        
        Debug
        
        BuildLog.htm
        
        OAGXML.vcproj
        
        OAGXML.vcproj.etucci-PC.etucci.user
        
        OAGXML.vcproj.NTHOME.Tucci.user
        
        ReadMe.txt
        
        Release
        
        BuildLog.htm
        
        PCX
        
        Debug
        
        BuildLog.htm
        
        PCX.vcproj
        
        PCX.vcproj.etucci-PC.etucci.user
        
        PCX.vcproj.NTHOME.Tucci.user
        
        Release
        
        BuildLog.htm
        
        Texture
        
        Debug
        
        BuildLog.htm
        
        Release
        
        BuildLog.htm
        
        Texture.vcproj
        
        Texture.vcproj.etucci-PC.etucci.user
        
        Texture.vcproj.NTHOME.Tucci.user
        
        TextureMapping
        
        Debug
        
        BuildLog.htm
        
        Release
        
        BuildLog.htm
        
        TextureMapping.suo
        
        TextureMapping.vcproj
        
        TextureMapping.vcproj.etucci-PC.etucci.user
        
        TextureMapping.vcproj.NTHOME.Tucci.user
        
        TGA
        
        Debug
        
        BuildLog.htm
        
        Release
        
        BuildLog.htm
        
        TGA.sln
        
        TGA.vcproj
        
        TGA.vcproj.etucci-PC.etucci.user
        
        TGA.vcproj.NTHOME.Tucci.user
        
        WindowsViewer
        
        Debug
        
        BuildLog.htm
        
        Release
        
        BuildLog.htm
        
        WindowsViewer.sln
        
        WindowsViewer.vcproj
        
        WindowsViewer.vcproj.etucci-PC.etucci.user
        
        WindowsViewer.vcproj.NTHOME.Tucci.user
readme.zip
- readme.rtf
rendergraphicmfc.zip
- RenderGraphicMFC
  - RenderGraphicMFC.sln
  - RenderGraphicMFC
    - MainFrm.cpp
    - MainFrm.h
    - ReadMe.txt
    - RenderGraphicMFC.cpp
    - RenderGraphicMFC.h
    - RenderGraphicMFC.rc
    - RenderGraphicMFC.vcproj
    - RenderGraphicMFCDoc.cpp
    - RenderGraphicMFCDoc.h
    - RenderGraphicMFCView.cpp
    - RenderGraphicMFCView.h
    - res
      - RenderGraphicMFC.ico
      - RenderGraphicMFC.rc2
      - RenderGraphicMFCDoc.ico
      - Toolbar.bmp
      - Toolbar256.bmp
    - Resource.h
    - stdafx.cpp
    - stdafx.h
    - targetver.h
    - UserImages.bmp
freetype-2311.zip
- freetype-2.3.11
  - autogen.sh
  - builds
    - amiga
      - include
        
        freetype
        
        config
        
        ftconfig.h
        
        ftmodule.h
      - makefile
      - makefile.os4
      - README
      - smakefile
      - src
        
        base
        
        ftdebug.c
        
        ftsystem.c
    - ansi
      - ansi.mk
      - ansi-def.mk
    - atari
      - ATARI.H
      - deflinejoiner.awk
      - FNames.SIC
      - FREETYPE.PRJ
      - gen-purec-patch.sh
      - README.TXT
    - beos
      - beos.mk
      - beos-def.mk
      - detect.mk
    - compiler
      - ansi-cc.mk
      - bcc.mk
      - bcc-dev.mk
      - emx.mk
      - gcc.mk
      - gcc-dev.mk
      - intelc.mk
      - unix-lcc.mk
      - visualage.mk
      - visualc.mk
      - watcom.mk
      - win-lcc.mk
    - detect.mk
    - dos
      - detect.mk
      - dos-def.mk
      - dos-emx.mk
      - dos-gcc.mk
      - dos-wat.mk
    - exports.mk
    - freetype.mk
    - link_dos.mk
    - link_std.mk
    - mac
      - ascii2mpw.py
      - FreeType.m68k_cfm.make.txt
      - FreeType.m68k_far.make.txt
      - FreeType.ppc_carbon.make.txt
      - FreeType.ppc_classic.make.txt
      - ftlib.prj.xml
      - ftmac.c
      - README
    - modules.mk
    - newline
    - os2
      - detect.mk
      - os2-def.mk
      - os2-dev.mk
      - os2-gcc.mk
    - symbian
      - bld.inf
      - freetype.mmp
    - toplevel.mk
    - unix
      - aclocal.m4
      - config.guess
      - config.sub
      - configure
      - configure.ac
      - configure.raw
      - detect.mk
      - freetype2.in
      - freetype2.m4
      - freetype-config.in
      - ft2unix.h
      - ftconfig.in
      - ft-munmap.m4
      - ftsystem.c
      - install.mk
      - install-sh
      - ltmain.sh
      - mkinstalldirs
      - unix.mk
      - unix-cc.in
      - unixddef.mk
      - unix-def.in
      - unix-dev.mk
      - unix-lcc.mk
    - vms
      - ftconfig.h
      - ftsystem.c
    - win32
      - detect.mk
      - ftdebug.c
      - vc2005
        
        freetype.sln
        
        freetype.vcproj
        
        index.html
      - vc2008
        
        freetype.sln
        
        freetype.vcproj
        
        index.html
      - visualc
        
        freetype.dsp
        
        freetype.dsw
        
        freetype.sln
        
        freetype.vcproj
        
        index.html
      - visualce
        
        freetype.dsp
        
        freetype.dsw
        
        freetype.vcproj
        
        index.html
      - w32-bcc.mk
      - w32-bccd.mk
      - w32-dev.mk
      - w32-gcc.mk
      - w32-icc.mk
      - w32-intl.mk
      - w32-lcc.mk
      - w32-mingw32.mk
      - w32-vcc.mk
      - w32-wat.mk
      - win32-def.mk
    - wince
      - ftdebug.c
      - vc2005-ce
        
        freetype.sln
        
        freetype.vcproj
        
        index.html
      - vc2008-ce
        
        freetype.sln
        
        freetype.vcproj
        
        index.html
  - ChangeLog
  - ChangeLog.20
  - ChangeLog.21
  - ChangeLog.22
  - configure
  - devel
    - ft2build.h
    - ftoption.h
  - docs
    - CHANGES
    - CUSTOMIZE
    - DEBUG
    - formats.txt
    - FTL.TXT
    - GPL.TXT
    - INSTALL
    - INSTALL.ANY
    - INSTALL.CROSS
    - INSTALL.GNU
    - INSTALL.MAC
    - INSTALL.UNIX
    - INSTALL.VMS
    - LICENSE.TXT
    - MAKEPP
    - PATENTS
    - PROBLEMS
    - raster.txt
    - reference
      - ft2-base_interface.html
      - ft2-basic_types.html
      - ft2-bdf_fonts.html
      - ft2-bitmap_handling.html
      - ft2-cache_subsystem.html
      - ft2-cid_fonts.html
      - ft2-computations.html
      - ft2-font_formats.html
      - ft2-gasp_table.html
      - ft2-glyph_management.html
      - ft2-glyph_stroker.html
      - ft2-glyph_variants.html
      - ft2-gx_validation.html
      - ft2-gzip.html
      - ft2-header_file_macros.html
      - ft2-incremental.html
      - ft2-index.html
      - ft2-lcd_filtering.html
      - ft2-list_processing.html
      - ft2-lzw.html
      - ft2-mac_specific.html
      - ft2-module_management.html
      - ft2-multiple_masters.html
      - ft2-ot_validation.html
      - ft2-outline_processing.html
      - ft2-pfr_fonts.html
      - ft2-quick_advance.html
      - ft2-raster.html
      - ft2-sfnt_names.html
      - ft2-sizes_management.html
      - ft2-system_interface.html
      - ft2-toc.html
      - ft2-truetype_engine.html
      - ft2-truetype_tables.html
      - ft2-type1_tables.html
      - ft2-user_allocation.html
      - ft2-version.html
      - ft2-winfnt_fonts.html
      - README
    - release
    - TODO
    - TRUETYPE
    - UPGRADE.UNIX
    - VERSION.DLL
  - include
    - freetype
      - config
        
        ftconfig.h
        
        ftheader.h
        
        ftmodule.h
        
        ftoption.h
        
        ftstdlib.h
      - freetype.h
      - ftadvanc.h
      - ftbbox.h
      - ftbdf.h
      - ftbitmap.h
      - ftcache.h
      - ftchapters.h
      - ftcid.h
      - fterrdef.h
      - fterrors.h
      - ftgasp.h
      - ftglyph.h
      - ftgxval.h
      - ftgzip.h
      - ftimage.h
      - ftincrem.h
      - ftlcdfil.h
      - ftlist.h
      - ftlzw.h
      - ftmac.h
      - ftmm.h
      - ftmodapi.h
      - ftmoderr.h
      - ftotval.h
      - ftoutln.h
      - ftpfr.h
      - ftrender.h
      - ftsizes.h
      - ftsnames.h
      - ftstroke.h
      - ftsynth.h
      - ftsystem.h
      - fttrigon.h
      - fttypes.h
      - ftwinfnt.h
      - ftxf86.h
      - internal
        
        autohint.h
        
        ftcalc.h
        
        ftdebug.h
        
        ftdriver.h
        
        ftgloadr.h
        
        ftmemory.h
        
        ftobjs.h
        
        ftpic.h
        
        ftrfork.h
        
        ftserv.h
        
        ftstream.h
        
        fttrace.h
        
        ftvalid.h
        
        internal.h
        
        pcftypes.h
        
        psaux.h
        
        pshints.h
        
        services
        
        svbdf.h
        
        svcid.h
        
        svgldict.h
        
        svgxval.h
        
        svkern.h
        
        svmm.h
        
        svotval.h
        
        svpfr.h
        
        svpostnm.h
        
        svpscmap.h
        
        svpsinfo.h
        
        svsfnt.h
        
        svttcmap.h
        
        svtteng.h
        
        svttglyf.h
        
        svwinfnt.h
        
        svxf86nm.h
        
        sfnt.h
        
        t1types.h
        
        tttypes.h
      - t1tables.h
      - ttnameid.h
      - tttables.h
      - tttags.h
      - ttunpat.h
    - ft2build.h
  - Jamfile
  - Jamrules
  - Makefile
  - modules.cfg
  - objs
    - README
  - README
  - README.git
  - src
    - autofit
      - afangles.c
      - afangles.h
      - afcjk.c
      - afcjk.h
      - afdummy.c
      - afdummy.h
      - aferrors.h
      - afglobal.c
      - afglobal.h
      - afhints.c
      - afhints.h
      - afindic.c
      - afindic.h
      - aflatin.c
      - aflatin.h
      - aflatin2.c
      - aflatin2.h
      - afloader.c
      - afloader.h
      - afmodule.c
      - afmodule.h
      - afpic.c
      - afpic.h
      - aftypes.h
      - afwarp.c
      - afwarp.h
      - autofit.c
      - Jamfile
      - module.mk
      - rules.mk
    - base
      - basepic.c
      - basepic.h
      - ftadvanc.c
      - ftapi.c
      - ftbase.c
      - ftbase.h
      - ftbbox.c
      - ftbdf.c
      - ftbitmap.c
      - ftcalc.c
      - ftcid.c
      - ftdbgmem.c
      - ftdebug.c
      - ftfstype.c
      - ftgasp.c
      - ftgloadr.c
      - ftglyph.c
      - ftgxval.c
      - ftinit.c
      - ftlcdfil.c
      - ftmac.c
      - ftmm.c
      - ftobjs.c
      - ftotval.c
      - ftoutln.c
      - ftpatent.c
      - ftpfr.c
      - ftpic.c
      - ftrfork.c
      - ftsnames.c
      - ftstream.c
      - ftstroke.c
      - ftsynth.c
      - ftsystem.c
      - fttrigon.c
      - fttype1.c
      - ftutil.c
      - ftwinfnt.c
      - ftxf86.c
      - Jamfile
      - rules.mk
    - bdf
      - bdf.c
      - bdf.h
      - bdfdrivr.c
      - bdfdrivr.h
      - bdferror.h
      - bdflib.c
      - Jamfile
      - module.mk
      - README
      - rules.mk
    - cache
      - ftcache.c
      - ftcbasic.c
      - ftccache.c
      - ftccache.h
      - ftccback.h
      - ftccmap.c
      - ftcerror.h
      - ftcglyph.c
      - ftcglyph.h
      - ftcimage.c
      - ftcimage.h
      - ftcmanag.c
      - ftcmanag.h
      - ftcmru.c
      - ftcmru.h
      - ftcsbits.c
      - ftcsbits.h
      - Jamfile
      - rules.mk
    - cff
      - cff.c
      - cffcmap.c
      - cffcmap.h
      - cffdrivr.c
      - cffdrivr.h
      - cfferrs.h
      - cffgload.c
      - cffgload.h
      - cffload.c
      - cffload.h
      - cffobjs.c
      - cffobjs.h
      - cffparse.c
      - cffparse.h
      - cffpic.c
      - cffpic.h
      - cfftoken.h
      - cfftypes.h
      - Jamfile
      - module.mk
      - rules.mk
    - cid
      - ciderrs.h
      - cidgload.c
      - cidgload.h
      - cidload.c
      - cidload.h
      - cidobjs.c
      - cidobjs.h
      - cidparse.c
      - cidparse.h
      - cidriver.c
      - cidriver.h
      - cidtoken.h
      - Jamfile
      - module.mk
      - rules.mk
      - type1cid.c
    - gxvalid
      - gxvalid.c
      - gxvalid.h
      - gxvbsln.c
      - gxvcommn.c
      - gxvcommn.h
      - gxverror.h
      - gxvfeat.c
      - gxvfeat.h
      - gxvfgen.c
      - gxvjust.c
      - gxvkern.c
      - gxvlcar.c
      - gxvmod.c
      - gxvmod.h
      - gxvmort.c
      - gxvmort.h
      - gxvmort0.c
      - gxvmort1.c
      - gxvmort2.c
      - gxvmort4.c
      - gxvmort5.c
      - gxvmorx.c
      - gxvmorx.h
      - gxvmorx0.c
      - gxvmorx1.c
      - gxvmorx2.c
      - gxvmorx4.c
      - gxvmorx5.c
      - gxvopbd.c
      - gxvprop.c
      - gxvtrak.c
      - Jamfile
      - module.mk
      - README
      - rules.mk
    - gzip
      - adler32.c
      - ftgzip.c
      - infblock.c
      - infblock.h
      - infcodes.c
      - infcodes.h
      - inffixed.h
      - inflate.c
      - inftrees.c
      - inftrees.h
      - infutil.c
      - infutil.h
      - Jamfile
      - rules.mk
      - zconf.h
      - zlib.h
      - zutil.c
      - zutil.h
    - Jamfile
    - lzw
      - ftlzw.c
      - ftzopen.c
      - ftzopen.h
      - Jamfile
      - rules.mk
    - otvalid
      - Jamfile
      - module.mk
      - otvalid.c
      - otvalid.h
      - otvbase.c
      - otvcommn.c
      - otvcommn.h
      - otverror.h
      - otvgdef.c
      - otvgpos.c
      - otvgpos.h
      - otvgsub.c
      - otvjstf.c
      - otvmath.c
      - otvmod.c
      - otvmod.h
      - rules.mk
    - pcf
      - Jamfile
      - module.mk
      - pcf.c
      - pcf.h
      - pcfdrivr.c
      - pcfdrivr.h
      - pcferror.h
      - pcfread.c
      - pcfread.h
      - pcfutil.c
      - pcfutil.h
      - README
      - rules.mk
    - pfr
      - Jamfile
      - module.mk
      - pfr.c
      - pfrcmap.c
      - pfrcmap.h
      - pfrdrivr.c
      - pfrdrivr.h
      - pfrerror.h
      - pfrgload.c
      - pfrgload.h
      - pfrload.c
      - pfrload.h
      - pfrobjs.c
      - pfrobjs.h
      - pfrsbit.c
      - pfrsbit.h
      - pfrtypes.h
      - rules.mk
    - psaux
      - afmparse.c
      - afmparse.h
      - Jamfile
      - module.mk
      - psaux.c
      - psauxerr.h
      - psauxmod.c
      - psauxmod.h
      - psconv.c
      - psconv.h
      - psobjs.c
      - psobjs.h
      - rules.mk
      - t1cmap.c
      - t1cmap.h
      - t1decode.c
      - t1decode.h
    - pshinter
      - Jamfile
      - module.mk
      - pshalgo.c
      - pshalgo.h
      - pshglob.c
      - pshglob.h
      - pshinter.c
      - pshmod.c
      - pshmod.h
      - pshnterr.h
      - pshpic.c
      - pshpic.h
      - pshrec.c
      - pshrec.h
      - rules.mk
    - psnames
      - Jamfile
      - module.mk
      - psmodule.c
      - psmodule.h
      - psnamerr.h
      - psnames.c
      - pspic.c
      - pspic.h
      - pstables.h
      - rules.mk
    - raster
      - ftmisc.h
      - ftraster.c
      - ftraster.h
      - ftrend1.c
      - ftrend1.h
      - Jamfile
      - module.mk
      - raster.c
      - rasterrs.h
      - rastpic.c
      - rastpic.h
      - rules.mk
    - sfnt
      - Jamfile
      - module.mk
      - rules.mk
      - sfdriver.c
      - sfdriver.h
      - sferrors.h
      - sfnt.c
      - sfntpic.c
      - sfntpic.h
      - sfobjs.c
      - sfobjs.h
      - ttbdf.c
      - ttbdf.h
      - ttcmap.c
      - ttcmap.h
      - ttcmapc.h
      - ttkern.c
      - ttkern.h
      - ttload.c
      - ttload.h
      - ttmtx.c
      - ttmtx.h
      - ttpost.c
      - ttpost.h
      - ttsbit.c
      - ttsbit.h
      - ttsbit0.c
    - smooth
      - ftgrays.c
      - ftgrays.h
      - ftsmerrs.h
      - ftsmooth.c
      - ftsmooth.h
      - ftspic.c
      - ftspic.h
      - Jamfile
      - module.mk
      - rules.mk
      - smooth.c
    - tools
      - apinames.c
      - chktrcmp.py
      - cordic.py
      - docmaker
        
        content.py
        
        docbeauty.py
        
        docmaker.py
        
        formatter.py
        
        sources.py
        
        tohtml.py
        
        utils.py
      - ftrandom
        
        ftrandom.c
        
        Makefile
        
        README
      - glnames.py
      - Jamfile
      - test_afm.c
      - test_bbox.c
      - test_trig.c
    - truetype
      - Jamfile
      - module.mk
      - rules.mk
      - truetype.c
      - ttdriver.c
      - ttdriver.h
      - tterrors.h
      - ttgload.c
      - ttgload.h
      - ttgxvar.c
      - ttgxvar.h
      - ttinterp.c
      - ttinterp.h
      - ttobjs.c
      - ttobjs.h
      - ttpic.c
      - ttpic.h
      - ttpload.c
      - ttpload.h
    - type1
      - Jamfile
      - module.mk
      - rules.mk
      - t1afm.c
      - t1afm.h
      - t1driver.c
      - t1driver.h
      - t1errors.h
      - t1gload.c
      - t1gload.h
      - t1load.c
      - t1load.h
      - t1objs.c
      - t1objs.h
      - t1parse.c
      - t1parse.h
      - t1tokens.h
      - type1.c
    - type42
      - Jamfile
      - module.mk
      - rules.mk
      - t42drivr.c
      - t42drivr.h
      - t42error.h
      - t42objs.c
      - t42objs.h
      - t42parse.c
      - t42parse.h
      - t42types.h
      - type42.c
    - winfonts
      - fnterrs.h
      - Jamfile
      - module.mk
      - rules.mk
      - winfnt.c
      - winfnt.h
  - version.sed
  - vms_make.com
OAGXML_DOCUMENTATION.zip
- OAGXML_DOCUMENTATION.rtf
OAG_2D_demo.zip
- OAGMFC.exe
OAG_2D_SRC.zip
- OAGMFC
  - Debug
    - UserImages.bmp
  - OAGMFC.sln
  - OAGMFC.suo
  - OAGMFC
    - CadTools
      - LineTool.cpp
      - LineTool.h
      - PolylineTool.cpp
      - PolylineTool.h
      - RasterImageRectTool.cpp
      - RasterImageRectTool.h
      - RasterImageTool.cpp
      - RasterImageTool.h
      - RasterImageTriangleTool.cpp
      - RasterImageTriangleTool.h
      - RectangleTool.cpp
      - RectangleTool.h
      - Text2DTool.cpp
      - Text2DTool.h
      - Tool.cpp
      - Tool.h
      - TriangleTool.cpp
      - TriangleTool.h
    - Debug
      - BuildLog.htm
    - MainFrm.cpp
    - MainFrm.h
    - OAGMFC.APS
    - OAGMFC.cpp
    - OAGMFC.h
    - OAGMFC.rc
    - OAGMFC.vcproj
    - OAGMFC.vcproj.NTHOME.Tucci.user
    - OAGMFCDoc.cpp
    - OAGMFCDoc.h
    - OAGMFCView.cpp
    - OAGMFCView.h
    - ReadMe.txt
    - Release
      - BuildLog.htm
    - RenderGraphicMFC.aps
    - res
      - OAGMFC.ico
      - OAGMFC.rc2
      - OAGMFCDoc.ico
      - Toolbar.bmp
      - Toolbar256.bmp
    - resource.h
    - stdafx.cpp
    - stdafx.h
    - targetver.h
    - UserImages.bmp
  - Release
    - UserImages.bmp
OAG_Demo.zip
- OAG_Demo
  - RenderGraphicMFC.exe
WinAPIs.zip
- WinAPIs
  - FTGL-2.1.3rc5
    - bin
      - Debug
        
        BuildLog.htm
      - FTGL-2.1.3rc5.sln
      - FTGL-2.1.3rc5.vcproj
      - Release
        
        BuildLog.htm
    - config.h
    - FTCharmap.cpp
    - FTCharmap.h
    - FTCharToGlyphIndexMap.h
    - FTContour.cpp
    - FTContour.h
    - FTFace.cpp
    - FTFace.h
    - FTGlyphContainer.cpp
    - FTGlyphContainer.h
    - FTLibrary.cpp
    - FTLibrary.h
    - FTList.h
    - FTVectoriser.cpp
    - FTVectoriser.h
    - include
    - src
      - FTGL
        
        FTBitmapFont.cpp
        
        FTBuffer.cpp
        
        FTBufferFont.cpp
        
        FTExtrudeFont.cpp
        
        FTFace.cpp
        
        FTFont.cpp
        
        FTFontGlue.cpp
        
        FTGlyphContainer.cpp
        
        FTOutlineFont.cpp
        
        FTPixmapFont.cpp
        
        FTPoint.cpp
        
        FTPolygonFont.cpp
        
        FTSize.cpp
        
        FTTextureFont.cpp
      - FTGlyph
        
        FTBitmapGlyph.cpp
        
        FTBufferGlyph.cpp
        
        FTExtrudeGlyph.cpp
        
        FTGlyph.cpp
        
        FTGlyphGlue.cpp
        
        FTOutlineGlyph.cpp
        
        FTPixmapGlyph.cpp
        
        FTPolygonGlyph.cpp
        
        FTTextureGlyph.cpp
      - FTLayout
        
        FTLayout.cpp
        
        FTLayoutGlue.cpp
        
        FTSimpleLayout.cpp
  - LibJPEG
    - Debug
      - BuildLog.htm
    - files
      - ansi2knr.c
      - cderror.h
      - cdjpeg.c
      - cdjpeg.h
      - cjpeg.c
      - ckconfig.c
      - djpeg.c
      - jaricom.c
      - jcapimin.c
      - jcapistd.c
      - jcarith.c
      - jccoefct.c
      - jccolor.c
      - jcdctmgr.c
      - jchuff.c
      - jcinit.c
      - jcmainct.c
      - jcmarker.c
      - jcmaster.c
      - jcomapi.c
      - jconfig.h
      - jcparam.c
      - jcprepct.c
      - jcsample.c
      - jctrans.c
      - jdapimin.c
      - jdapistd.c
      - jdarith.c
      - jdatadst.c
      - jdatasrc.c
      - jdcoefct.c
      - jdcolor.c
      - jdct.h
      - jddctmgr.c
      - jdhuff.c
      - jdinput.c
      - jdmainct.c
      - jdmarker.c
      - jdmaster.c
      - jdmerge.c
      - jdpostct.c
      - jdsample.c
      - jdtrans.c
      - jerror.c
      - jerror.h
      - jfdctflt.c
      - jfdctfst.c
      - jfdctint.c
      - jidctflt.c
      - jidctfst.c
      - jidctint.c
      - jinclude.h
      - jmemdosa.asm
      - jmemmgr.c
      - jmemnobs.c
      - jmemsys.h
      - jmorecfg.h
      - jpegint.h
      - jpeglib.h
      - jpegtran.c
      - jquant1.c
      - jquant2.c
      - jutils.c
      - jversion.h
      - libjpeg.map
      - rdbmp.c
      - rdcolmap.c
      - rdgif.c
      - rdjpgcom.c
      - rdppm.c
      - rdrle.c
      - rdswitch.c
      - rdtarga.c
      - transupp.c
      - transupp.h
      - wrbmp.c
      - wrgif.c
      - wrjpgcom.c
      - wrppm.c
      - wrrle.c
      - wrtarga.c
    - LibJPEG.sln
    - LibJPEG.vcproj
    - Release
      - BuildLog.htm
  - LibXML
    - acconfig.h
    - bin
      - Debug Win32
        
        LibXML
        
        Intermediate
        
        BuildLog.htm
      - Release Win32
        
        LibXML
        
        Intermediate
        
        BuildLog.htm
    - c14n.c
    - catalog.c
    - chvalid.c
    - Compatibility.txt
    - config.h
    - config-ps3.h
    - COPYING
    - Copyright
    - debugXML.c
    - dict.c
    - DOCBparser.c
    - elfgcchack.h
    - encoding.c
    - entities.c
    - error.c
    - globals.c
    - hash.c
    - HTMLparser.c
    - HTMLtree.c
    - include
      - libxml
        
        c14n.h
        
        catalog.h
        
        chvalid.h
        
        debugXML.h
        
        dict.h
        
        DOCBparser.h
        
        encoding.h
        
        entities.h
        
        globals.h
        
        hash.h
        
        HTMLparser.h
        
        HTMLtree.h
        
        list.h
        
        nanoftp.h
        
        nanohttp.h
        
        parser.h
        
        parserInternals.h
        
        pattern.h
        
        relaxng.h
        
        SAX.h
        
        SAX2.h
        
        schemasInternals.h
        
        threads.h
        
        tree.h
        
        uri.h
        
        valid.h
        
        xinclude.h
        
        xlink.h
        
        xmlautomata.h
        
        xmlerror.h
        
        xmlexports.h
        
        xmlIO.h
        
        xmlmemory.h
        
        xmlmodule.h
        
        xmlreader.h
        
        xmlregexp.h
        
        xmlsave.h
        
        xmlschemas.h
        
        xmlschemastypes.h
        
        xmlstring.h
        
        xmlunicode.h
        
        xmlversion.h
        
        xmlwriter.h
        
        xpath.h
        
        xpathInternals.h
        
        xpointer.h
      - win32config.h
      - wsockcompat.h
    - legacy.c
    - libxml.h
    - LibXML.sln
    - LibXML.vcproj
    - LibXML.vcproj.8.00.old
    - list.c
    - nanoftp.c
    - nanohttp.c
    - parser.c
    - parserInternals.c
    - pattern.c
    - relaxng.c
    - SAX.c
    - SAX2.c
    - threads.c
    - tree.c
    - uri.c
    - valid.c
    - xinclude.c
    - xlink.c
    - xmlcatalog.c
    - xmlIO.c
    - xmlmemory.c
    - xmlmodule.c
    - xmlreader.c
    - xmlregexp.c
    - xmlsave.c
    - xmlstring.c
    - xmlunicode.c
    - xmlwriter.c
  - XmlParserInterface
    - Debug
      - BuildLog.htm
    - Release
      - BuildLog.htm
    - XmlAttribute.h
    - XmlDocument.cpp
    - XmlDocument.h
    - XmlNode.cpp
    - XmlNode.h
    - XmlNodeList.h
    - XmlParserInterface.sln
    - XmlParserInterface.vcproj
    - XmlWriterDocument.cpp
    - XmlWriterDocument.h
xmlsamples.zip
- xmlsample
  - 2DTree.bmp
  - arial.ttf
  - Building2D.oagxml
  - fontMapping.oagxml
  - honda-pcx.jpg
  - line.oagxml
  - line2.oagxml
  - Polyline.oagxml
  - Rect.oagxml
  - shrub.jpg
  - TextureAndText.oagxml
  - Triangle.oagxml

/*
 * jfdctint.c
 *
 * Copyright (C) 1991-1996, Thomas G. Lane.
 * Modification developed 2003-2009 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
 * This file contains a slow-but-accurate integer implementation of the
 * forward DCT (Discrete Cosine Transform).
 *
 * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
 * on each column.  Direct algorithms are also available, but they are
 * much more complex and seem not to be any faster when reduced to code.
 *
 * This implementation is based on an algorithm described in
 *   C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
 *   Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
 *   Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
 * The primary algorithm described there uses 11 multiplies and 29 adds.
 * We use their alternate method with 12 multiplies and 32 adds.
 * The advantage of this method is that no data path contains more than one
 * multiplication; this allows a very simple and accurate implementation in
 * scaled fixed-point arithmetic, with a minimal number of shifts.
 *
 * We also provide FDCT routines with various input sample block sizes for
 * direct resolution reduction or enlargement and for direct resolving the
 * common 2x1 and 1x2 subsampling cases without additional resampling: NxN
 * (N=1...16), 2NxN, and Nx2N (N=1...8) pixels for one 8x8 output DCT block.
 *
 * For N<8 we fill the remaining block coefficients with zero.
 * For N>8 we apply a partial N-point FDCT on the input samples, computing
 * just the lower 8 frequency coefficients and discarding the rest.
 *
 * We must scale the output coefficients of the N-point FDCT appropriately
 * to the standard 8-point FDCT level by 8/N per 1-D pass.  This scaling
 * is folded into the constant multipliers (pass 2) and/or final/initial
 * shifting.
 *
 * CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases
 * since there would be too many additional constants to pre-calculate.
 */

#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
#include "jdct.h"		/* Private declarations for DCT subsystem */

#ifdef DCT_ISLOW_SUPPORTED


/*
 * This module is specialized to the case DCTSIZE = 8.
 */

#if DCTSIZE != 8
  Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */
#endif


/*
 * The poop on this scaling stuff is as follows:
 *
 * Each 1-D DCT step produces outputs which are a factor of sqrt(N)
 * larger than the true DCT outputs.  The final outputs are therefore
 * a factor of N larger than desired; since N=8 this can be cured by
 * a simple right shift at the end of the algorithm.  The advantage of
 * this arrangement is that we save two multiplications per 1-D DCT,
 * because the y0 and y4 outputs need not be divided by sqrt(N).
 * In the IJG code, this factor of 8 is removed by the quantization step
 * (in jcdctmgr.c), NOT in this module.
 *
 * We have to do addition and subtraction of the integer inputs, which
 * is no problem, and multiplication by fractional constants, which is
 * a problem to do in integer arithmetic.  We multiply all the constants
 * by CONST_SCALE and convert them to integer constants (thus retaining
 * CONST_BITS bits of precision in the constants).  After doing a
 * multiplication we have to divide the product by CONST_SCALE, with proper
 * rounding, to produce the correct output.  This division can be done
 * cheaply as a right shift of CONST_BITS bits.  We postpone shifting
 * as long as possible so that partial sums can be added together with
 * full fractional precision.
 *
 * The outputs of the first pass are scaled up by PASS1_BITS bits so that
 * they are represented to better-than-integral precision.  These outputs
 * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
 * with the recommended scaling.  (For 12-bit sample data, the intermediate
 * array is INT32 anyway.)
 *
 * To avoid overflow of the 32-bit intermediate results in pass 2, we must
 * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
 * shows that the values given below are the most effective.
 */

#if BITS_IN_JSAMPLE == 8
#define CONST_BITS  13
#define PASS1_BITS  2
#else
#define CONST_BITS  13
#define PASS1_BITS  1		/* lose a little precision to avoid overflow */
#endif

/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
 * causing a lot of useless floating-point operations at run time.
 * To get around this we use the following pre-calculated constants.
 * If you change CONST_BITS you may want to add appropriate values.
 * (With a reasonable C compiler, you can just rely on the FIX() macro...)
 */

#if CONST_BITS == 13
#define FIX_0_298631336  ((INT32)  2446)	/* FIX(0.298631336) */
#define FIX_0_390180644  ((INT32)  3196)	/* FIX(0.390180644) */
#define FIX_0_541196100  ((INT32)  4433)	/* FIX(0.541196100) */
#define FIX_0_765366865  ((INT32)  6270)	/* FIX(0.765366865) */
#define FIX_0_899976223  ((INT32)  7373)	/* FIX(0.899976223) */
#define FIX_1_175875602  ((INT32)  9633)	/* FIX(1.175875602) */
#define FIX_1_501321110  ((INT32)  12299)	/* FIX(1.501321110) */
#define FIX_1_847759065  ((INT32)  15137)	/* FIX(1.847759065) */
#define FIX_1_961570560  ((INT32)  16069)	/* FIX(1.961570560) */
#define FIX_2_053119869  ((INT32)  16819)	/* FIX(2.053119869) */
#define FIX_2_562915447  ((INT32)  20995)	/* FIX(2.562915447) */
#define FIX_3_072711026  ((INT32)  25172)	/* FIX(3.072711026) */
#else
#define FIX_0_298631336  FIX(0.298631336)
#define FIX_0_390180644  FIX(0.390180644)
#define FIX_0_541196100  FIX(0.541196100)
#define FIX_0_765366865  FIX(0.765366865)
#define FIX_0_899976223  FIX(0.899976223)
#define FIX_1_175875602  FIX(1.175875602)
#define FIX_1_501321110  FIX(1.501321110)
#define FIX_1_847759065  FIX(1.847759065)
#define FIX_1_961570560  FIX(1.961570560)
#define FIX_2_053119869  FIX(2.053119869)
#define FIX_2_562915447  FIX(2.562915447)
#define FIX_3_072711026  FIX(3.072711026)
#endif


/* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
 * For 8-bit samples with the recommended scaling, all the variable
 * and constant values involved are no more than 16 bits wide, so a
 * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
 * For 12-bit samples, a full 32-bit multiplication will be needed.
 */

#if BITS_IN_JSAMPLE == 8
#define MULTIPLY(var,const)  MULTIPLY16C16(var,const)
#else
#define MULTIPLY(var,const)  ((var) * (const))
#endif


/*
 * Perform the forward DCT on one block of samples.
 */

GLOBAL(void)
jpeg_fdct_islow (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
{
  INT32 tmp0, tmp1, tmp2, tmp3;
  INT32 tmp10, tmp11, tmp12, tmp13;
  INT32 z1;
  DCTELEM *dataptr;
  JSAMPROW elemptr;
  int ctr;
  SHIFT_TEMPS

  /* Pass 1: process rows. */
  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  /* furthermore, we scale the results by 2**PASS1_BITS. */

  dataptr = data;
  for (ctr = 0; ctr < DCTSIZE; ctr++) {
    elemptr = sample_data[ctr] + start_col;

    /* Even part per LL&M figure 1 --- note that published figure is faulty;
     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
     */

    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);

    tmp10 = tmp0 + tmp3;
    tmp12 = tmp0 - tmp3;
    tmp11 = tmp1 + tmp2;
    tmp13 = tmp1 - tmp2;

    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);

    /* Apply unsigned->signed conversion */
    dataptr[0] = (DCTELEM) ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << PASS1_BITS);
    dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);

    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
    /* Add fudge factor here for final descale. */
    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
    dataptr[2] = (DCTELEM) RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865),
				       CONST_BITS-PASS1_BITS);
    dataptr[6] = (DCTELEM) RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065),
				       CONST_BITS-PASS1_BITS);

    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
     * cK represents sqrt(2) * cos(K*pi/16).
     * i0..i3 in the paper are tmp0..tmp3 here.
     */

    tmp10 = tmp0 + tmp3;
    tmp11 = tmp1 + tmp2;
    tmp12 = tmp0 + tmp2;
    tmp13 = tmp1 + tmp3;
    z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /*  c3 */
    /* Add fudge factor here for final descale. */
    z1 += ONE << (CONST_BITS-PASS1_BITS-1);

    tmp0  = MULTIPLY(tmp0,    FIX_1_501321110);    /*  c1+c3-c5-c7 */
    tmp1  = MULTIPLY(tmp1,    FIX_3_072711026);    /*  c1+c3+c5-c7 */
    tmp2  = MULTIPLY(tmp2,    FIX_2_053119869);    /*  c1+c3-c5+c7 */
    tmp3  = MULTIPLY(tmp3,    FIX_0_298631336);    /* -c1+c3+c5-c7 */
    tmp10 = MULTIPLY(tmp10, - FIX_0_899976223);    /*  c7-c3 */
    tmp11 = MULTIPLY(tmp11, - FIX_2_562915447);    /* -c1-c3 */
    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);    /*  c5-c3 */
    tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);    /* -c3-c5 */

    tmp12 += z1;
    tmp13 += z1;

    dataptr[1] = (DCTELEM)
      RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS);
    dataptr[3] = (DCTELEM)
      RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS);
    dataptr[5] = (DCTELEM)
      RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS);
    dataptr[7] = (DCTELEM)
      RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS);

    dataptr += DCTSIZE;		/* advance pointer to next row */
  }

  /* Pass 2: process columns.
   * We remove the PASS1_BITS scaling, but leave the results scaled up
   * by an overall factor of 8.
   */

  dataptr = data;
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
    /* Even part per LL&M figure 1 --- note that published figure is faulty;
     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
     */

    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];

    /* Add fudge factor here for final descale. */
    tmp10 = tmp0 + tmp3 + (ONE << (PASS1_BITS-1));
    tmp12 = tmp0 - tmp3;
    tmp11 = tmp1 + tmp2;
    tmp13 = tmp1 - tmp2;

    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
    tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];

    dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS1_BITS);
    dataptr[DCTSIZE*4] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS1_BITS);

    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
    /* Add fudge factor here for final descale. */
    z1 += ONE << (CONST_BITS+PASS1_BITS-1);
    dataptr[DCTSIZE*2] = (DCTELEM)
      RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*6] = (DCTELEM)
      RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), CONST_BITS+PASS1_BITS);

    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
     * cK represents sqrt(2) * cos(K*pi/16).
     * i0..i3 in the paper are tmp0..tmp3 here.
     */

    tmp10 = tmp0 + tmp3;
    tmp11 = tmp1 + tmp2;
    tmp12 = tmp0 + tmp2;
    tmp13 = tmp1 + tmp3;
    z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /*  c3 */
    /* Add fudge factor here for final descale. */
    z1 += ONE << (CONST_BITS+PASS1_BITS-1);

    tmp0  = MULTIPLY(tmp0,    FIX_1_501321110);    /*  c1+c3-c5-c7 */
    tmp1  = MULTIPLY(tmp1,    FIX_3_072711026);    /*  c1+c3+c5-c7 */
    tmp2  = MULTIPLY(tmp2,    FIX_2_053119869);    /*  c1+c3-c5+c7 */
    tmp3  = MULTIPLY(tmp3,    FIX_0_298631336);    /* -c1+c3+c5-c7 */
    tmp10 = MULTIPLY(tmp10, - FIX_0_899976223);    /*  c7-c3 */
    tmp11 = MULTIPLY(tmp11, - FIX_2_562915447);    /* -c1-c3 */
    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);    /*  c5-c3 */
    tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);    /* -c3-c5 */

    tmp12 += z1;
    tmp13 += z1;

    dataptr[DCTSIZE*1] = (DCTELEM)
      RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*3] = (DCTELEM)
      RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*5] = (DCTELEM)
      RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*7] = (DCTELEM)
      RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS+PASS1_BITS);

    dataptr++;			/* advance pointer to next column */
  }
}

#ifdef DCT_SCALING_SUPPORTED


/*
 * Perform the forward DCT on a 7x7 sample block.
 */

GLOBAL(void)
jpeg_fdct_7x7 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
{
  INT32 tmp0, tmp1, tmp2, tmp3;
  INT32 tmp10, tmp11, tmp12;
  INT32 z1, z2, z3;
  DCTELEM *dataptr;
  JSAMPROW elemptr;
  int ctr;
  SHIFT_TEMPS

  /* Pre-zero output coefficient block. */
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);

  /* Pass 1: process rows. */
  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  /* furthermore, we scale the results by 2**PASS1_BITS. */
  /* cK represents sqrt(2) * cos(K*pi/14). */

  dataptr = data;
  for (ctr = 0; ctr < 7; ctr++) {
    elemptr = sample_data[ctr] + start_col;

    /* Even part */

    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]);
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]);
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]);
    tmp3 = GETJSAMPLE(elemptr[3]);

    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]);
    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]);
    tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]);

    z1 = tmp0 + tmp2;
    /* Apply unsigned->signed conversion */
    dataptr[0] = (DCTELEM)
      ((z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE) << PASS1_BITS);
    tmp3 += tmp3;
    z1 -= tmp3;
    z1 -= tmp3;
    z1 = MULTIPLY(z1, FIX(0.353553391));                /* (c2+c6-c4)/2 */
    z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002));       /* (c2+c4-c6)/2 */
    z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123));       /* c6 */
    dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS);
    z1 -= z2;
    z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734));       /* c4 */
    dataptr[4] = (DCTELEM)
      DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */
	      CONST_BITS-PASS1_BITS);
    dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS);

    /* Odd part */

    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347));   /* (c3+c1-c5)/2 */
    tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339));   /* (c3+c5-c1)/2 */
    tmp0 = tmp1 - tmp2;
    tmp1 += tmp2;
    tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.378756276)); /* -c1 */
    tmp1 += tmp2;
    tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.613604268));   /* c5 */
    tmp0 += tmp3;
    tmp2 += tmp3 + MULTIPLY(tmp12, FIX(1.870828693));   /* c3+c1-c5 */

    dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS);
    dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS);
    dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS);

    dataptr += DCTSIZE;		/* advance pointer to next row */
  }

  /* Pass 2: process columns.
   * We remove the PASS1_BITS scaling, but leave the results scaled up
   * by an overall factor of 8.
   * We must also scale the output by (8/7)**2 = 64/49, which we fold
   * into the constant multipliers:
   * cK now represents sqrt(2) * cos(K*pi/14) * 64/49.
   */

  dataptr = data;
  for (ctr = 0; ctr < 7; ctr++) {
    /* Even part */

    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*6];
    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*5];
    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*4];
    tmp3 = dataptr[DCTSIZE*3];

    tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*6];
    tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*5];
    tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*4];

    z1 = tmp0 + tmp2;
    dataptr[DCTSIZE*0] = (DCTELEM)
      DESCALE(MULTIPLY(z1 + tmp1 + tmp3, FIX(1.306122449)), /* 64/49 */
	      CONST_BITS+PASS1_BITS);
    tmp3 += tmp3;
    z1 -= tmp3;
    z1 -= tmp3;
    z1 = MULTIPLY(z1, FIX(0.461784020));                /* (c2+c6-c4)/2 */
    z2 = MULTIPLY(tmp0 - tmp2, FIX(1.202428084));       /* (c2+c4-c6)/2 */
    z3 = MULTIPLY(tmp1 - tmp2, FIX(0.411026446));       /* c6 */
    dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS+PASS1_BITS);
    z1 -= z2;
    z2 = MULTIPLY(tmp0 - tmp1, FIX(1.151670509));       /* c4 */
    dataptr[DCTSIZE*4] = (DCTELEM)
      DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.923568041)), /* c2+c6-c4 */
	      CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+PASS1_BITS);

    /* Odd part */

    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.221765677));   /* (c3+c1-c5)/2 */
    tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.222383464));   /* (c3+c5-c1)/2 */
    tmp0 = tmp1 - tmp2;
    tmp1 += tmp2;
    tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.800824523)); /* -c1 */
    tmp1 += tmp2;
    tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.801442310));   /* c5 */
    tmp0 += tmp3;
    tmp2 += tmp3 + MULTIPLY(tmp12, FIX(2.443531355));   /* c3+c1-c5 */

    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS1_BITS);

    dataptr++;			/* advance pointer to next column */
  }
}


/*
 * Perform the forward DCT on a 6x6 sample block.
 */

GLOBAL(void)
jpeg_fdct_6x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
{
  INT32 tmp0, tmp1, tmp2;
  INT32 tmp10, tmp11, tmp12;
  DCTELEM *dataptr;
  JSAMPROW elemptr;
  int ctr;
  SHIFT_TEMPS

  /* Pre-zero output coefficient block. */
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);

  /* Pass 1: process rows. */
  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  /* furthermore, we scale the results by 2**PASS1_BITS. */
  /* cK represents sqrt(2) * cos(K*pi/12). */

  dataptr = data;
  for (ctr = 0; ctr < 6; ctr++) {
    elemptr = sample_data[ctr] + start_col;

    /* Even part */

    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
    tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);

    tmp10 = tmp0 + tmp2;
    tmp12 = tmp0 - tmp2;

    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);

    /* Apply unsigned->signed conversion */
    dataptr[0] = (DCTELEM)
      ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << PASS1_BITS);
    dataptr[2] = (DCTELEM)
      DESCALE(MULTIPLY(tmp12, FIX(1.224744871)),                 /* c2 */
	      CONST_BITS-PASS1_BITS);
    dataptr[4] = (DCTELEM)
      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
	      CONST_BITS-PASS1_BITS);

    /* Odd part */

    tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)),     /* c5 */
		    CONST_BITS-PASS1_BITS);

    dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS));
    dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS);
    dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS));

    dataptr += DCTSIZE;		/* advance pointer to next row */
  }

  /* Pass 2: process columns.
   * We remove the PASS1_BITS scaling, but leave the results scaled up
   * by an overall factor of 8.
   * We must also scale the output by (8/6)**2 = 16/9, which we fold
   * into the constant multipliers:
   * cK now represents sqrt(2) * cos(K*pi/12) * 16/9.
   */

  dataptr = data;
  for (ctr = 0; ctr < 6; ctr++) {
    /* Even part */

    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5];
    tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4];
    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];

    tmp10 = tmp0 + tmp2;
    tmp12 = tmp0 - tmp2;

    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5];
    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4];
    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];

    dataptr[DCTSIZE*0] = (DCTELEM)
      DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)),         /* 16/9 */
	      CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*2] = (DCTELEM)
      DESCALE(MULTIPLY(tmp12, FIX(2.177324216)),                 /* c2 */
	      CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*4] = (DCTELEM)
      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
	      CONST_BITS+PASS1_BITS);

    /* Odd part */

    tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829));             /* c5 */

    dataptr[DCTSIZE*1] = (DCTELEM)
      DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)),   /* 16/9 */
	      CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*3] = (DCTELEM)
      DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)),    /* 16/9 */
	      CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*5] = (DCTELEM)
      DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)),   /* 16/9 */
	      CONST_BITS+PASS1_BITS);

    dataptr++;			/* advance pointer to next column */
  }
}


/*
 * Perform the forward DCT on a 5x5 sample block.
 */

GLOBAL(void)
jpeg_fdct_5x5 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
{
  INT32 tmp0, tmp1, tmp2;
  INT32 tmp10, tmp11;
  DCTELEM *dataptr;
  JSAMPROW elemptr;
  int ctr;
  SHIFT_TEMPS

  /* Pre-zero output coefficient block. */
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);

  /* Pass 1: process rows. */
  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  /* furthermore, we scale the results by 2**PASS1_BITS. */
  /* We scale the results further by 2 as part of output adaption */
  /* scaling for different DCT size. */
  /* cK represents sqrt(2) * cos(K*pi/10). */

  dataptr = data;
  for (ctr = 0; ctr < 5; ctr++) {
    elemptr = sample_data[ctr] + start_col;

    /* Even part */

    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]);
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]);
    tmp2 = GETJSAMPLE(elemptr[2]);

    tmp10 = tmp0 + tmp1;
    tmp11 = tmp0 - tmp1;

    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]);
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]);

    /* Apply unsigned->signed conversion */
    dataptr[0] = (DCTELEM)
      ((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << (PASS1_BITS+1));
    tmp11 = MULTIPLY(tmp11, FIX(0.790569415));          /* (c2+c4)/2 */
    tmp10 -= tmp2 << 2;
    tmp10 = MULTIPLY(tmp10, FIX(0.353553391));          /* (c2-c4)/2 */
    dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS-1);
    dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS-1);

    /* Odd part */

    tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876));    /* c3 */

    dataptr[1] = (DCTELEM)
      DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */
	      CONST_BITS-PASS1_BITS-1);
    dataptr[3] = (DCTELEM)
      DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */
	      CONST_BITS-PASS1_BITS-1);

    dataptr += DCTSIZE;		/* advance pointer to next row */
  }

  /* Pass 2: process columns.
   * We remove the PASS1_BITS scaling, but leave the results scaled up
   * by an overall factor of 8.
   * We must also scale the output by (8/5)**2 = 64/25, which we partially
   * fold into the constant multipliers (other part was done in pass 1):
   * cK now represents sqrt(2) * cos(K*pi/10) * 32/25.
   */

  dataptr = data;
  for (ctr = 0; ctr < 5; ctr++) {
    /* Even part */

    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*4];
    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*3];
    tmp2 = dataptr[DCTSIZE*2];

    tmp10 = tmp0 + tmp1;
    tmp11 = tmp0 - tmp1;

    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*4];
    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*3];

    dataptr[DCTSIZE*0] = (DCTELEM)
      DESCALE(MULTIPLY(tmp10 + tmp2, FIX(1.28)),        /* 32/25 */
	      CONST_BITS+PASS1_BITS);
    tmp11 = MULTIPLY(tmp11, FIX(1.011928851));          /* (c2+c4)/2 */
    tmp10 -= tmp2 << 2;
    tmp10 = MULTIPLY(tmp10, FIX(0.452548340));          /* (c2-c4)/2 */
    dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS+PASS1_BITS);

    /* Odd part */

    tmp10 = MULTIPLY(tmp0 + tmp1, FIX(1.064004961));    /* c3 */

    dataptr[DCTSIZE*1] = (DCTELEM)
      DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.657591230)), /* c1-c3 */
	      CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*3] = (DCTELEM)
      DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.785601151)), /* c1+c3 */
	      CONST_BITS+PASS1_BITS);

    dataptr++;			/* advance pointer to next column */
  }
}


/*
 * Perform the forward DCT on a 4x4 sample block.
 */

GLOBAL(void)
jpeg_fdct_4x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
{
  INT32 tmp0, tmp1;
  INT32 tmp10, tmp11;
  DCTELEM *dataptr;
  JSAMPROW elemptr;
  int ctr;
  SHIFT_TEMPS

  /* Pre-zero output coefficient block. */
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);

  /* Pass 1: process rows. */
  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  /* furthermore, we scale the results by 2**PASS1_BITS. */
  /* We must also scale the output by (8/4)**2 = 2**2, which we add here. */
  /* cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT]. */

  dataptr = data;
  for (ctr = 0; ctr < 4; ctr++) {
    elemptr = sample_data[ctr] + start_col;

    /* Even part */

    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);

    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);

    /* Apply unsigned->signed conversion */
    dataptr[0] = (DCTELEM)
      ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+2));
    dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+2));

    /* Odd part */

    tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
    /* Add fudge factor here for final descale. */
    tmp0 += ONE << (CONST_BITS-PASS1_BITS-3);

    dataptr[1] = (DCTELEM)
      RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
		  CONST_BITS-PASS1_BITS-2);
    dataptr[3] = (DCTELEM)
      RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
		  CONST_BITS-PASS1_BITS-2);

    dataptr += DCTSIZE;		/* advance pointer to next row */
  }

  /* Pass 2: process columns.
   * We remove the PASS1_BITS scaling, but leave the results scaled up
   * by an overall factor of 8.
   */

  dataptr = data;
  for (ctr = 0; ctr < 4; ctr++) {
    /* Even part */

    /* Add fudge factor here for final descale. */
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3] + (ONE << (PASS1_BITS-1));
    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];

    tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
    tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];

    dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
    dataptr[DCTSIZE*2] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);

    /* Odd part */

    tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
    /* Add fudge factor here for final descale. */
    tmp0 += ONE << (CONST_BITS+PASS1_BITS-1);

    dataptr[DCTSIZE*1] = (DCTELEM)
      RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
		  CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*3] = (DCTELEM)
      RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
		  CONST_BITS+PASS1_BITS);

    dataptr++;			/* advance pointer to next column */
  }
}


/*
 * Perform the forward DCT on a 3x3 sample block.
 */

GLOBAL(void)
jpeg_fdct_3x3 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
{
  INT32 tmp0, tmp1, tmp2;
  DCTELEM *dataptr;
  JSAMPROW elemptr;
  int ctr;
  SHIFT_TEMPS

  /* Pre-zero output coefficient block. */
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);

  /* Pass 1: process rows. */
  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  /* furthermore, we scale the results by 2**PASS1_BITS. */
  /* We scale the results further by 2**2 as part of output adaption */
  /* scaling for different DCT size. */
  /* cK represents sqrt(2) * cos(K*pi/6). */

  dataptr = data;
  for (ctr = 0; ctr < 3; ctr++) {
    elemptr = sample_data[ctr] + start_col;

    /* Even part */

    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]);
    tmp1 = GETJSAMPLE(elemptr[1]);

    tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]);

    /* Apply unsigned->signed conversion */
    dataptr[0] = (DCTELEM)
      ((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+2));
    dataptr[2] = (DCTELEM)
      DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */
	      CONST_BITS-PASS1_BITS-2);

    /* Odd part */

    dataptr[1] = (DCTELEM)
      DESCALE(MULTIPLY(tmp2, FIX(1.224744871)),               /* c1 */
	      CONST_BITS-PASS1_BITS-2);

    dataptr += DCTSIZE;		/* advance pointer to next row */
  }

  /* Pass 2: process columns.
   * We remove the PASS1_BITS scaling, but leave the results scaled up
   * by an overall factor of 8.
   * We must also scale the output by (8/3)**2 = 64/9, which we partially
   * fold into the constant multipliers (other part was done in pass 1):
   * cK now represents sqrt(2) * cos(K*pi/6) * 16/9.
   */

  dataptr = data;
  for (ctr = 0; ctr < 3; ctr++) {
    /* Even part */

    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*2];
    tmp1 = dataptr[DCTSIZE*1];

    tmp2 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*2];

    dataptr[DCTSIZE*0] = (DCTELEM)
      DESCALE(MULTIPLY(tmp0 + tmp1, FIX(1.777777778)),        /* 16/9 */
	      CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*2] = (DCTELEM)
      DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(1.257078722)), /* c2 */
	      CONST_BITS+PASS1_BITS);

    /* Odd part */

    dataptr[DCTSIZE*1] = (DCTELEM)
      DESCALE(MULTIPLY(tmp2, FIX(2.177324216)),               /* c1 */
	      CONST_BITS+PASS1_BITS);

    dataptr++;			/* advance pointer to next column */
  }
}


/*
 * Perform the forward DCT on a 2x2 sample block.
 */

GLOBAL(void)
jpeg_fdct_2x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
{
  INT32 tmp0, tmp1, tmp2, tmp3;
  JSAMPROW elemptr;

  /* Pre-zero output coefficient block. */
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);

  /* Pass 1: process rows. */
  /* Note results are scaled up by sqrt(8) compared to a true DCT. */

  /* Row 0 */
  elemptr = sample_data[0] + start_col;

  tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]);
  tmp1 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]);

  /* Row 1 */
  elemptr = sample_data[1] + start_col;

  tmp2 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]);
  tmp3 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]);

  /* Pass 2: process columns.
   * We leave the results scaled up by an overall factor of 8.
   * We must also scale the output by (8/2)**2 = 2**4.
   */

  /* Column 0 */
  /* Apply unsigned->signed conversion */
  data[DCTSIZE*0] = (DCTELEM) ((tmp0 + tmp2 - 4 * CENTERJSAMPLE) << 4);
  data[DCTSIZE*1] = (DCTELEM) ((tmp0 - tmp2) << 4);

  /* Column 1 */
  data[DCTSIZE*0+1] = (DCTELEM) ((tmp1 + tmp3) << 4);
  data[DCTSIZE*1+1] = (DCTELEM) ((tmp1 - tmp3) << 4);
}


/*
 * Perform the forward DCT on a 1x1 sample block.
 */

GLOBAL(void)
jpeg_fdct_1x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
{
  /* Pre-zero output coefficient block. */
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);

  /* We leave the result scaled up by an overall factor of 8. */
  /* We must also scale the output by (8/1)**2 = 2**6. */
  /* Apply unsigned->signed conversion */
  data[0] = (DCTELEM)
    ((GETJSAMPLE(sample_data[0][start_col]) - CENTERJSAMPLE) << 6);
}


/*
 * Perform the forward DCT on a 9x9 sample block.
 */

GLOBAL(void)
jpeg_fdct_9x9 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
{
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
  INT32 tmp10, tmp11, tmp12, tmp13;
  INT32 z1, z2;
  DCTELEM workspace[8];
  DCTELEM *dataptr;
  DCTELEM *wsptr;
  JSAMPROW elemptr;
  int ctr;
  SHIFT_TEMPS

  /* Pass 1: process rows. */
  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  /* we scale the results further by 2 as part of output adaption */
  /* scaling for different DCT size. */
  /* cK represents sqrt(2) * cos(K*pi/18). */

  dataptr = data;
  ctr = 0;
  for (;;) {
    elemptr = sample_data[ctr] + start_col;

    /* Even part */

    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[8]);
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[7]);
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[6]);
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[5]);
    tmp4 = GETJSAMPLE(elemptr[4]);

    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[8]);
    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[7]);
    tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[6]);
    tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[5]);

    z1 = tmp0 + tmp2 + tmp3;
    z2 = tmp1 + tmp4;
    /* Apply unsigned->signed conversion */
    dataptr[0] = (DCTELEM) ((z1 + z2 - 9 * CENTERJSAMPLE) << 1);
    dataptr[6] = (DCTELEM)
      DESCALE(MULTIPLY(z1 - z2 - z2, FIX(0.707106781)),  /* c6 */
	      CONST_BITS-1);
    z1 = MULTIPLY(tmp0 - tmp2, FIX(1.328926049));        /* c2 */
    z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(0.707106781)); /* c6 */
    dataptr[2] = (DCTELEM)
      DESCALE(MULTIPLY(tmp2 - tmp3, FIX(1.083350441))    /* c4 */
	      + z1 + z2, CONST_BITS-1);
    dataptr[4] = (DCTELEM)
      DESCALE(MULTIPLY(tmp3 - tmp0, FIX(0.245575608))    /* c8 */
	      + z1 - z2, CONST_BITS-1);

    /* Odd part */

    dataptr[3] = (DCTELEM)
      DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(1.224744871)), /* c3 */
	      CONST_BITS-1);

    tmp11 = MULTIPLY(tmp11, FIX(1.224744871));        /* c3 */
    tmp0 = MULTIPLY(tmp10 + tmp12, FIX(0.909038955)); /* c5 */
    tmp1 = MULTIPLY(tmp10 + tmp13, FIX(0.483689525)); /* c7 */

    dataptr[1] = (DCTELEM) DESCALE(tmp11 + tmp0 + tmp1, CONST_BITS-1);

    tmp2 = MULTIPLY(tmp12 - tmp13, FIX(1.392728481)); /* c1 */

    dataptr[5] = (DCTELEM) DESCALE(tmp0 - tmp11 - tmp2, CONST_BITS-1);
    dataptr[7] = (DCTELEM) DESCALE(tmp1 - tmp11 + tmp2, CONST_BITS-1);

    ctr++;

    if (ctr != DCTSIZE) {
      if (ctr == 9)
	break;			/* Done. */
      dataptr += DCTSIZE;	/* advance pointer to next row */
    } else
      dataptr = workspace;	/* switch pointer to extended workspace */
  }

  /* Pass 2: process columns.
   * We leave the results scaled up by an overall factor of 8.
   * We must also scale the output by (8/9)**2 = 64/81, which we partially
   * fold into the constant multipliers and final/initial shifting:
   * cK now represents sqrt(2) * cos(K*pi/18) * 128/81.
   */

  dataptr = data;
  wsptr = workspace;
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
    /* Even part */

    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*0];
    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*7];
    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*6];
    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*5];
    tmp4 = dataptr[DCTSIZE*4];

    tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*0];
    tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*7];
    tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*6];
    tmp13 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*5];

    z1 = tmp0 + tmp2 + tmp3;
    z2 = tmp1 + tmp4;
    dataptr[DCTSIZE*0] = (DCTELEM)
      DESCALE(MULTIPLY(z1 + z2, FIX(1.580246914)),       /* 128/81 */
	      CONST_BITS+2);
    dataptr[DCTSIZE*6] = (DCTELEM)
      DESCALE(MULTIPLY(z1 - z2 - z2, FIX(1.117403309)),  /* c6 */
	      CONST_BITS+2);
    z1 = MULTIPLY(tmp0 - tmp2, FIX(2.100031287));        /* c2 */
    z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(1.117403309)); /* c6 */
    dataptr[DCTSIZE*2] = (DCTELEM)
      DESCALE(MULTIPLY(tmp2 - tmp3, FIX(1.711961190))    /* c4 */
	      + z1 + z2, CONST_BITS+2);
    dataptr[DCTSIZE*4] = (DCTELEM)
      DESCALE(MULTIPLY(tmp3 - tmp0, FIX(0.388070096))    /* c8 */
	      + z1 - z2, CONST_BITS+2);

    /* Odd part */

    dataptr[DCTSIZE*3] = (DCTELEM)
      DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(1.935399303)), /* c3 */
	      CONST_BITS+2);

    tmp11 = MULTIPLY(tmp11, FIX(1.935399303));        /* c3 */
    tmp0 = MULTIPLY(tmp10 + tmp12, FIX(1.436506004)); /* c5 */
    tmp1 = MULTIPLY(tmp10 + tmp13, FIX(0.764348879)); /* c7 */

    dataptr[DCTSIZE*1] = (DCTELEM)
      DESCALE(tmp11 + tmp0 + tmp1, CONST_BITS+2);

    tmp2 = MULTIPLY(tmp12 - tmp13, FIX(2.200854883)); /* c1 */

    dataptr[DCTSIZE*5] = (DCTELEM)
      DESCALE(tmp0 - tmp11 - tmp2, CONST_BITS+2);
    dataptr[DCTSIZE*7] = (DCTELEM)
      DESCALE(tmp1 - tmp11 + tmp2, CONST_BITS+2);

    dataptr++;			/* advance pointer to next column */
    wsptr++;			/* advance pointer to next column */
  }
}


/*
 * Perform the forward DCT on a 10x10 sample block.
 */

GLOBAL(void)
jpeg_fdct_10x10 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
{
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
  DCTELEM workspace[8*2];
  DCTELEM *dataptr;
  DCTELEM *wsptr;
  JSAMPROW elemptr;
  int ctr;
  SHIFT_TEMPS

  /* Pass 1: process rows. */
  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  /* we scale the results further by 2 as part of output adaption */
  /* scaling for different DCT size. */
  /* cK represents sqrt(2) * cos(K*pi/20). */

  dataptr = data;
  ctr = 0;
  for (;;) {
    elemptr = sample_data[ctr] + start_col;

    /* Even part */

    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]);
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]);
    tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]);
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]);
    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]);

    tmp10 = tmp0 + tmp4;
    tmp13 = tmp0 - tmp4;
    tmp11 = tmp1 + tmp3;
    tmp14 = tmp1 - tmp3;

    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]);
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]);
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]);
    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]);
    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]);

    /* Apply unsigned->signed conversion */
    dataptr[0] = (DCTELEM)
      ((tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE) << 1);
    tmp12 += tmp12;
    dataptr[4] = (DCTELEM)
      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */
	      MULTIPLY(tmp11 - tmp12, FIX(0.437016024)),  /* c8 */
	      CONST_BITS-1);
    tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876));    /* c6 */
    dataptr[2] = (DCTELEM)
      DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)),  /* c2-c6 */
	      CONST_BITS-1);
    dataptr[6] = (DCTELEM)
      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)),  /* c2+c6 */
	      CONST_BITS-1);

    /* Odd part */

    tmp10 = tmp0 + tmp4;
    tmp11 = tmp1 - tmp3;
    dataptr[5] = (DCTELEM) ((tmp10 - tmp11 - tmp2) << 1);
    tmp2 <<= CONST_BITS;
    dataptr[1] = (DCTELEM)
      DESCALE(MULTIPLY(tmp0, FIX(1.396802247)) +          /* c1 */
	      MULTIPLY(tmp1, FIX(1.260073511)) + tmp2 +   /* c3 */
	      MULTIPLY(tmp3, FIX(0.642039522)) +          /* c7 */
	      MULTIPLY(tmp4, FIX(0.221231742)),           /* c9 */
	      CONST_BITS-1);
    tmp12 = MULTIPLY(tmp0 - tmp4, FIX(0.951056516)) -     /* (c3+c7)/2 */
	    MULTIPLY(tmp1 + tmp3, FIX(0.587785252));      /* (c1-c9)/2 */
    tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.309016994)) +   /* (c3-c7)/2 */
	    (tmp11 << (CONST_BITS - 1)) - tmp2;
    dataptr[3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS-1);
    dataptr[7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS-1);

    ctr++;

    if (ctr != DCTSIZE) {
      if (ctr == 10)
	break;			/* Done. */
      dataptr += DCTSIZE;	/* advance pointer to next row */
    } else
      dataptr = workspace;	/* switch pointer to extended workspace */
  }

  /* Pass 2: process columns.
   * We leave the results scaled up by an overall factor of 8.
   * We must also scale the output by (8/10)**2 = 16/25, which we partially
   * fold into the constant multipliers and final/initial shifting:
   * cK now represents sqrt(2) * cos(K*pi/20) * 32/25.
   */

  dataptr = data;
  wsptr = workspace;
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
    /* Even part */

    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*1];
    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*0];
    tmp12 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*7];
    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*6];
    tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*5];

    tmp10 = tmp0 + tmp4;
    tmp13 = tmp0 - tmp4;
    tmp11 = tmp1 + tmp3;
    tmp14 = tmp1 - tmp3;

    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*1];
    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*0];
    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*7];
    tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*6];
    tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*5];

    dataptr[DCTSIZE*0] = (DCTELEM)
      DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(1.28)), /* 32/25 */
	      CONST_BITS+2);
    tmp12 += tmp12;
    dataptr[DCTSIZE*4] = (DCTELEM)
      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.464477191)) - /* c4 */
	      MULTIPLY(tmp11 - tmp12, FIX(0.559380511)),  /* c8 */
	      CONST_BITS+2);
    tmp10 = MULTIPLY(tmp13 + tmp14, FIX(1.064004961));    /* c6 */
    dataptr[DCTSIZE*2] = (DCTELEM)
      DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.657591230)),  /* c2-c6 */
	      CONST_BITS+2);
    dataptr[DCTSIZE*6] = (DCTELEM)
      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.785601151)),  /* c2+c6 */
	      CONST_BITS+2);

    /* Odd part */

    tmp10 = tmp0 + tmp4;
    tmp11 = tmp1 - tmp3;
    dataptr[DCTSIZE*5] = (DCTELEM)
      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp2, FIX(1.28)),  /* 32/25 */
	      CONST_BITS+2);
    tmp2 = MULTIPLY(tmp2, FIX(1.28));                     /* 32/25 */
    dataptr[DCTSIZE*1] = (DCTELEM)
      DESCALE(MULTIPLY(tmp0, FIX(1.787906876)) +          /* c1 */
	      MULTIPLY(tmp1, FIX(1.612894094)) + tmp2 +   /* c3 */
	      MULTIPLY(tmp3, FIX(0.821810588)) +          /* c7 */
	      MULTIPLY(tmp4, FIX(0.283176630)),           /* c9 */
	      CONST_BITS+2);
    tmp12 = MULTIPLY(tmp0 - tmp4, FIX(1.217352341)) -     /* (c3+c7)/2 */
	    MULTIPLY(tmp1 + tmp3, FIX(0.752365123));      /* (c1-c9)/2 */
    tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.395541753)) +   /* (c3-c7)/2 */
	    MULTIPLY(tmp11, FIX(0.64)) - tmp2;            /* 16/25 */
    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS+2);
    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS+2);

    dataptr++;			/* advance pointer to next column */
    wsptr++;			/* advance pointer to next column */
  }
}


/*
 * Perform the forward DCT on an 11x11 sample block.
 */

GLOBAL(void)
jpeg_fdct_11x11 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
{
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
  INT32 z1, z2, z3;
  DCTELEM workspace[8*3];
  DCTELEM *dataptr;
  DCTELEM *wsptr;
  JSAMPROW elemptr;
  int ctr;
  SHIFT_TEMPS

  /* Pass 1: process rows. */
  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  /* we scale the results further by 2 as part of output adaption */
  /* scaling for different DCT size. */
  /* cK represents sqrt(2) * cos(K*pi/22). */

  dataptr = data;
  ctr = 0;
  for (;;) {
    elemptr = sample_data[ctr] + start_col;

    /* Even part */

    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[10]);
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[9]);
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[8]);
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[7]);
    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[6]);
    tmp5 = GETJSAMPLE(elemptr[5]);

    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[10]);
    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[9]);
    tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[8]);
    tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[7]);
    tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[6]);

    /* Apply unsigned->signed conversion */
    dataptr[0] = (DCTELEM)
      ((tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 - 11 * CENTERJSAMPLE) << 1);
    tmp5 += tmp5;
    tmp0 -= tmp5;
    tmp1 -= tmp5;
    tmp2 -= tmp5;
    tmp3 -= tmp5;
    tmp4 -= tmp5;
    z1 = MULTIPLY(tmp0 + tmp3, FIX(1.356927976)) +       /* c2 */
	 MULTIPLY(tmp2 + tmp4, FIX(0.201263574));        /* c10 */
    z2 = MULTIPLY(tmp1 - tmp3, FIX(0.926112931));        /* c6 */
    z3 = MULTIPLY(tmp0 - tmp1, FIX(1.189712156));        /* c4 */
    dataptr[2] = (DCTELEM)
      DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(1.018300590)) /* c2+c8-c6 */
	      - MULTIPLY(tmp4, FIX(1.390975730)),        /* c4+c10 */
	      CONST_BITS-1);
    dataptr[4] = (DCTELEM)
      DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(0.062335650)) /* c4-c6-c10 */
	      - MULTIPLY(tmp2, FIX(1.356927976))         /* c2 */
	      + MULTIPLY(tmp4, FIX(0.587485545)),        /* c8 */
	      CONST_BITS-1);
    dataptr[6] = (DCTELEM)
      DESCALE(z1 + z3 - MULTIPLY(tmp0, FIX(1.620527200)) /* c2+c4-c6 */
	      - MULTIPLY(tmp2, FIX(0.788749120)),        /* c8+c10 */
	      CONST_BITS-1);

    /* Odd part */

    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.286413905));    /* c3 */
    tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.068791298));    /* c5 */
    tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.764581576));    /* c7 */
    tmp0 = tmp1 + tmp2 + tmp3 - MULTIPLY(tmp10, FIX(1.719967871)) /* c7+c5+c3-c1 */
	   + MULTIPLY(tmp14, FIX(0.398430003));          /* c9 */
    tmp4 = MULTIPLY(tmp11 + tmp12, - FIX(0.764581576));  /* -c7 */
    tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.399818907));  /* -c1 */
    tmp1 += tmp4 + tmp5 + MULTIPLY(tmp11, FIX(1.276416582)) /* c9+c7+c1-c3 */
	    - MULTIPLY(tmp14, FIX(1.068791298));         /* c5 */
    tmp10 = MULTIPLY(tmp12 + tmp13, FIX(0.398430003));   /* c9 */
    tmp2 += tmp4 + tmp10 - MULTIPLY(tmp12, FIX(1.989053629)) /* c9+c5+c3-c7 */
	    + MULTIPLY(tmp14, FIX(1.399818907));         /* c1 */
    tmp3 += tmp5 + tmp10 + MULTIPLY(tmp13, FIX(1.305598626)) /* c1+c5-c9-c7 */
	    - MULTIPLY(tmp14, FIX(1.286413905));         /* c3 */

    dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-1);
    dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-1);
    dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-1);
    dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS-1);

    ctr++;

    if (ctr != DCTSIZE) {
      if (ctr == 11)
	break;			/* Done. */
      dataptr += DCTSIZE;	/* advance pointer to next row */
    } else
      dataptr = workspace;	/* switch pointer to extended workspace */
  }

  /* Pass 2: process columns.
   * We leave the results scaled up by an overall factor of 8.
   * We must also scale the output by (8/11)**2 = 64/121, which we partially
   * fold into the constant multipliers and final/initial shifting:
   * cK now represents sqrt(2) * cos(K*pi/22) * 128/121.
   */

  dataptr = data;
  wsptr = workspace;
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
    /* Even part */

    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*2];
    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*1];
    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*0];
    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*7];
    tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*6];
    tmp5 = dataptr[DCTSIZE*5];

    tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*2];
    tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*1];
    tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*0];
    tmp13 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*7];
    tmp14 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*6];

    dataptr[DCTSIZE*0] = (DCTELEM)
      DESCALE(MULTIPLY(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5,
		       FIX(1.057851240)),                /* 128/121 */
	      CONST_BITS+2);
    tmp5 += tmp5;
    tmp0 -= tmp5;
    tmp1 -= tmp5;
    tmp2 -= tmp5;
    tmp3 -= tmp5;
    tmp4 -= tmp5;
    z1 = MULTIPLY(tmp0 + tmp3, FIX(1.435427942)) +       /* c2 */
	 MULTIPLY(tmp2 + tmp4, FIX(0.212906922));        /* c10 */
    z2 = MULTIPLY(tmp1 - tmp3, FIX(0.979689713));        /* c6 */
    z3 = MULTIPLY(tmp0 - tmp1, FIX(1.258538479));        /* c4 */
    dataptr[DCTSIZE*2] = (DCTELEM)
      DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(1.077210542)) /* c2+c8-c6 */
	      - MULTIPLY(tmp4, FIX(1.471445400)),        /* c4+c10 */
	      CONST_BITS+2);
    dataptr[DCTSIZE*4] = (DCTELEM)
      DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(0.065941844)) /* c4-c6-c10 */
	      - MULTIPLY(tmp2, FIX(1.435427942))         /* c2 */
	      + MULTIPLY(tmp4, FIX(0.621472312)),        /* c8 */
	      CONST_BITS+2);
    dataptr[DCTSIZE*6] = (DCTELEM)
      DESCALE(z1 + z3 - MULTIPLY(tmp0, FIX(1.714276708)) /* c2+c4-c6 */
	      - MULTIPLY(tmp2, FIX(0.834379234)),        /* c8+c10 */
	      CONST_BITS+2);

    /* Odd part */

    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.360834544));    /* c3 */
    tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.130622199));    /* c5 */
    tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.808813568));    /* c7 */
    tmp0 = tmp1 + tmp2 + tmp3 - MULTIPLY(tmp10, FIX(1.819470145)) /* c7+c5+c3-c1 */
	   + MULTIPLY(tmp14, FIX(0.421479672));          /* c9 */
    tmp4 = MULTIPLY(tmp11 + tmp12, - FIX(0.808813568));  /* -c7 */
    tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.480800167));  /* -c1 */
    tmp1 += tmp4 + tmp5 + MULTIPLY(tmp11, FIX(1.350258864)) /* c9+c7+c1-c3 */
	    - MULTIPLY(tmp14, FIX(1.130622199));         /* c5 */
    tmp10 = MULTIPLY(tmp12 + tmp13, FIX(0.421479672));   /* c9 */
    tmp2 += tmp4 + tmp10 - MULTIPLY(tmp12, FIX(2.104122847)) /* c9+c5+c3-c7 */
	    + MULTIPLY(tmp14, FIX(1.480800167));         /* c1 */
    tmp3 += tmp5 + tmp10 + MULTIPLY(tmp13, FIX(1.381129125)) /* c1+c5-c9-c7 */
	    - MULTIPLY(tmp14, FIX(1.360834544));         /* c3 */

    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+2);
    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+2);
    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+2);
    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+2);

    dataptr++;			/* advance pointer to next column */
    wsptr++;			/* advance pointer to next column */
  }
}


/*
 * Perform the forward DCT on a 12x12 sample block.
 */

GLOBAL(void)
jpeg_fdct_12x12 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
{
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
  DCTELEM workspace[8*4];
  DCTELEM *dataptr;
  DCTELEM *wsptr;
  JSAMPROW elemptr;
  int ctr;
  SHIFT_TEMPS

  /* Pass 1: process rows. */
  /* Note results are scaled up by sqrt(8) compared to a true DCT. */
  /* cK represents sqrt(2) * cos(K*pi/24). */

  dataptr = data;
  ctr = 0;
  for (;;) {
    elemptr = sample_data[ctr] + start_col;

    /* Even part */

    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]);
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]);
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]);
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]);
    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]);
    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]);

    tmp10 = tmp0 + tmp5;
    tmp13 = tmp0 - tmp5;
    tmp11 = tmp1 + tmp4;
    tmp14 = tmp1 - tmp4;
    tmp12 = tmp2 + tmp3;
    tmp15 = tmp2 - tmp3;

    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]);
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]);
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]);
    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]);
    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]);
    tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]);

    /* Apply unsigned->signed conversion */
    dataptr[0] = (DCTELEM) (tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE);
    dataptr[6] = (DCTELEM) (tmp13 - tmp14 - tmp15);
    dataptr[4] = (DCTELEM)
      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */
	      CONST_BITS);
    dataptr[2] = (DCTELEM)
      DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */
	      CONST_BITS);

    /* Odd part */

    tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100);    /* c9 */
    tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865);   /* c3-c9 */
    tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065);   /* c3+c9 */
    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054));   /* c5 */
    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669));   /* c7 */
    tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */
	    + MULTIPLY(tmp5, FIX(0.184591911));        /* c11 */
    tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */
    tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.339493912)) /* c1+c5-c11 */
	    + MULTIPLY(tmp5, FIX(0.860918669));        /* c7 */
    tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.725788011)) /* c1+c11-c7 */
	    - MULTIPLY(tmp5, FIX(1.121971054));        /* c5 */
    tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.306562965)) /* c3 */
	    - MULTIPLY(tmp2 + tmp5, FIX_0_541196100);  /* c9 */

    dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS);
    dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS);
    dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS);
    dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS);

    ctr++;

    if (ctr != DCTSIZE) {
      if (ctr == 12)
	break;			/* Done. */
      dataptr += DCTSIZE;	/* advance pointer to next row */
    } else
      dataptr = workspace;	/* switch pointer to extended workspace */
  }

  /* Pass 2: process columns.
   * We leave the results scaled up by an overall factor of 8.
   * We must also scale the output by (8/12)**2 = 4/9, which we partially
   * fold into the constant multipliers and final shifting:
   * cK now represents sqrt(2) * cos(K*pi/24) * 8/9.
   */

  dataptr = data;
  wsptr = workspace;
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
    /* Even part */

    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*3];
    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*2];
    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*1];
    tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*0];
    tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*7];
    tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*6];

    tmp10 = tmp0 + tmp5;
    tmp13 = tmp0 - tmp5;
    tmp11 = tmp1 + tmp4;
    tmp14 = tmp1 - tmp4;
    tmp12 = tmp2 + tmp3;
    tmp15 = tmp2 - tmp3;

    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*3];
    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*2];
    tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*1];
    tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*0];
    tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*7];
    tmp5 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*6];

    dataptr[DCTSIZE*0] = (DCTELEM)
      DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(0.888888889)), /* 8/9 */
	      CONST_BITS+1);
    dataptr[DCTSIZE*6] = (DCTELEM)
      DESCALE(MULTIPLY(tmp13 - tmp14 - tmp15, FIX(0.888888889)), /* 8/9 */
	      CONST_BITS+1);
    dataptr[DCTSIZE*4] = (DCTELEM)
      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.088662108)),         /* c4 */
	      CONST_BITS+1);
    dataptr[DCTSIZE*2] = (DCTELEM)
      DESCALE(MULTIPLY(tmp14 - tmp15, FIX(0.888888889)) +        /* 8/9 */
	      MULTIPLY(tmp13 + tmp15, FIX(1.214244803)),         /* c2 */
	      CONST_BITS+1);

    /* Odd part */

    tmp10 = MULTIPLY(tmp1 + tmp4, FIX(0.481063200));   /* c9 */
    tmp14 = tmp10 + MULTIPLY(tmp1, FIX(0.680326102));  /* c3-c9 */
    tmp15 = tmp10 - MULTIPLY(tmp4, FIX(1.642452502));  /* c3+c9 */
    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(0.997307603));   /* c5 */
    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.765261039));   /* c7 */
    tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.516244403)) /* c5+c7-c1 */
	    + MULTIPLY(tmp5, FIX(0.164081699));        /* c11 */
    tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.164081699)); /* -c11 */
    tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.079550144)) /* c1+c5-c11 */
	    + MULTIPLY(tmp5, FIX(0.765261039));        /* c7 */
    tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.645144899)) /* c1+c11-c7 */
	    - MULTIPLY(tmp5, FIX(0.997307603));        /* c5 */
    tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.161389302)) /* c3 */
	    - MULTIPLY(tmp2 + tmp5, FIX(0.481063200)); /* c9 */

    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+1);
    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+1);
    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+1);
    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+1);

    dataptr++;			/* advance pointer to next column */
    wsptr++;			/* advance pointer to next column */
  }
}


/*
 * Perform the forward DCT on a 13x13 sample block.
 */

GLOBAL(void)
jpeg_fdct_13x13 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
{
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
  INT32 z1, z2;
  DCTELEM workspace[8*5];
  DCTELEM *dataptr;
  DCTELEM *wsptr;
  JSAMPROW elemptr;
  int ctr;
  SHIFT_TEMPS

  /* Pass 1: process rows. */
  /* Note results are scaled up by sqrt(8) compared to a true DCT. */
  /* cK represents sqrt(2) * cos(K*pi/26). */

  dataptr = data;
  ctr = 0;
  for (;;) {
    elemptr = sample_data[ctr] + start_col;

    /* Even part */

    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[12]);
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[11]);
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[10]);
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[9]);
    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[8]);
    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[7]);
    tmp6 = GETJSAMPLE(elemptr[6]);

    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[12]);
    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[11]);
    tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[10]);
    tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[9]);
    tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[8]);
    tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[7]);

    /* Apply unsigned->signed conversion */
    dataptr[0] = (DCTELEM)
      (tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6 - 13 * CENTERJSAMPLE);
    tmp6 += tmp6;
    tmp0 -= tmp6;
    tmp1 -= tmp6;
    tmp2 -= tmp6;
    tmp3 -= tmp6;
    tmp4 -= tmp6;
    tmp5 -= tmp6;
    dataptr[2] = (DCTELEM)
      DESCALE(MULTIPLY(tmp0, FIX(1.373119086)) +   /* c2 */
	      MULTIPLY(tmp1, FIX(1.058554052)) +   /* c6 */
	      MULTIPLY(tmp2, FIX(0.501487041)) -   /* c10 */
	      MULTIPLY(tmp3, FIX(0.170464608)) -   /* c12 */
	      MULTIPLY(tmp4, FIX(0.803364869)) -   /* c8 */
	      MULTIPLY(tmp5, FIX(1.252223920)),    /* c4 */
	      CONST_BITS);
    z1 = MULTIPLY(tmp0 - tmp2, FIX(1.155388986)) - /* (c4+c6)/2 */
	 MULTIPLY(tmp3 - tmp4, FIX(0.435816023)) - /* (c2-c10)/2 */
	 MULTIPLY(tmp1 - tmp5, FIX(0.316450131));  /* (c8-c12)/2 */
    z2 = MULTIPLY(tmp0 + tmp2, FIX(0.096834934)) - /* (c4-c6)/2 */
	 MULTIPLY(tmp3 + tmp4, FIX(0.937303064)) + /* (c2+c10)/2 */
	 MULTIPLY(tmp1 + tmp5, FIX(0.486914739));  /* (c8+c12)/2 */

    dataptr[4] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS);
    dataptr[6] = (DCTELEM) DESCALE(z1 - z2, CONST_BITS);

    /* Odd part */

    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.322312651));   /* c3 */
    tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.163874945));   /* c5 */
    tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.937797057)) +  /* c7 */
	   MULTIPLY(tmp14 + tmp15, FIX(0.338443458));   /* c11 */
    tmp0 = tmp1 + tmp2 + tmp3 -
	   MULTIPLY(tmp10, FIX(2.020082300)) +          /* c3+c5+c7-c1 */
	   MULTIPLY(tmp14, FIX(0.318774355));           /* c9-c11 */
    tmp4 = MULTIPLY(tmp14 - tmp15, FIX(0.937797057)) -  /* c7 */
	   MULTIPLY(tmp11 + tmp12, FIX(0.338443458));   /* c11 */
    tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.163874945)); /* -c5 */
    tmp1 += tmp4 + tmp5 +
	    MULTIPLY(tmp11, FIX(0.837223564)) -         /* c5+c9+c11-c3 */
	    MULTIPLY(tmp14, FIX(2.341699410));          /* c1+c7 */
    tmp6 = MULTIPLY(tmp12 + tmp13, - FIX(0.657217813)); /* -c9 */
    tmp2 += tmp4 + tmp6 -
	    MULTIPLY(tmp12, FIX(1.572116027)) +         /* c1+c5-c9-c11 */
	    MULTIPLY(tmp15, FIX(2.260109708));          /* c3+c7 */
    tmp3 += tmp5 + tmp6 +
	    MULTIPLY(tmp13, FIX(2.205608352)) -         /* c3+c5+c9-c7 */
	    MULTIPLY(tmp15, FIX(1.742345811));          /* c1+c11 */

    dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS);
    dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS);
    dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS);
    dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS);

    ctr++;

    if (ctr != DCTSIZE) {
      if (ctr == 13)
	break;			/* Done. */
      dataptr += DCTSIZE;	/* advance pointer to next row */
    } else
      dataptr = workspace;	/* switch pointer to extended workspace */
  }

  /* Pass 2: process columns.
   * We leave the results scaled up by an overall factor of 8.
   * We must also scale the output by (8/13)**2 = 64/169, which we partially
   * fold into the constant multipliers and final shifting:
   * cK now represents sqrt(2) * cos(K*pi/26) * 128/169.
   */

  dataptr = data;
  wsptr = workspace;
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
    /* Even part */

    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*4];
    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*3];
    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*2];
    tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*1];
    tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*0];
    tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*7];
    tmp6 = dataptr[DCTSIZE*6];

    tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*4];
    tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*3];
    tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*2];
    tmp13 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*1];
    tmp14 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*0];
    tmp15 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*7];

    dataptr[DCTSIZE*0] = (DCTELEM)
      DESCALE(MULTIPLY(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6,
		       FIX(0.757396450)),          /* 128/169 */
	      CONST_BITS+1);
    tmp6 += tmp6;
    tmp0 -= tmp6;
    tmp1 -= tmp6;
    tmp2 -= tmp6;
    tmp3 -= tmp6;
    tmp4 -= tmp6;
    tmp5 -= tmp6;
    dataptr[DCTSIZE*2] = (DCTELEM)
      DESCALE(MULTIPLY(tmp0, FIX(1.039995521)) +   /* c2 */
	      MULTIPLY(tmp1, FIX(0.801745081)) +   /* c6 */
	      MULTIPLY(tmp2, FIX(0.379824504)) -   /* c10 */
	      MULTIPLY(tmp3, FIX(0.129109289)) -   /* c12 */
	      MULTIPLY(tmp4, FIX(0.608465700)) -   /* c8 */
	      MULTIPLY(tmp5, FIX(0.948429952)),    /* c4 */
	      CONST_BITS+1);
    z1 = MULTIPLY(tmp0 - tmp2, FIX(0.875087516)) - /* (c4+c6)/2 */
	 MULTIPLY(tmp3 - tmp4, FIX(0.330085509)) - /* (c2-c10)/2 */
	 MULTIPLY(tmp1 - tmp5, FIX(0.239678205));  /* (c8-c12)/2 */
    z2 = MULTIPLY(tmp0 + tmp2, FIX(0.073342435)) - /* (c4-c6)/2 */
	 MULTIPLY(tmp3 + tmp4, FIX(0.709910013)) + /* (c2+c10)/2 */
	 MULTIPLY(tmp1 + tmp5, FIX(0.368787494));  /* (c8+c12)/2 */

    dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+1);
    dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 - z2, CONST_BITS+1);

    /* Odd part */

    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.001514908));   /* c3 */
    tmp2 = MULTIPLY(tmp10 + tmp12, FIX(0.881514751));   /* c5 */
    tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.710284161)) +  /* c7 */
	   MULTIPLY(tmp14 + tmp15, FIX(0.256335874));   /* c11 */
    tmp0 = tmp1 + tmp2 + tmp3 -
	   MULTIPLY(tmp10, FIX(1.530003162)) +          /* c3+c5+c7-c1 */
	   MULTIPLY(tmp14, FIX(0.241438564));           /* c9-c11 */
    tmp4 = MULTIPLY(tmp14 - tmp15, FIX(0.710284161)) -  /* c7 */
	   MULTIPLY(tmp11 + tmp12, FIX(0.256335874));   /* c11 */
    tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(0.881514751)); /* -c5 */
    tmp1 += tmp4 + tmp5 +
	    MULTIPLY(tmp11, FIX(0.634110155)) -         /* c5+c9+c11-c3 */
	    MULTIPLY(tmp14, FIX(1.773594819));          /* c1+c7 */
    tmp6 = MULTIPLY(tmp12 + tmp13, - FIX(0.497774438)); /* -c9 */
    tmp2 += tmp4 + tmp6 -
	    MULTIPLY(tmp12, FIX(1.190715098)) +         /* c1+c5-c9-c11 */
	    MULTIPLY(tmp15, FIX(1.711799069));          /* c3+c7 */
    tmp3 += tmp5 + tmp6 +
	    MULTIPLY(tmp13, FIX(1.670519935)) -         /* c3+c5+c9-c7 */
	    MULTIPLY(tmp15, FIX(1.319646532));          /* c1+c11 */

    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+1);
    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+1);
    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+1);
    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+1);

    dataptr++;			/* advance pointer to next column */
    wsptr++;			/* advance pointer to next column */
  }
}


/*
 * Perform the forward DCT on a 14x14 sample block.
 */

GLOBAL(void)
jpeg_fdct_14x14 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
{
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
  DCTELEM workspace[8*6];
  DCTELEM *dataptr;
  DCTELEM *wsptr;
  JSAMPROW elemptr;
  int ctr;
  SHIFT_TEMPS

  /* Pass 1: process rows. */
  /* Note results are scaled up by sqrt(8) compared to a true DCT. */
  /* cK represents sqrt(2) * cos(K*pi/28). */

  dataptr = data;
  ctr = 0;
  for (;;) {
    elemptr = sample_data[ctr] + start_col;

    /* Even part */

    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]);
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]);
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]);
    tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]);
    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]);
    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]);
    tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]);

    tmp10 = tmp0 + tmp6;
    tmp14 = tmp0 - tmp6;
    tmp11 = tmp1 + tmp5;
    tmp15 = tmp1 - tmp5;
    tmp12 = tmp2 + tmp4;
    tmp16 = tmp2 - tmp4;

    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]);
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]);
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]);
    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]);
    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]);
    tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]);
    tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]);

    /* Apply unsigned->signed conversion */
    dataptr[0] = (DCTELEM)
      (tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE);
    tmp13 += tmp13;
    dataptr[4] = (DCTELEM)
      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */
	      MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */
	      MULTIPLY(tmp12 - tmp13, FIX(0.881747734)),  /* c8 */
	      CONST_BITS);

    tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686));    /* c6 */

    dataptr[2] = (DCTELEM)
      DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590))   /* c2-c6 */
	      + MULTIPLY(tmp16, FIX(0.613604268)),        /* c10 */
	      CONST_BITS);
    dataptr[6] = (DCTELEM)
      DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954))   /* c6+c10 */
	      - MULTIPLY(tmp16, FIX(1.378756276)),        /* c2 */
	      CONST_BITS);

    /* Odd part */

    tmp10 = tmp1 + tmp2;
    tmp11 = tmp5 - tmp4;
    dataptr[7] = (DCTELEM) (tmp0 - tmp10 + tmp3 - tmp11 - tmp6);
    tmp3 <<= CONST_BITS;
    tmp10 = MULTIPLY(tmp10, - FIX(0.158341681));          /* -c13 */
    tmp11 = MULTIPLY(tmp11, FIX(1.405321284));            /* c1 */
    tmp10 += tmp11 - tmp3;
    tmp11 = MULTIPLY(tmp0 + tmp2, FIX(1.197448846)) +     /* c5 */
	    MULTIPLY(tmp4 + tmp6, FIX(0.752406978));      /* c9 */
    dataptr[5] = (DCTELEM)
      DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(2.373959773)) /* c3+c5-c13 */
	      + MULTIPLY(tmp4, FIX(1.119999435)),         /* c1+c11-c9 */
	      CONST_BITS);
    tmp12 = MULTIPLY(tmp0 + tmp1, FIX(1.334852607)) +     /* c3 */
	    MULTIPLY(tmp5 - tmp6, FIX(0.467085129));      /* c11 */
    dataptr[3] = (DCTELEM)
      DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.424103948)) /* c3-c9-c13 */
	      - MULTIPLY(tmp5, FIX(3.069855259)),         /* c1+c5+c11 */
	      CONST_BITS);
    dataptr[1] = (DCTELEM)
      DESCALE(tmp11 + tmp12 + tmp3 + tmp6 -
	      MULTIPLY(tmp0 + tmp6, FIX(1.126980169)),    /* c3+c5-c1 */
	      CONST_BITS);

    ctr++;

    if (ctr != DCTSIZE) {
      if (ctr == 14)
	break;			/* Done. */
      dataptr += DCTSIZE;	/* advance pointer to next row */
    } else
      dataptr = workspace;	/* switch pointer to extended workspace */
  }

  /* Pass 2: process columns.
   * We leave the results scaled up by an overall factor of 8.
   * We must also scale the output by (8/14)**2 = 16/49, which we partially
   * fold into the constant multipliers and final shifting:
   * cK now represents sqrt(2) * cos(K*pi/28) * 32/49.
   */

  dataptr = data;
  wsptr = workspace;
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
    /* Even part */

    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*5];
    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*4];
    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*3];
    tmp13 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*2];
    tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*1];
    tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*0];
    tmp6 = dataptr[DCTSIZE*6] + dataptr[DCTSIZE*7];

    tmp10 = tmp0 + tmp6;
    tmp14 = tmp0 - tmp6;
    tmp11 = tmp1 + tmp5;
    tmp15 = tmp1 - tmp5;
    tmp12 = tmp2 + tmp4;
    tmp16 = tmp2 - tmp4;

    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*5];
    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*4];
    tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*3];
    tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*2];
    tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*1];
    tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*0];
    tmp6 = dataptr[DCTSIZE*6] - dataptr[DCTSIZE*7];

    dataptr[DCTSIZE*0] = (DCTELEM)
      DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12 + tmp13,
		       FIX(0.653061224)),                 /* 32/49 */
	      CONST_BITS+1);
    tmp13 += tmp13;
    dataptr[DCTSIZE*4] = (DCTELEM)
      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(0.832106052)) + /* c4 */
	      MULTIPLY(tmp11 - tmp13, FIX(0.205513223)) - /* c12 */
	      MULTIPLY(tmp12 - tmp13, FIX(0.575835255)),  /* c8 */
	      CONST_BITS+1);

    tmp10 = MULTIPLY(tmp14 + tmp15, FIX(0.722074570));    /* c6 */

    dataptr[DCTSIZE*2] = (DCTELEM)
      DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.178337691))   /* c2-c6 */
	      + MULTIPLY(tmp16, FIX(0.400721155)),        /* c10 */
	      CONST_BITS+1);
    dataptr[DCTSIZE*6] = (DCTELEM)
      DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.122795725))   /* c6+c10 */
	      - MULTIPLY(tmp16, FIX(0.900412262)),        /* c2 */
	      CONST_BITS+1);

    /* Odd part */

    tmp10 = tmp1 + tmp2;
    tmp11 = tmp5 - tmp4;
    dataptr[DCTSIZE*7] = (DCTELEM)
      DESCALE(MULTIPLY(tmp0 - tmp10 + tmp3 - tmp11 - tmp6,
		       FIX(0.653061224)),                 /* 32/49 */
	      CONST_BITS+1);
    tmp3  = MULTIPLY(tmp3 , FIX(0.653061224));            /* 32/49 */
    tmp10 = MULTIPLY(tmp10, - FIX(0.103406812));          /* -c13 */
    tmp11 = MULTIPLY(tmp11, FIX(0.917760839));            /* c1 */
    tmp10 += tmp11 - tmp3;
    tmp11 = MULTIPLY(tmp0 + tmp2, FIX(0.782007410)) +     /* c5 */
	    MULTIPLY(tmp4 + tmp6, FIX(0.491367823));      /* c9 */
    dataptr[DCTSIZE*5] = (DCTELEM)
      DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(1.550341076)) /* c3+c5-c13 */
	      + MULTIPLY(tmp4, FIX(0.731428202)),         /* c1+c11-c9 */
	      CONST_BITS+1);
    tmp12 = MULTIPLY(tmp0 + tmp1, FIX(0.871740478)) +     /* c3 */
	    MULTIPLY(tmp5 - tmp6, FIX(0.305035186));      /* c11 */
    dataptr[DCTSIZE*3] = (DCTELEM)
      DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.276965844)) /* c3-c9-c13 */
	      - MULTIPLY(tmp5, FIX(2.004803435)),         /* c1+c5+c11 */
	      CONST_BITS+1);
    dataptr[DCTSIZE*1] = (DCTELEM)
      DESCALE(tmp11 + tmp12 + tmp3
	      - MULTIPLY(tmp0, FIX(0.735987049))          /* c3+c5-c1 */
	      - MULTIPLY(tmp6, FIX(0.082925825)),         /* c9-c11-c13 */
	      CONST_BITS+1);

    dataptr++;			/* advance pointer to next column */
    wsptr++;			/* advance pointer to next column */
  }
}


/*
 * Perform the forward DCT on a 15x15 sample block.
 */

GLOBAL(void)
jpeg_fdct_15x15 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
{
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
  INT32 z1, z2, z3;
  DCTELEM workspace[8*7];
  DCTELEM *dataptr;
  DCTELEM *wsptr;
  JSAMPROW elemptr;
  int ctr;
  SHIFT_TEMPS

  /* Pass 1: process rows. */
  /* Note results are scaled up by sqrt(8) compared to a true DCT. */
  /* cK represents sqrt(2) * cos(K*pi/30). */

  dataptr = data;
  ctr = 0;
  for (;;) {
    elemptr = sample_data[ctr] + start_col;

    /* Even part */

    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[14]);
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[13]);
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[12]);
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[11]);
    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[10]);
    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[9]);
    tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[8]);
    tmp7 = GETJSAMPLE(elemptr[7]);

    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[14]);
    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[13]);
    tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[12]);
    tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[11]);
    tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[10]);
    tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[9]);
    tmp16 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[8]);

    z1 = tmp0 + tmp4 + tmp5;
    z2 = tmp1 + tmp3 + tmp6;
    z3 = tmp2 + tmp7;
    /* Apply unsigned->signed conversion */
    dataptr[0] = (DCTELEM) (z1 + z2 + z3 - 15 * CENTERJSAMPLE);
    z3 += z3;
    dataptr[6] = (DCTELEM)
      DESCALE(MULTIPLY(z1 - z3, FIX(1.144122806)) - /* c6 */
	      MULTIPLY(z2 - z3, FIX(0.437016024)),  /* c12 */
	      CONST_BITS);
    tmp2 += ((tmp1 + tmp4) >> 1) - tmp7 - tmp7;
    z1 = MULTIPLY(tmp3 - tmp2, FIX(1.531135173)) -  /* c2+c14 */
         MULTIPLY(tmp6 - tmp2, FIX(2.238241955));   /* c4+c8 */
    z2 = MULTIPLY(tmp5 - tmp2, FIX(0.798468008)) -  /* c8-c14 */
	 MULTIPLY(tmp0 - tmp2, FIX(0.091361227));   /* c2-c4 */
    z3 = MULTIPLY(tmp0 - tmp3, FIX(1.383309603)) +  /* c2 */
	 MULTIPLY(tmp6 - tmp5, FIX(0.946293579)) +  /* c8 */
	 MULTIPLY(tmp1 - tmp4, FIX(0.790569415));   /* (c6+c12)/2 */

    dataptr[2] = (DCTELEM) DESCALE(z1 + z3, CONST_BITS);
    dataptr[4] = (DCTELEM) DESCALE(z2 + z3, CONST_BITS);

    /* Odd part */

    tmp2 = MULTIPLY(tmp10 - tmp12 - tmp13 + tmp15 + tmp16,
		    FIX(1.224744871));                         /* c5 */
    tmp1 = MULTIPLY(tmp10 - tmp14 - tmp15, FIX(1.344997024)) + /* c3 */
	   MULTIPLY(tmp11 - tmp13 - tmp16, FIX(0.831253876));  /* c9 */
    tmp12 = MULTIPLY(tmp12, FIX(1.224744871));                 /* c5 */
    tmp4 = MULTIPLY(tmp10 - tmp16, FIX(1.406466353)) +         /* c1 */
	   MULTIPLY(tmp11 + tmp14, FIX(1.344997024)) +         /* c3 */
	   MULTIPLY(tmp13 + tmp15, FIX(0.575212477));          /* c11 */
    tmp0 = MULTIPLY(tmp13, FIX(0.475753014)) -                 /* c7-c11 */
	   MULTIPLY(tmp14, FIX(0.513743148)) +                 /* c3-c9 */
	   MULTIPLY(tmp16, FIX(1.700497885)) + tmp4 + tmp12;   /* c1+c13 */
    tmp3 = MULTIPLY(tmp10, - FIX(0.355500862)) -               /* -(c1-c7) */
	   MULTIPLY(tmp11, FIX(2.176250899)) -                 /* c3+c9 */
	   MULTIPLY(tmp15, FIX(0.869244010)) + tmp4 - tmp12;   /* c11+c13 */

    dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS);
    dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS);
    dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS);
    dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS);

    ctr++;

    if (ctr != DCTSIZE) {
      if (ctr == 15)
	break;			/* Done. */
      dataptr += DCTSIZE;	/* advance pointer to next row */
    } else
      dataptr = workspace;	/* switch pointer to extended workspace */
  }

  /* Pass 2: process columns.
   * We leave the results scaled up by an overall factor of 8.
   * We must also scale the output by (8/15)**2 = 64/225, which we partially
   * fold into the constant multipliers and final shifting:
   * cK now represents sqrt(2) * cos(K*pi/30) * 256/225.
   */

  dataptr = data;
  wsptr = workspace;
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
    /* Even part */

    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*6];
    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*5];
    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*4];
    tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*3];
    tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*2];
    tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*1];
    tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*0];
    tmp7 = dataptr[DCTSIZE*7];

    tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*6];
    tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*5];
    tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*4];
    tmp13 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*3];
    tmp14 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*2];
    tmp15 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*1];
    tmp16 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*0];

    z1 = tmp0 + tmp4 + tmp5;
    z2 = tmp1 + tmp3 + tmp6;
    z3 = tmp2 + tmp7;
    dataptr[DCTSIZE*0] = (DCTELEM)
      DESCALE(MULTIPLY(z1 + z2 + z3, FIX(1.137777778)), /* 256/225 */
	      CONST_BITS+2);
    z3 += z3;
    dataptr[DCTSIZE*6] = (DCTELEM)
      DESCALE(MULTIPLY(z1 - z3, FIX(1.301757503)) - /* c6 */
	      MULTIPLY(z2 - z3, FIX(0.497227121)),  /* c12 */
	      CONST_BITS+2);
    tmp2 += ((tmp1 + tmp4) >> 1) - tmp7 - tmp7;
    z1 = MULTIPLY(tmp3 - tmp2, FIX(1.742091575)) -  /* c2+c14 */
         MULTIPLY(tmp6 - tmp2, FIX(2.546621957));   /* c4+c8 */
    z2 = MULTIPLY(tmp5 - tmp2, FIX(0.908479156)) -  /* c8-c14 */
	 MULTIPLY(tmp0 - tmp2, FIX(0.103948774));   /* c2-c4 */
    z3 = MULTIPLY(tmp0 - tmp3, FIX(1.573898926)) +  /* c2 */
	 MULTIPLY(tmp6 - tmp5, FIX(1.076671805)) +  /* c8 */
	 MULTIPLY(tmp1 - tmp4, FIX(0.899492312));   /* (c6+c12)/2 */

    dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z3, CONST_BITS+2);
    dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(z2 + z3, CONST_BITS+2);

    /* Odd part */

    tmp2 = MULTIPLY(tmp10 - tmp12 - tmp13 + tmp15 + tmp16,
		    FIX(1.393487498));                         /* c5 */
    tmp1 = MULTIPLY(tmp10 - tmp14 - tmp15, FIX(1.530307725)) + /* c3 */
	   MULTIPLY(tmp11 - tmp13 - tmp16, FIX(0.945782187));  /* c9 */
    tmp12 = MULTIPLY(tmp12, FIX(1.393487498));                 /* c5 */
    tmp4 = MULTIPLY(tmp10 - tmp16, FIX(1.600246161)) +         /* c1 */
	   MULTIPLY(tmp11 + tmp14, FIX(1.530307725)) +         /* c3 */
	   MULTIPLY(tmp13 + tmp15, FIX(0.654463974));          /* c11 */
    tmp0 = MULTIPLY(tmp13, FIX(0.541301207)) -                 /* c7-c11 */
	   MULTIPLY(tmp14, FIX(0.584525538)) +                 /* c3-c9 */
	   MULTIPLY(tmp16, FIX(1.934788705)) + tmp4 + tmp12;   /* c1+c13 */
    tmp3 = MULTIPLY(tmp10, - FIX(0.404480980)) -               /* -(c1-c7) */
	   MULTIPLY(tmp11, FIX(2.476089912)) -                 /* c3+c9 */
	   MULTIPLY(tmp15, FIX(0.989006518)) + tmp4 - tmp12;   /* c11+c13 */

    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+2);
    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+2);
    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+2);
    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+2);

    dataptr++;			/* advance pointer to next column */
    wsptr++;			/* advance pointer to next column */
  }
}


/*
 * Perform the forward DCT on a 16x16 sample block.
 */

GLOBAL(void)
jpeg_fdct_16x16 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
{
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
  DCTELEM workspace[DCTSIZE2];
  DCTELEM *dataptr;
  DCTELEM *wsptr;
  JSAMPROW elemptr;
  int ctr;
  SHIFT_TEMPS

  /* Pass 1: process rows. */
  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  /* furthermore, we scale the results by 2**PASS1_BITS. */
  /* cK represents sqrt(2) * cos(K*pi/32). */

  dataptr = data;
  ctr = 0;
  for (;;) {
    elemptr = sample_data[ctr] + start_col;

    /* Even part */

    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]);
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]);
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]);
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]);
    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]);
    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]);
    tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]);
    tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]);

    tmp10 = tmp0 + tmp7;
    tmp14 = tmp0 - tmp7;
    tmp11 = tmp1 + tmp6;
    tmp15 = tmp1 - tmp6;
    tmp12 = tmp2 + tmp5;
    tmp16 = tmp2 - tmp5;
    tmp13 = tmp3 + tmp4;
    tmp17 = tmp3 - tmp4;

    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]);
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]);
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]);
    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]);
    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]);
    tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]);
    tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]);
    tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]);

    /* Apply unsigned->signed conversion */
    dataptr[0] = (DCTELEM)
      ((tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE) << PASS1_BITS);
    dataptr[4] = (DCTELEM)
      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
	      MULTIPLY(tmp11 - tmp12, FIX_0_541196100),   /* c12[16] = c6[8] */
	      CONST_BITS-PASS1_BITS);

    tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) +   /* c14[16] = c7[8] */
	    MULTIPLY(tmp14 - tmp16, FIX(1.387039845));    /* c2[16] = c1[8] */

    dataptr[2] = (DCTELEM)
      DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982))   /* c6+c14 */
	      + MULTIPLY(tmp16, FIX(2.172734804)),        /* c2+c10 */
	      CONST_BITS-PASS1_BITS);
    dataptr[6] = (DCTELEM)
      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243))   /* c2-c6 */
	      - MULTIPLY(tmp17, FIX(1.061594338)),        /* c10+c14 */
	      CONST_BITS-PASS1_BITS);

    /* Odd part */

    tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) +         /* c3 */
	    MULTIPLY(tmp6 - tmp7, FIX(0.410524528));          /* c13 */
    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) +         /* c5 */
	    MULTIPLY(tmp5 + tmp7, FIX(0.666655658));          /* c11 */
    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) +         /* c7 */
	    MULTIPLY(tmp4 - tmp7, FIX(0.897167586));          /* c9 */
    tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) +         /* c15 */
	    MULTIPLY(tmp6 - tmp5, FIX(1.407403738));          /* c1 */
    tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) +       /* -c11 */
	    MULTIPLY(tmp4 + tmp6, - FIX(1.247225013));        /* -c5 */
    tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) +       /* -c3 */
	    MULTIPLY(tmp5 - tmp4, FIX(0.410524528));          /* c13 */
    tmp10 = tmp11 + tmp12 + tmp13 -
	    MULTIPLY(tmp0, FIX(2.286341144)) +                /* c7+c5+c3-c1 */
	    MULTIPLY(tmp7, FIX(0.779653625));                 /* c15+c13-c11+c9 */
    tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
	     - MULTIPLY(tmp6, FIX(1.663905119));              /* c7+c13+c1-c5 */
    tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
	     + MULTIPLY(tmp5, FIX(1.227391138));              /* c9-c11+c1-c13 */
    tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
	     + MULTIPLY(tmp4, FIX(2.167985692));              /* c1+c13+c5-c9 */

    dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
    dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
    dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
    dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);

    ctr++;

    if (ctr != DCTSIZE) {
      if (ctr == DCTSIZE * 2)
	break;			/* Done. */
      dataptr += DCTSIZE;	/* advance pointer to next row */
    } else
      dataptr = workspace;	/* switch pointer to extended workspace */
  }

  /* Pass 2: process columns.
   * We remove the PASS1_BITS scaling, but leave the results scaled up
   * by an overall factor of 8.
   * We must also scale the output by (8/16)**2 = 1/2**2.
   */

  dataptr = data;
  wsptr = workspace;
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
    /* Even part */

    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*7];
    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*6];
    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*5];
    tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*4];
    tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*3];
    tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*2];
    tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*1];
    tmp7 = dataptr[DCTSIZE*7] + wsptr[DCTSIZE*0];

    tmp10 = tmp0 + tmp7;
    tmp14 = tmp0 - tmp7;
    tmp11 = tmp1 + tmp6;
    tmp15 = tmp1 - tmp6;
    tmp12 = tmp2 + tmp5;
    tmp16 = tmp2 - tmp5;
    tmp13 = tmp3 + tmp4;
    tmp17 = tmp3 - tmp4;

    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*7];
    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*6];
    tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*5];
    tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*4];
    tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*3];
    tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*2];
    tmp6 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*1];
    tmp7 = dataptr[DCTSIZE*7] - wsptr[DCTSIZE*0];

    dataptr[DCTSIZE*0] = (DCTELEM)
      DESCALE(tmp10 + tmp11 + tmp12 + tmp13, PASS1_BITS+2);
    dataptr[DCTSIZE*4] = (DCTELEM)
      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
	      MULTIPLY(tmp11 - tmp12, FIX_0_541196100),   /* c12[16] = c6[8] */
	      CONST_BITS+PASS1_BITS+2);

    tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) +   /* c14[16] = c7[8] */
	    MULTIPLY(tmp14 - tmp16, FIX(1.387039845));    /* c2[16] = c1[8] */

    dataptr[DCTSIZE*2] = (DCTELEM)
      DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982))   /* c6+c14 */
	      + MULTIPLY(tmp16, FIX(2.172734804)),        /* c2+10 */
	      CONST_BITS+PASS1_BITS+2);
    dataptr[DCTSIZE*6] = (DCTELEM)
      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243))   /* c2-c6 */
	      - MULTIPLY(tmp17, FIX(1.061594338)),        /* c10+c14 */
	      CONST_BITS+PASS1_BITS+2);

    /* Odd part */

    tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) +         /* c3 */
	    MULTIPLY(tmp6 - tmp7, FIX(0.410524528));          /* c13 */
    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) +         /* c5 */
	    MULTIPLY(tmp5 + tmp7, FIX(0.666655658));          /* c11 */
    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) +         /* c7 */
	    MULTIPLY(tmp4 - tmp7, FIX(0.897167586));          /* c9 */
    tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) +         /* c15 */
	    MULTIPLY(tmp6 - tmp5, FIX(1.407403738));          /* c1 */
    tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) +       /* -c11 */
	    MULTIPLY(tmp4 + tmp6, - FIX(1.247225013));        /* -c5 */
    tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) +       /* -c3 */
	    MULTIPLY(tmp5 - tmp4, FIX(0.410524528));          /* c13 */
    tmp10 = tmp11 + tmp12 + tmp13 -
	    MULTIPLY(tmp0, FIX(2.286341144)) +                /* c7+c5+c3-c1 */
	    MULTIPLY(tmp7, FIX(0.779653625));                 /* c15+c13-c11+c9 */
    tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
	     - MULTIPLY(tmp6, FIX(1.663905119));              /* c7+c13+c1-c5 */
    tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
	     + MULTIPLY(tmp5, FIX(1.227391138));              /* c9-c11+c1-c13 */
    tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
	     + MULTIPLY(tmp4, FIX(2.167985692));              /* c1+c13+c5-c9 */

    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS+2);
    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS+2);
    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS+2);
    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS+2);

    dataptr++;			/* advance pointer to next column */
    wsptr++;			/* advance pointer to next column */
  }
}


/*
 * Perform the forward DCT on a 16x8 sample block.
 *
 * 16-point FDCT in pass 1 (rows), 8-point in pass 2 (columns).
 */

GLOBAL(void)
jpeg_fdct_16x8 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
{
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
  INT32 z1;
  DCTELEM *dataptr;
  JSAMPROW elemptr;
  int ctr;
  SHIFT_TEMPS

  /* Pass 1: process rows. */
  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  /* furthermore, we scale the results by 2**PASS1_BITS. */
  /* 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32). */

  dataptr = data;
  ctr = 0;
  for (ctr = 0; ctr < DCTSIZE; ctr++) {
    elemptr = sample_data[ctr] + start_col;

    /* Even part */

    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]);
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]);
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]);
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]);
    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]);
    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]);
    tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]);
    tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]);

    tmp10 = tmp0 + tmp7;
    tmp14 = tmp0 - tmp7;
    tmp11 = tmp1 + tmp6;
    tmp15 = tmp1 - tmp6;
    tmp12 = tmp2 + tmp5;
    tmp16 = tmp2 - tmp5;
    tmp13 = tmp3 + tmp4;
    tmp17 = tmp3 - tmp4;

    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]);
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]);
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]);
    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]);
    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]);
    tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]);
    tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]);
    tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]);

    /* Apply unsigned->signed conversion */
    dataptr[0] = (DCTELEM)
      ((tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE) << PASS1_BITS);
    dataptr[4] = (DCTELEM)
      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
	      MULTIPLY(tmp11 - tmp12, FIX_0_541196100),   /* c12[16] = c6[8] */
	      CONST_BITS-PASS1_BITS);

    tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) +   /* c14[16] = c7[8] */
	    MULTIPLY(tmp14 - tmp16, FIX(1.387039845));    /* c2[16] = c1[8] */

    dataptr[2] = (DCTELEM)
      DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982))   /* c6+c14 */
	      + MULTIPLY(tmp16, FIX(2.172734804)),        /* c2+c10 */
	      CONST_BITS-PASS1_BITS);
    dataptr[6] = (DCTELEM)
      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243))   /* c2-c6 */
	      - MULTIPLY(tmp17, FIX(1.061594338)),        /* c10+c14 */
	      CONST_BITS-PASS1_BITS);

    /* Odd part */

    tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) +         /* c3 */
	    MULTIPLY(tmp6 - tmp7, FIX(0.410524528));          /* c13 */
    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) +         /* c5 */
	    MULTIPLY(tmp5 + tmp7, FIX(0.666655658));          /* c11 */
    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) +         /* c7 */
	    MULTIPLY(tmp4 - tmp7, FIX(0.897167586));          /* c9 */
    tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) +         /* c15 */
	    MULTIPLY(tmp6 - tmp5, FIX(1.407403738));          /* c1 */
    tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) +       /* -c11 */
	    MULTIPLY(tmp4 + tmp6, - FIX(1.247225013));        /* -c5 */
    tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) +       /* -c3 */
	    MULTIPLY(tmp5 - tmp4, FIX(0.410524528));          /* c13 */
    tmp10 = tmp11 + tmp12 + tmp13 -
	    MULTIPLY(tmp0, FIX(2.286341144)) +                /* c7+c5+c3-c1 */
	    MULTIPLY(tmp7, FIX(0.779653625));                 /* c15+c13-c11+c9 */
    tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
	     - MULTIPLY(tmp6, FIX(1.663905119));              /* c7+c13+c1-c5 */
    tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
	     + MULTIPLY(tmp5, FIX(1.227391138));              /* c9-c11+c1-c13 */
    tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
	     + MULTIPLY(tmp4, FIX(2.167985692));              /* c1+c13+c5-c9 */

    dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
    dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
    dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
    dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);

    dataptr += DCTSIZE;		/* advance pointer to next row */
  }

  /* Pass 2: process columns.
   * We remove the PASS1_BITS scaling, but leave the results scaled up
   * by an overall factor of 8.
   * We must also scale the output by 8/16 = 1/2.
   */

  dataptr = data;
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
    /* Even part per LL&M figure 1 --- note that published figure is faulty;
     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
     */

    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];

    tmp10 = tmp0 + tmp3;
    tmp12 = tmp0 - tmp3;
    tmp11 = tmp1 + tmp2;
    tmp13 = tmp1 - tmp2;

    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
    tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];

    dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS+1);
    dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS+1);

    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
    dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865),
					   CONST_BITS+PASS1_BITS+1);
    dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065),
					   CONST_BITS+PASS1_BITS+1);

    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
     * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
     * i0..i3 in the paper are tmp0..tmp3 here.
     */

    tmp10 = tmp0 + tmp3;
    tmp11 = tmp1 + tmp2;
    tmp12 = tmp0 + tmp2;
    tmp13 = tmp1 + tmp3;
    z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /*  c3 */

    tmp0  = MULTIPLY(tmp0,    FIX_1_501321110);    /*  c1+c3-c5-c7 */
    tmp1  = MULTIPLY(tmp1,    FIX_3_072711026);    /*  c1+c3+c5-c7 */
    tmp2  = MULTIPLY(tmp2,    FIX_2_053119869);    /*  c1+c3-c5+c7 */
    tmp3  = MULTIPLY(tmp3,    FIX_0_298631336);    /* -c1+c3+c5-c7 */
    tmp10 = MULTIPLY(tmp10, - FIX_0_899976223);    /*  c7-c3 */
    tmp11 = MULTIPLY(tmp11, - FIX_2_562915447);    /* -c1-c3 */
    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);    /*  c5-c3 */
    tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);    /* -c3-c5 */

    tmp12 += z1;
    tmp13 += z1;

    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0 + tmp10 + tmp12,
					   CONST_BITS+PASS1_BITS+1);
    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1 + tmp11 + tmp13,
					   CONST_BITS+PASS1_BITS+1);
    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2 + tmp11 + tmp12,
					   CONST_BITS+PASS1_BITS+1);
    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3 + tmp10 + tmp13,
					   CONST_BITS+PASS1_BITS+1);

    dataptr++;			/* advance pointer to next column */
  }
}


/*
 * Perform the forward DCT on a 14x7 sample block.
 *
 * 14-point FDCT in pass 1 (rows), 7-point in pass 2 (columns).
 */

GLOBAL(void)
jpeg_fdct_14x7 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
{
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
  INT32 z1, z2, z3;
  DCTELEM *dataptr;
  JSAMPROW elemptr;
  int ctr;
  SHIFT_TEMPS

  /* Zero bottom row of output coefficient block. */
  MEMZERO(&data[DCTSIZE*7], SIZEOF(DCTELEM) * DCTSIZE);

  /* Pass 1: process rows. */
  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  /* furthermore, we scale the results by 2**PASS1_BITS. */
  /* 14-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/28). */

  dataptr = data;
  for (ctr = 0; ctr < 7; ctr++) {
    elemptr = sample_data[ctr] + start_col;

    /* Even part */

    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]);
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]);
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]);
    tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]);
    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]);
    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]);
    tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]);

    tmp10 = tmp0 + tmp6;
    tmp14 = tmp0 - tmp6;
    tmp11 = tmp1 + tmp5;
    tmp15 = tmp1 - tmp5;
    tmp12 = tmp2 + tmp4;
    tmp16 = tmp2 - tmp4;

    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]);
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]);
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]);
    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]);
    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]);
    tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]);
    tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]);

    /* Apply unsigned->signed conversion */
    dataptr[0] = (DCTELEM)
      ((tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE) << PASS1_BITS);
    tmp13 += tmp13;
    dataptr[4] = (DCTELEM)
      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */
	      MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */
	      MULTIPLY(tmp12 - tmp13, FIX(0.881747734)),  /* c8 */
	      CONST_BITS-PASS1_BITS);

    tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686));    /* c6 */

    dataptr[2] = (DCTELEM)
      DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590))   /* c2-c6 */
	      + MULTIPLY(tmp16, FIX(0.613604268)),        /* c10 */
	      CONST_BITS-PASS1_BITS);
    dataptr[6] = (DCTELEM)
      DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954))   /* c6+c10 */
	      - MULTIPLY(tmp16, FIX(1.378756276)),        /* c2 */
	      CONST_BITS-PASS1_BITS);

    /* Odd part */

    tmp10 = tmp1 + tmp2;
    tmp11 = tmp5 - tmp4;
    dataptr[7] = (DCTELEM) ((tmp0 - tmp10 + tmp3 - tmp11 - tmp6) << PASS1_BITS);
    tmp3 <<= CONST_BITS;
    tmp10 = MULTIPLY(tmp10, - FIX(0.158341681));          /* -c13 */
    tmp11 = MULTIPLY(tmp11, FIX(1.405321284));            /* c1 */
    tmp10 += tmp11 - tmp3;
    tmp11 = MULTIPLY(tmp0 + tmp2, FIX(1.197448846)) +     /* c5 */
	    MULTIPLY(tmp4 + tmp6, FIX(0.752406978));      /* c9 */
    dataptr[5] = (DCTELEM)
      DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(2.373959773)) /* c3+c5-c13 */
	      + MULTIPLY(tmp4, FIX(1.119999435)),         /* c1+c11-c9 */
	      CONST_BITS-PASS1_BITS);
    tmp12 = MULTIPLY(tmp0 + tmp1, FIX(1.334852607)) +     /* c3 */
	    MULTIPLY(tmp5 - tmp6, FIX(0.467085129));      /* c11 */
    dataptr[3] = (DCTELEM)
      DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.424103948)) /* c3-c9-c13 */
	      - MULTIPLY(tmp5, FIX(3.069855259)),         /* c1+c5+c11 */
	      CONST_BITS-PASS1_BITS);
    dataptr[1] = (DCTELEM)
      DESCALE(tmp11 + tmp12 + tmp3 + tmp6 -
	      MULTIPLY(tmp0 + tmp6, FIX(1.126980169)),    /* c3+c5-c1 */
	      CONST_BITS-PASS1_BITS);

    dataptr += DCTSIZE;		/* advance pointer to next row */
  }

  /* Pass 2: process columns.
   * We remove the PASS1_BITS scaling, but leave the results scaled up
   * by an overall factor of 8.
   * We must also scale the output by (8/14)*(8/7) = 32/49, which we
   * partially fold into the constant multipliers and final shifting:
   * 7-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/14) * 64/49.
   */

  dataptr = data;
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
    /* Even part */

    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*6];
    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*5];
    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*4];
    tmp3 = dataptr[DCTSIZE*3];

    tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*6];
    tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*5];
    tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*4];

    z1 = tmp0 + tmp2;
    dataptr[DCTSIZE*0] = (DCTELEM)
      DESCALE(MULTIPLY(z1 + tmp1 + tmp3, FIX(1.306122449)), /* 64/49 */
	      CONST_BITS+PASS1_BITS+1);
    tmp3 += tmp3;
    z1 -= tmp3;
    z1 -= tmp3;
    z1 = MULTIPLY(z1, FIX(0.461784020));                /* (c2+c6-c4)/2 */
    z2 = MULTIPLY(tmp0 - tmp2, FIX(1.202428084));       /* (c2+c4-c6)/2 */
    z3 = MULTIPLY(tmp1 - tmp2, FIX(0.411026446));       /* c6 */
    dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS+PASS1_BITS+1);
    z1 -= z2;
    z2 = MULTIPLY(tmp0 - tmp1, FIX(1.151670509));       /* c4 */
    dataptr[DCTSIZE*4] = (DCTELEM)
      DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.923568041)), /* c2+c6-c4 */
	      CONST_BITS+PASS1_BITS+1);
    dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+PASS1_BITS+1);

    /* Odd part */

    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.221765677));   /* (c3+c1-c5)/2 */
    tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.222383464));   /* (c3+c5-c1)/2 */
    tmp0 = tmp1 - tmp2;
    tmp1 += tmp2;
    tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.800824523)); /* -c1 */
    tmp1 += tmp2;
    tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.801442310));   /* c5 */
    tmp0 += tmp3;
    tmp2 += tmp3 + MULTIPLY(tmp12, FIX(2.443531355));   /* c3+c1-c5 */

    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS1_BITS+1);
    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS1_BITS+1);
    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS1_BITS+1);

    dataptr++;			/* advance pointer to next column */
  }
}


/*
 * Perform the forward DCT on a 12x6 sample block.
 *
 * 12-point FDCT in pass 1 (rows), 6-point in pass 2 (columns).
 */

GLOBAL(void)
jpeg_fdct_12x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
{
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
  DCTELEM *dataptr;
  JSAMPROW elemptr;
  int ctr;
  SHIFT_TEMPS

  /* Zero 2 bottom rows of output coefficient block. */
  MEMZERO(&data[DCTSIZE*6], SIZEOF(DCTELEM) * DCTSIZE * 2);

  /* Pass 1: process rows. */
  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  /* furthermore, we scale the results by 2**PASS1_BITS. */
  /* 12-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/24). */

  dataptr = data;
  for (ctr = 0; ctr < 6; ctr++) {
    elemptr = sample_data[ctr] + start_col;

    /* Even part */

    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]);
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]);
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]);
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]);
    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]);
    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]);

    tmp10 = tmp0 + tmp5;
    tmp13 = tmp0 - tmp5;
    tmp11 = tmp1 + tmp4;
    tmp14 = tmp1 - tmp4;
    tmp12 = tmp2 + tmp3;
    tmp15 = tmp2 - tmp3;

    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]);
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]);
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]);
    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]);
    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]);
    tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]);

    /* Apply unsigned->signed conversion */
    dataptr[0] = (DCTELEM)
      ((tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE) << PASS1_BITS);
    dataptr[6] = (DCTELEM) ((tmp13 - tmp14 - tmp15) << PASS1_BITS);
    dataptr[4] = (DCTELEM)
      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */
	      CONST_BITS-PASS1_BITS);
    dataptr[2] = (DCTELEM)
      DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */
	      CONST_BITS-PASS1_BITS);

    /* Odd part */

    tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100);    /* c9 */
    tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865);   /* c3-c9 */
    tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065);   /* c3+c9 */
    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054));   /* c5 */
    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669));   /* c7 */
    tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */
	    + MULTIPLY(tmp5, FIX(0.184591911));        /* c11 */
    tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */
    tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.339493912)) /* c1+c5-c11 */
	    + MULTIPLY(tmp5, FIX(0.860918669));        /* c7 */
    tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.725788011)) /* c1+c11-c7 */
	    - MULTIPLY(tmp5, FIX(1.121971054));        /* c5 */
    tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.306562965)) /* c3 */
	    - MULTIPLY(tmp2 + tmp5, FIX_0_541196100);  /* c9 */

    dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
    dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
    dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
    dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);

    dataptr += DCTSIZE;		/* advance pointer to next row */
  }

  /* Pass 2: process columns.
   * We remove the PASS1_BITS scaling, but leave the results scaled up
   * by an overall factor of 8.
   * We must also scale the output by (8/12)*(8/6) = 8/9, which we
   * partially fold into the constant multipliers and final shifting:
   * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12) * 16/9.
   */

  dataptr = data;
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
    /* Even part */

    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5];
    tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4];
    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];

    tmp10 = tmp0 + tmp2;
    tmp12 = tmp0 - tmp2;

    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5];
    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4];
    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];

    dataptr[DCTSIZE*0] = (DCTELEM)
      DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)),         /* 16/9 */
	      CONST_BITS+PASS1_BITS+1);
    dataptr[DCTSIZE*2] = (DCTELEM)
      DESCALE(MULTIPLY(tmp12, FIX(2.177324216)),                 /* c2 */
	      CONST_BITS+PASS1_BITS+1);
    dataptr[DCTSIZE*4] = (DCTELEM)
      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
	      CONST_BITS+PASS1_BITS+1);

    /* Odd part */

    tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829));             /* c5 */

    dataptr[DCTSIZE*1] = (DCTELEM)
      DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)),   /* 16/9 */
	      CONST_BITS+PASS1_BITS+1);
    dataptr[DCTSIZE*3] = (DCTELEM)
      DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)),    /* 16/9 */
	      CONST_BITS+PASS1_BITS+1);
    dataptr[DCTSIZE*5] = (DCTELEM)
      DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)),   /* 16/9 */
	      CONST_BITS+PASS1_BITS+1);

    dataptr++;			/* advance pointer to next column */
  }
}


/*
 * Perform the forward DCT on a 10x5 sample block.
 *
 * 10-point FDCT in pass 1 (rows), 5-point in pass 2 (columns).
 */

GLOBAL(void)
jpeg_fdct_10x5 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
{
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
  DCTELEM *dataptr;
  JSAMPROW elemptr;
  int ctr;
  SHIFT_TEMPS

  /* Zero 3 bottom rows of output coefficient block. */
  MEMZERO(&data[DCTSIZE*5], SIZEOF(DCTELEM) * DCTSIZE * 3);

  /* Pass 1: process rows. */
  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  /* furthermore, we scale the results by 2**PASS1_BITS. */
  /* 10-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/20). */

  dataptr = data;
  for (ctr = 0; ctr < 5; ctr++) {
    elemptr = sample_data[ctr] + start_col;

    /* Even part */

    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]);
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]);
    tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]);
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]);
    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]);

    tmp10 = tmp0 + tmp4;
    tmp13 = tmp0 - tmp4;
    tmp11 = tmp1 + tmp3;
    tmp14 = tmp1 - tmp3;

    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]);
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]);
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]);
    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]);
    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]);

    /* Apply unsigned->signed conversion */
    dataptr[0] = (DCTELEM)
      ((tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE) << PASS1_BITS);
    tmp12 += tmp12;
    dataptr[4] = (DCTELEM)
      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */
	      MULTIPLY(tmp11 - tmp12, FIX(0.437016024)),  /* c8 */
	      CONST_BITS-PASS1_BITS);
    tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876));    /* c6 */
    dataptr[2] = (DCTELEM)
      DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)),  /* c2-c6 */
	      CONST_BITS-PASS1_BITS);
    dataptr[6] = (DCTELEM)
      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)),  /* c2+c6 */
	      CONST_BITS-PASS1_BITS);

    /* Odd part */

    tmp10 = tmp0 + tmp4;
    tmp11 = tmp1 - tmp3;
    dataptr[5] = (DCTELEM) ((tmp10 - tmp11 - tmp2) << PASS1_BITS);
    tmp2 <<= CONST_BITS;
    dataptr[1] = (DCTELEM)
      DESCALE(MULTIPLY(tmp0, FIX(1.396802247)) +          /* c1 */
	      MULTIPLY(tmp1, FIX(1.260073511)) + tmp2 +   /* c3 */
	      MULTIPLY(tmp3, FIX(0.642039522)) +          /* c7 */
	      MULTIPLY(tmp4, FIX(0.221231742)),           /* c9 */
	      CONST_BITS-PASS1_BITS);
    tmp12 = MULTIPLY(tmp0 - tmp4, FIX(0.951056516)) -     /* (c3+c7)/2 */
	    MULTIPLY(tmp1 + tmp3, FIX(0.587785252));      /* (c1-c9)/2 */
    tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.309016994)) +   /* (c3-c7)/2 */
	    (tmp11 << (CONST_BITS - 1)) - tmp2;
    dataptr[3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS-PASS1_BITS);
    dataptr[7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS-PASS1_BITS);

    dataptr += DCTSIZE;		/* advance pointer to next row */
  }

  /* Pass 2: process columns.
   * We remove the PASS1_BITS scaling, but leave the results scaled up
   * by an overall factor of 8.
   * We must also scale the output by (8/10)*(8/5) = 32/25, which we
   * fold into the constant multipliers:
   * 5-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/10) * 32/25.
   */

  dataptr = data;
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
    /* Even part */

    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*4];
    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*3];
    tmp2 = dataptr[DCTSIZE*2];

    tmp10 = tmp0 + tmp1;
    tmp11 = tmp0 - tmp1;

    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*4];
    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*3];

    dataptr[DCTSIZE*0] = (DCTELEM)
      DESCALE(MULTIPLY(tmp10 + tmp2, FIX(1.28)),        /* 32/25 */
	      CONST_BITS+PASS1_BITS);
    tmp11 = MULTIPLY(tmp11, FIX(1.011928851));          /* (c2+c4)/2 */
    tmp10 -= tmp2 << 2;
    tmp10 = MULTIPLY(tmp10, FIX(0.452548340));          /* (c2-c4)/2 */
    dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS+PASS1_BITS);

    /* Odd part */

    tmp10 = MULTIPLY(tmp0 + tmp1, FIX(1.064004961));    /* c3 */

    dataptr[DCTSIZE*1] = (DCTELEM)
      DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.657591230)), /* c1-c3 */
	      CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*3] = (DCTELEM)
      DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.785601151)), /* c1+c3 */
	      CONST_BITS+PASS1_BITS);

    dataptr++;			/* advance pointer to next column */
  }
}


/*
 * Perform the forward DCT on an 8x4 sample block.
 *
 * 8-point FDCT in pass 1 (rows), 4-point in pass 2 (columns).
 */

GLOBAL(void)
jpeg_fdct_8x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
{
  INT32 tmp0, tmp1, tmp2, tmp3;
  INT32 tmp10, tmp11, tmp12, tmp13;
  INT32 z1;
  DCTELEM *dataptr;
  JSAMPROW elemptr;
  int ctr;
  SHIFT_TEMPS

  /* Zero 4 bottom rows of output coefficient block. */
  MEMZERO(&data[DCTSIZE*4], SIZEOF(DCTELEM) * DCTSIZE * 4);

  /* Pass 1: process rows. */
  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  /* furthermore, we scale the results by 2**PASS1_BITS. */
  /* We must also scale the output by 8/4 = 2, which we add here. */

  dataptr = data;
  for (ctr = 0; ctr < 4; ctr++) {
    elemptr = sample_data[ctr] + start_col;

    /* Even part per LL&M figure 1 --- note that published figure is faulty;
     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
     */

    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);

    tmp10 = tmp0 + tmp3;
    tmp12 = tmp0 - tmp3;
    tmp11 = tmp1 + tmp2;
    tmp13 = tmp1 - tmp2;

    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);

    /* Apply unsigned->signed conversion */
    dataptr[0] = (DCTELEM)
      ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << (PASS1_BITS+1));
    dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << (PASS1_BITS+1));

    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
    /* Add fudge factor here for final descale. */
    z1 += ONE << (CONST_BITS-PASS1_BITS-2);
    dataptr[2] = (DCTELEM) RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865),
				       CONST_BITS-PASS1_BITS-1);
    dataptr[6] = (DCTELEM) RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065),
				       CONST_BITS-PASS1_BITS-1);

    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
     * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
     * i0..i3 in the paper are tmp0..tmp3 here.
     */

    tmp10 = tmp0 + tmp3;
    tmp11 = tmp1 + tmp2;
    tmp12 = tmp0 + tmp2;
    tmp13 = tmp1 + tmp3;
    z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /*  c3 */
    /* Add fudge factor here for final descale. */
    z1 += ONE << (CONST_BITS-PASS1_BITS-2);

    tmp0  = MULTIPLY(tmp0,    FIX_1_501321110);    /*  c1+c3-c5-c7 */
    tmp1  = MULTIPLY(tmp1,    FIX_3_072711026);    /*  c1+c3+c5-c7 */
    tmp2  = MULTIPLY(tmp2,    FIX_2_053119869);    /*  c1+c3-c5+c7 */
    tmp3  = MULTIPLY(tmp3,    FIX_0_298631336);    /* -c1+c3+c5-c7 */
    tmp10 = MULTIPLY(tmp10, - FIX_0_899976223);    /*  c7-c3 */
    tmp11 = MULTIPLY(tmp11, - FIX_2_562915447);    /* -c1-c3 */
    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);    /*  c5-c3 */
    tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);    /* -c3-c5 */

    tmp12 += z1;
    tmp13 += z1;

    dataptr[1] = (DCTELEM)
      RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS-1);
    dataptr[3] = (DCTELEM)
      RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS-1);
    dataptr[5] = (DCTELEM)
      RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS-1);
    dataptr[7] = (DCTELEM)
      RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS-1);

    dataptr += DCTSIZE;		/* advance pointer to next row */
  }

  /* Pass 2: process columns.
   * We remove the PASS1_BITS scaling, but leave the results scaled up
   * by an overall factor of 8.
   * 4-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
   */

  dataptr = data;
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
    /* Even part */

    /* Add fudge factor here for final descale. */
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3] + (ONE << (PASS1_BITS-1));
    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];

    tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
    tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];

    dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
    dataptr[DCTSIZE*2] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);

    /* Odd part */

    tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);   /* c6 */
    /* Add fudge factor here for final descale. */
    tmp0 += ONE << (CONST_BITS+PASS1_BITS-1);

    dataptr[DCTSIZE*1] = (DCTELEM)
      RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
		  CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*3] = (DCTELEM)
      RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
		  CONST_BITS+PASS1_BITS);

    dataptr++;			/* advance pointer to next column */
  }
}


/*
 * Perform the forward DCT on a 6x3 sample block.
 *
 * 6-point FDCT in pass 1 (rows), 3-point in pass 2 (columns).
 */

GLOBAL(void)
jpeg_fdct_6x3 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
{
  INT32 tmp0, tmp1, tmp2;
  INT32 tmp10, tmp11, tmp12;
  DCTELEM *dataptr;
  JSAMPROW elemptr;
  int ctr;
  SHIFT_TEMPS

  /* Pre-zero output coefficient block. */
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);

  /* Pass 1: process rows. */
  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  /* furthermore, we scale the results by 2**PASS1_BITS. */
  /* We scale the results further by 2 as part of output adaption */
  /* scaling for different DCT size. */
  /* 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12). */

  dataptr = data;
  for (ctr = 0; ctr < 3; ctr++) {
    elemptr = sample_data[ctr] + start_col;

    /* Even part */

    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
    tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);

    tmp10 = tmp0 + tmp2;
    tmp12 = tmp0 - tmp2;

    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);

    /* Apply unsigned->signed conversion */
    dataptr[0] = (DCTELEM)
      ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << (PASS1_BITS+1));
    dataptr[2] = (DCTELEM)
      DESCALE(MULTIPLY(tmp12, FIX(1.224744871)),                 /* c2 */
	      CONST_BITS-PASS1_BITS-1);
    dataptr[4] = (DCTELEM)
      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
	      CONST_BITS-PASS1_BITS-1);

    /* Odd part */

    tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)),     /* c5 */
		    CONST_BITS-PASS1_BITS-1);

    dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << (PASS1_BITS+1)));
    dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << (PASS1_BITS+1));
    dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << (PASS1_BITS+1)));

    dataptr += DCTSIZE;		/* advance pointer to next row */
  }

  /* Pass 2: process columns.
   * We remove the PASS1_BITS scaling, but leave the results scaled up
   * by an overall factor of 8.
   * We must also scale the output by (8/6)*(8/3) = 32/9, which we partially
   * fold into the constant multipliers (other part was done in pass 1):
   * 3-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/6) * 16/9.
   */

  dataptr = data;
  for (ctr = 0; ctr < 6; ctr++) {
    /* Even part */

    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*2];
    tmp1 = dataptr[DCTSIZE*1];

    tmp2 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*2];

    dataptr[DCTSIZE*0] = (DCTELEM)
      DESCALE(MULTIPLY(tmp0 + tmp1, FIX(1.777777778)),        /* 16/9 */
	      CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*2] = (DCTELEM)
      DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(1.257078722)), /* c2 */
	      CONST_BITS+PASS1_BITS);

    /* Odd part */

    dataptr[DCTSIZE*1] = (DCTELEM)
      DESCALE(MULTIPLY(tmp2, FIX(2.177324216)),               /* c1 */
	      CONST_BITS+PASS1_BITS);

    dataptr++;			/* advance pointer to next column */
  }
}


/*
 * Perform the forward DCT on a 4x2 sample block.
 *
 * 4-point FDCT in pass 1 (rows), 2-point in pass 2 (columns).
 */

GLOBAL(void)
jpeg_fdct_4x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
{
  INT32 tmp0, tmp1;
  INT32 tmp10, tmp11;
  DCTELEM *dataptr;
  JSAMPROW elemptr;
  int ctr;
  SHIFT_TEMPS

  /* Pre-zero output coefficient block. */
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);

  /* Pass 1: process rows. */
  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  /* furthermore, we scale the results by 2**PASS1_BITS. */
  /* We must also scale the output by (8/4)*(8/2) = 2**3, which we add here. */
  /* 4-point FDCT kernel, */
  /* cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT]. */

  dataptr = data;
  for (ctr = 0; ctr < 2; ctr++) {
    elemptr = sample_data[ctr] + start_col;

    /* Even part */

    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);

    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);

    /* Apply unsigned->signed conversion */
    dataptr[0] = (DCTELEM)
      ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+3));
    dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+3));

    /* Odd part */

    tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
    /* Add fudge factor here for final descale. */
    tmp0 += ONE << (CONST_BITS-PASS1_BITS-4);

    dataptr[1] = (DCTELEM)
      RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
		  CONST_BITS-PASS1_BITS-3);
    dataptr[3] = (DCTELEM)
      RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
		  CONST_BITS-PASS1_BITS-3);

    dataptr += DCTSIZE;		/* advance pointer to next row */
  }

  /* Pass 2: process columns.
   * We remove the PASS1_BITS scaling, but leave the results scaled up
   * by an overall factor of 8.
   */

  dataptr = data;
  for (ctr = 0; ctr < 4; ctr++) {
    /* Even part */

    /* Add fudge factor here for final descale. */
    tmp0 = dataptr[DCTSIZE*0] + (ONE << (PASS1_BITS-1));
    tmp1 = dataptr[DCTSIZE*1];

    dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);

    /* Odd part */

    dataptr[DCTSIZE*1] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);

    dataptr++;			/* advance pointer to next column */
  }
}


/*
 * Perform the forward DCT on a 2x1 sample block.
 *
 * 2-point FDCT in pass 1 (rows), 1-point in pass 2 (columns).
 */

GLOBAL(void)
jpeg_fdct_2x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
{
  INT32 tmp0, tmp1;
  JSAMPROW elemptr;

  /* Pre-zero output coefficient block. */
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);

  elemptr = sample_data[0] + start_col;

  tmp0 = GETJSAMPLE(elemptr[0]);
  tmp1 = GETJSAMPLE(elemptr[1]);

  /* We leave the results scaled up by an overall factor of 8.
   * We must also scale the output by (8/2)*(8/1) = 2**5.
   */

  /* Even part */
  /* Apply unsigned->signed conversion */
  data[0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 5);

  /* Odd part */
  data[1] = (DCTELEM) ((tmp0 - tmp1) << 5);
}


/*
 * Perform the forward DCT on an 8x16 sample block.
 *
 * 8-point FDCT in pass 1 (rows), 16-point in pass 2 (columns).
 */

GLOBAL(void)
jpeg_fdct_8x16 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
{
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
  INT32 z1;
  DCTELEM workspace[DCTSIZE2];
  DCTELEM *dataptr;
  DCTELEM *wsptr;
  JSAMPROW elemptr;
  int ctr;
  SHIFT_TEMPS

  /* Pass 1: process rows. */
  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  /* furthermore, we scale the results by 2**PASS1_BITS. */

  dataptr = data;
  ctr = 0;
  for (;;) {
    elemptr = sample_data[ctr] + start_col;

    /* Even part per LL&M figure 1 --- note that published figure is faulty;
     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
     */

    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);

    tmp10 = tmp0 + tmp3;
    tmp12 = tmp0 - tmp3;
    tmp11 = tmp1 + tmp2;
    tmp13 = tmp1 - tmp2;

    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);

    /* Apply unsigned->signed conversion */
    dataptr[0] = (DCTELEM) ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << PASS1_BITS);
    dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);

    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
    dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865),
				   CONST_BITS-PASS1_BITS);
    dataptr[6] = (DCTELEM) DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065),
				   CONST_BITS-PASS1_BITS);

    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
     * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
     * i0..i3 in the paper are tmp0..tmp3 here.
     */

    tmp10 = tmp0 + tmp3;
    tmp11 = tmp1 + tmp2;
    tmp12 = tmp0 + tmp2;
    tmp13 = tmp1 + tmp3;
    z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /*  c3 */

    tmp0  = MULTIPLY(tmp0,    FIX_1_501321110);    /*  c1+c3-c5-c7 */
    tmp1  = MULTIPLY(tmp1,    FIX_3_072711026);    /*  c1+c3+c5-c7 */
    tmp2  = MULTIPLY(tmp2,    FIX_2_053119869);    /*  c1+c3-c5+c7 */
    tmp3  = MULTIPLY(tmp3,    FIX_0_298631336);    /* -c1+c3+c5-c7 */
    tmp10 = MULTIPLY(tmp10, - FIX_0_899976223);    /*  c7-c3 */
    tmp11 = MULTIPLY(tmp11, - FIX_2_562915447);    /* -c1-c3 */
    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);    /*  c5-c3 */
    tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);    /* -c3-c5 */

    tmp12 += z1;
    tmp13 += z1;

    dataptr[1] = (DCTELEM) DESCALE(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS);
    dataptr[3] = (DCTELEM) DESCALE(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS);
    dataptr[5] = (DCTELEM) DESCALE(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS);
    dataptr[7] = (DCTELEM) DESCALE(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS);

    ctr++;

    if (ctr != DCTSIZE) {
      if (ctr == DCTSIZE * 2)
	break;			/* Done. */
      dataptr += DCTSIZE;	/* advance pointer to next row */
    } else
      dataptr = workspace;	/* switch pointer to extended workspace */
  }

  /* Pass 2: process columns.
   * We remove the PASS1_BITS scaling, but leave the results scaled up
   * by an overall factor of 8.
   * We must also scale the output by 8/16 = 1/2.
   * 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
   */

  dataptr = data;
  wsptr = workspace;
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
    /* Even part */

    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*7];
    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*6];
    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*5];
    tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*4];
    tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*3];
    tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*2];
    tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*1];
    tmp7 = dataptr[DCTSIZE*7] + wsptr[DCTSIZE*0];

    tmp10 = tmp0 + tmp7;
    tmp14 = tmp0 - tmp7;
    tmp11 = tmp1 + tmp6;
    tmp15 = tmp1 - tmp6;
    tmp12 = tmp2 + tmp5;
    tmp16 = tmp2 - tmp5;
    tmp13 = tmp3 + tmp4;
    tmp17 = tmp3 - tmp4;

    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*7];
    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*6];
    tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*5];
    tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*4];
    tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*3];
    tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*2];
    tmp6 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*1];
    tmp7 = dataptr[DCTSIZE*7] - wsptr[DCTSIZE*0];

    dataptr[DCTSIZE*0] = (DCTELEM)
      DESCALE(tmp10 + tmp11 + tmp12 + tmp13, PASS1_BITS+1);
    dataptr[DCTSIZE*4] = (DCTELEM)
      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
	      MULTIPLY(tmp11 - tmp12, FIX_0_541196100),   /* c12[16] = c6[8] */
	      CONST_BITS+PASS1_BITS+1);

    tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) +   /* c14[16] = c7[8] */
	    MULTIPLY(tmp14 - tmp16, FIX(1.387039845));    /* c2[16] = c1[8] */

    dataptr[DCTSIZE*2] = (DCTELEM)
      DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982))   /* c6+c14 */
	      + MULTIPLY(tmp16, FIX(2.172734804)),        /* c2+c10 */
	      CONST_BITS+PASS1_BITS+1);
    dataptr[DCTSIZE*6] = (DCTELEM)
      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243))   /* c2-c6 */
	      - MULTIPLY(tmp17, FIX(1.061594338)),        /* c10+c14 */
	      CONST_BITS+PASS1_BITS+1);

    /* Odd part */

    tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) +         /* c3 */
	    MULTIPLY(tmp6 - tmp7, FIX(0.410524528));          /* c13 */
    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) +         /* c5 */
	    MULTIPLY(tmp5 + tmp7, FIX(0.666655658));          /* c11 */
    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) +         /* c7 */
	    MULTIPLY(tmp4 - tmp7, FIX(0.897167586));          /* c9 */
    tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) +         /* c15 */
	    MULTIPLY(tmp6 - tmp5, FIX(1.407403738));          /* c1 */
    tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) +       /* -c11 */
	    MULTIPLY(tmp4 + tmp6, - FIX(1.247225013));        /* -c5 */
    tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) +       /* -c3 */
	    MULTIPLY(tmp5 - tmp4, FIX(0.410524528));          /* c13 */
    tmp10 = tmp11 + tmp12 + tmp13 -
	    MULTIPLY(tmp0, FIX(2.286341144)) +                /* c7+c5+c3-c1 */
	    MULTIPLY(tmp7, FIX(0.779653625));                 /* c15+c13-c11+c9 */
    tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
	     - MULTIPLY(tmp6, FIX(1.663905119));              /* c7+c13+c1-c5 */
    tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
	     + MULTIPLY(tmp5, FIX(1.227391138));              /* c9-c11+c1-c13 */
    tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
	     + MULTIPLY(tmp4, FIX(2.167985692));              /* c1+c13+c5-c9 */

    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS+1);
    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS+1);
    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS+1);
    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS+1);

    dataptr++;			/* advance pointer to next column */
    wsptr++;			/* advance pointer to next column */
  }
}


/*
 * Perform the forward DCT on a 7x14 sample block.
 *
 * 7-point FDCT in pass 1 (rows), 14-point in pass 2 (columns).
 */

GLOBAL(void)
jpeg_fdct_7x14 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
{
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
  INT32 z1, z2, z3;
  DCTELEM workspace[8*6];
  DCTELEM *dataptr;
  DCTELEM *wsptr;
  JSAMPROW elemptr;
  int ctr;
  SHIFT_TEMPS

  /* Pre-zero output coefficient block. */
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);

  /* Pass 1: process rows. */
  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  /* furthermore, we scale the results by 2**PASS1_BITS. */
  /* 7-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/14). */

  dataptr = data;
  ctr = 0;
  for (;;) {
    elemptr = sample_data[ctr] + start_col;

    /* Even part */

    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]);
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]);
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]);
    tmp3 = GETJSAMPLE(elemptr[3]);

    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]);
    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]);
    tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]);

    z1 = tmp0 + tmp2;
    /* Apply unsigned->signed conversion */
    dataptr[0] = (DCTELEM)
      ((z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE) << PASS1_BITS);
    tmp3 += tmp3;
    z1 -= tmp3;
    z1 -= tmp3;
    z1 = MULTIPLY(z1, FIX(0.353553391));                /* (c2+c6-c4)/2 */
    z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002));       /* (c2+c4-c6)/2 */
    z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123));       /* c6 */
    dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS);
    z1 -= z2;
    z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734));       /* c4 */
    dataptr[4] = (DCTELEM)
      DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */
	      CONST_BITS-PASS1_BITS);
    dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS);

    /* Odd part */

    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347));   /* (c3+c1-c5)/2 */
    tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339));   /* (c3+c5-c1)/2 */
    tmp0 = tmp1 - tmp2;
    tmp1 += tmp2;
    tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.378756276)); /* -c1 */
    tmp1 += tmp2;
    tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.613604268));   /* c5 */
    tmp0 += tmp3;
    tmp2 += tmp3 + MULTIPLY(tmp12, FIX(1.870828693));   /* c3+c1-c5 */

    dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS);
    dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS);
    dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS);

    ctr++;

    if (ctr != DCTSIZE) {
      if (ctr == 14)
	break;			/* Done. */
      dataptr += DCTSIZE;	/* advance pointer to next row */
    } else
      dataptr = workspace;	/* switch pointer to extended workspace */
  }

  /* Pass 2: process columns.
   * We remove the PASS1_BITS scaling, but leave the results scaled up
   * by an overall factor of 8.
   * We must also scale the output by (8/7)*(8/14) = 32/49, which we
   * fold into the constant multipliers:
   * 14-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/28) * 32/49.
   */

  dataptr = data;
  wsptr = workspace;
  for (ctr = 0; ctr < 7; ctr++) {
    /* Even part */

    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*5];
    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*4];
    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*3];
    tmp13 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*2];
    tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*1];
    tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*0];
    tmp6 = dataptr[DCTSIZE*6] + dataptr[DCTSIZE*7];

    tmp10 = tmp0 + tmp6;
    tmp14 = tmp0 - tmp6;
    tmp11 = tmp1 + tmp5;
    tmp15 = tmp1 - tmp5;
    tmp12 = tmp2 + tmp4;
    tmp16 = tmp2 - tmp4;

    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*5];
    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*4];
    tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*3];
    tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*2];
    tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*1];
    tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*0];
    tmp6 = dataptr[DCTSIZE*6] - dataptr[DCTSIZE*7];

    dataptr[DCTSIZE*0] = (DCTELEM)
      DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12 + tmp13,
		       FIX(0.653061224)),                 /* 32/49 */
	      CONST_BITS+PASS1_BITS);
    tmp13 += tmp13;
    dataptr[DCTSIZE*4] = (DCTELEM)
      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(0.832106052)) + /* c4 */
	      MULTIPLY(tmp11 - tmp13, FIX(0.205513223)) - /* c12 */
	      MULTIPLY(tmp12 - tmp13, FIX(0.575835255)),  /* c8 */
	      CONST_BITS+PASS1_BITS);

    tmp10 = MULTIPLY(tmp14 + tmp15, FIX(0.722074570));    /* c6 */

    dataptr[DCTSIZE*2] = (DCTELEM)
      DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.178337691))   /* c2-c6 */
	      + MULTIPLY(tmp16, FIX(0.400721155)),        /* c10 */
	      CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*6] = (DCTELEM)
      DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.122795725))   /* c6+c10 */
	      - MULTIPLY(tmp16, FIX(0.900412262)),        /* c2 */
	      CONST_BITS+PASS1_BITS);

    /* Odd part */

    tmp10 = tmp1 + tmp2;
    tmp11 = tmp5 - tmp4;
    dataptr[DCTSIZE*7] = (DCTELEM)
      DESCALE(MULTIPLY(tmp0 - tmp10 + tmp3 - tmp11 - tmp6,
		       FIX(0.653061224)),                 /* 32/49 */
	      CONST_BITS+PASS1_BITS);
    tmp3  = MULTIPLY(tmp3 , FIX(0.653061224));            /* 32/49 */
    tmp10 = MULTIPLY(tmp10, - FIX(0.103406812));          /* -c13 */
    tmp11 = MULTIPLY(tmp11, FIX(0.917760839));            /* c1 */
    tmp10 += tmp11 - tmp3;
    tmp11 = MULTIPLY(tmp0 + tmp2, FIX(0.782007410)) +     /* c5 */
	    MULTIPLY(tmp4 + tmp6, FIX(0.491367823));      /* c9 */
    dataptr[DCTSIZE*5] = (DCTELEM)
      DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(1.550341076)) /* c3+c5-c13 */
	      + MULTIPLY(tmp4, FIX(0.731428202)),         /* c1+c11-c9 */
	      CONST_BITS+PASS1_BITS);
    tmp12 = MULTIPLY(tmp0 + tmp1, FIX(0.871740478)) +     /* c3 */
	    MULTIPLY(tmp5 - tmp6, FIX(0.305035186));      /* c11 */
    dataptr[DCTSIZE*3] = (DCTELEM)
      DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.276965844)) /* c3-c9-c13 */
	      - MULTIPLY(tmp5, FIX(2.004803435)),         /* c1+c5+c11 */
	      CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*1] = (DCTELEM)
      DESCALE(tmp11 + tmp12 + tmp3
	      - MULTIPLY(tmp0, FIX(0.735987049))          /* c3+c5-c1 */
	      - MULTIPLY(tmp6, FIX(0.082925825)),         /* c9-c11-c13 */
	      CONST_BITS+PASS1_BITS);

    dataptr++;			/* advance pointer to next column */
    wsptr++;			/* advance pointer to next column */
  }
}


/*
 * Perform the forward DCT on a 6x12 sample block.
 *
 * 6-point FDCT in pass 1 (rows), 12-point in pass 2 (columns).
 */

GLOBAL(void)
jpeg_fdct_6x12 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
{
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
  DCTELEM workspace[8*4];
  DCTELEM *dataptr;
  DCTELEM *wsptr;
  JSAMPROW elemptr;
  int ctr;
  SHIFT_TEMPS

  /* Pre-zero output coefficient block. */
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);

  /* Pass 1: process rows. */
  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  /* furthermore, we scale the results by 2**PASS1_BITS. */
  /* 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12). */

  dataptr = data;
  ctr = 0;
  for (;;) {
    elemptr = sample_data[ctr] + start_col;

    /* Even part */

    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
    tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);

    tmp10 = tmp0 + tmp2;
    tmp12 = tmp0 - tmp2;

    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);

    /* Apply unsigned->signed conversion */
    dataptr[0] = (DCTELEM)
      ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << PASS1_BITS);
    dataptr[2] = (DCTELEM)
      DESCALE(MULTIPLY(tmp12, FIX(1.224744871)),                 /* c2 */
	      CONST_BITS-PASS1_BITS);
    dataptr[4] = (DCTELEM)
      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
	      CONST_BITS-PASS1_BITS);

    /* Odd part */

    tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)),     /* c5 */
		    CONST_BITS-PASS1_BITS);

    dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS));
    dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS);
    dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS));

    ctr++;

    if (ctr != DCTSIZE) {
      if (ctr == 12)
	break;			/* Done. */
      dataptr += DCTSIZE;	/* advance pointer to next row */
    } else
      dataptr = workspace;	/* switch pointer to extended workspace */
  }

  /* Pass 2: process columns.
   * We remove the PASS1_BITS scaling, but leave the results scaled up
   * by an overall factor of 8.
   * We must also scale the output by (8/6)*(8/12) = 8/9, which we
   * fold into the constant multipliers:
   * 12-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/24) * 8/9.
   */

  dataptr = data;
  wsptr = workspace;
  for (ctr = 0; ctr < 6; ctr++) {
    /* Even part */

    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*3];
    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*2];
    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*1];
    tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*0];
    tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*7];
    tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*6];

    tmp10 = tmp0 + tmp5;
    tmp13 = tmp0 - tmp5;
    tmp11 = tmp1 + tmp4;
    tmp14 = tmp1 - tmp4;
    tmp12 = tmp2 + tmp3;
    tmp15 = tmp2 - tmp3;

    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*3];
    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*2];
    tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*1];
    tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*0];
    tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*7];
    tmp5 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*6];

    dataptr[DCTSIZE*0] = (DCTELEM)
      DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(0.888888889)), /* 8/9 */
	      CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*6] = (DCTELEM)
      DESCALE(MULTIPLY(tmp13 - tmp14 - tmp15, FIX(0.888888889)), /* 8/9 */
	      CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*4] = (DCTELEM)
      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.088662108)),         /* c4 */
	      CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*2] = (DCTELEM)
      DESCALE(MULTIPLY(tmp14 - tmp15, FIX(0.888888889)) +        /* 8/9 */
	      MULTIPLY(tmp13 + tmp15, FIX(1.214244803)),         /* c2 */
	      CONST_BITS+PASS1_BITS);

    /* Odd part */

    tmp10 = MULTIPLY(tmp1 + tmp4, FIX(0.481063200));   /* c9 */
    tmp14 = tmp10 + MULTIPLY(tmp1, FIX(0.680326102));  /* c3-c9 */
    tmp15 = tmp10 - MULTIPLY(tmp4, FIX(1.642452502));  /* c3+c9 */
    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(0.997307603));   /* c5 */
    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.765261039));   /* c7 */
    tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.516244403)) /* c5+c7-c1 */
	    + MULTIPLY(tmp5, FIX(0.164081699));        /* c11 */
    tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.164081699)); /* -c11 */
    tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.079550144)) /* c1+c5-c11 */
	    + MULTIPLY(tmp5, FIX(0.765261039));        /* c7 */
    tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.645144899)) /* c1+c11-c7 */
	    - MULTIPLY(tmp5, FIX(0.997307603));        /* c5 */
    tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.161389302)) /* c3 */
	    - MULTIPLY(tmp2 + tmp5, FIX(0.481063200)); /* c9 */

    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS);

    dataptr++;			/* advance pointer to next column */
    wsptr++;			/* advance pointer to next column */
  }
}


/*
 * Perform the forward DCT on a 5x10 sample block.
 *
 * 5-point FDCT in pass 1 (rows), 10-point in pass 2 (columns).
 */

GLOBAL(void)
jpeg_fdct_5x10 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
{
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
  DCTELEM workspace[8*2];
  DCTELEM *dataptr;
  DCTELEM *wsptr;
  JSAMPROW elemptr;
  int ctr;
  SHIFT_TEMPS

  /* Pre-zero output coefficient block. */
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);

  /* Pass 1: process rows. */
  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  /* furthermore, we scale the results by 2**PASS1_BITS. */
  /* 5-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/10). */

  dataptr = data;
  ctr = 0;
  for (;;) {
    elemptr = sample_data[ctr] + start_col;

    /* Even part */

    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]);
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]);
    tmp2 = GETJSAMPLE(elemptr[2]);

    tmp10 = tmp0 + tmp1;
    tmp11 = tmp0 - tmp1;

    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]);
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]);

    /* Apply unsigned->signed conversion */
    dataptr[0] = (DCTELEM)
      ((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << PASS1_BITS);
    tmp11 = MULTIPLY(tmp11, FIX(0.790569415));          /* (c2+c4)/2 */
    tmp10 -= tmp2 << 2;
    tmp10 = MULTIPLY(tmp10, FIX(0.353553391));          /* (c2-c4)/2 */
    dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS);
    dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS);

    /* Odd part */

    tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876));    /* c3 */

    dataptr[1] = (DCTELEM)
      DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */
	      CONST_BITS-PASS1_BITS);
    dataptr[3] = (DCTELEM)
      DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */
	      CONST_BITS-PASS1_BITS);

    ctr++;

    if (ctr != DCTSIZE) {
      if (ctr == 10)
	break;			/* Done. */
      dataptr += DCTSIZE;	/* advance pointer to next row */
    } else
      dataptr = workspace;	/* switch pointer to extended workspace */
  }

  /* Pass 2: process columns.
   * We remove the PASS1_BITS scaling, but leave the results scaled up
   * by an overall factor of 8.
   * We must also scale the output by (8/5)*(8/10) = 32/25, which we
   * fold into the constant multipliers:
   * 10-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/20) * 32/25.
   */

  dataptr = data;
  wsptr = workspace;
  for (ctr = 0; ctr < 5; ctr++) {
    /* Even part */

    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*1];
    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*0];
    tmp12 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*7];
    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*6];
    tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*5];

    tmp10 = tmp0 + tmp4;
    tmp13 = tmp0 - tmp4;
    tmp11 = tmp1 + tmp3;
    tmp14 = tmp1 - tmp3;

    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*1];
    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*0];
    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*7];
    tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*6];
    tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*5];

    dataptr[DCTSIZE*0] = (DCTELEM)
      DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(1.28)), /* 32/25 */
	      CONST_BITS+PASS1_BITS);
    tmp12 += tmp12;
    dataptr[DCTSIZE*4] = (DCTELEM)
      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.464477191)) - /* c4 */
	      MULTIPLY(tmp11 - tmp12, FIX(0.559380511)),  /* c8 */
	      CONST_BITS+PASS1_BITS);
    tmp10 = MULTIPLY(tmp13 + tmp14, FIX(1.064004961));    /* c6 */
    dataptr[DCTSIZE*2] = (DCTELEM)
      DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.657591230)),  /* c2-c6 */
	      CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*6] = (DCTELEM)
      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.785601151)),  /* c2+c6 */
	      CONST_BITS+PASS1_BITS);

    /* Odd part */

    tmp10 = tmp0 + tmp4;
    tmp11 = tmp1 - tmp3;
    dataptr[DCTSIZE*5] = (DCTELEM)
      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp2, FIX(1.28)),  /* 32/25 */
	      CONST_BITS+PASS1_BITS);
    tmp2 = MULTIPLY(tmp2, FIX(1.28));                     /* 32/25 */
    dataptr[DCTSIZE*1] = (DCTELEM)
      DESCALE(MULTIPLY(tmp0, FIX(1.787906876)) +          /* c1 */
	      MULTIPLY(tmp1, FIX(1.612894094)) + tmp2 +   /* c3 */
	      MULTIPLY(tmp3, FIX(0.821810588)) +          /* c7 */
	      MULTIPLY(tmp4, FIX(0.283176630)),           /* c9 */
	      CONST_BITS+PASS1_BITS);
    tmp12 = MULTIPLY(tmp0 - tmp4, FIX(1.217352341)) -     /* (c3+c7)/2 */
	    MULTIPLY(tmp1 + tmp3, FIX(0.752365123));      /* (c1-c9)/2 */
    tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.395541753)) +   /* (c3-c7)/2 */
	    MULTIPLY(tmp11, FIX(0.64)) - tmp2;            /* 16/25 */
    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS+PASS1_BITS);

    dataptr++;			/* advance pointer to next column */
    wsptr++;			/* advance pointer to next column */
  }
}


/*
 * Perform the forward DCT on a 4x8 sample block.
 *
 * 4-point FDCT in pass 1 (rows), 8-point in pass 2 (columns).
 */

GLOBAL(void)
jpeg_fdct_4x8 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
{
  INT32 tmp0, tmp1, tmp2, tmp3;
  INT32 tmp10, tmp11, tmp12, tmp13;
  INT32 z1;
  DCTELEM *dataptr;
  JSAMPROW elemptr;
  int ctr;
  SHIFT_TEMPS

  /* Pre-zero output coefficient block. */
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);

  /* Pass 1: process rows. */
  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  /* furthermore, we scale the results by 2**PASS1_BITS. */
  /* We must also scale the output by 8/4 = 2, which we add here. */
  /* 4-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16). */

  dataptr = data;
  for (ctr = 0; ctr < DCTSIZE; ctr++) {
    elemptr = sample_data[ctr] + start_col;

    /* Even part */

    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);

    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);

    /* Apply unsigned->signed conversion */
    dataptr[0] = (DCTELEM)
      ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+1));
    dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+1));

    /* Odd part */

    tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
    /* Add fudge factor here for final descale. */
    tmp0 += ONE << (CONST_BITS-PASS1_BITS-2);

    dataptr[1] = (DCTELEM)
      RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
		  CONST_BITS-PASS1_BITS-1);
    dataptr[3] = (DCTELEM)
      RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
		  CONST_BITS-PASS1_BITS-1);

    dataptr += DCTSIZE;		/* advance pointer to next row */
  }

  /* Pass 2: process columns.
   * We remove the PASS1_BITS scaling, but leave the results scaled up
   * by an overall factor of 8.
   */

  dataptr = data;
  for (ctr = 0; ctr < 4; ctr++) {
    /* Even part per LL&M figure 1 --- note that published figure is faulty;
     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
     */

    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];

    /* Add fudge factor here for final descale. */
    tmp10 = tmp0 + tmp3 + (ONE << (PASS1_BITS-1));
    tmp12 = tmp0 - tmp3;
    tmp11 = tmp1 + tmp2;
    tmp13 = tmp1 - tmp2;

    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
    tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];

    dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS1_BITS);
    dataptr[DCTSIZE*4] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS1_BITS);

    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
    /* Add fudge factor here for final descale. */
    z1 += ONE << (CONST_BITS+PASS1_BITS-1);
    dataptr[DCTSIZE*2] = (DCTELEM)
      RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*6] = (DCTELEM)
      RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), CONST_BITS+PASS1_BITS);

    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
     * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
     * i0..i3 in the paper are tmp0..tmp3 here.
     */

    tmp10 = tmp0 + tmp3;
    tmp11 = tmp1 + tmp2;
    tmp12 = tmp0 + tmp2;
    tmp13 = tmp1 + tmp3;
    z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /*  c3 */
    /* Add fudge factor here for final descale. */
    z1 += ONE << (CONST_BITS+PASS1_BITS-1);

    tmp0  = MULTIPLY(tmp0,    FIX_1_501321110);    /*  c1+c3-c5-c7 */
    tmp1  = MULTIPLY(tmp1,    FIX_3_072711026);    /*  c1+c3+c5-c7 */
    tmp2  = MULTIPLY(tmp2,    FIX_2_053119869);    /*  c1+c3-c5+c7 */
    tmp3  = MULTIPLY(tmp3,    FIX_0_298631336);    /* -c1+c3+c5-c7 */
    tmp10 = MULTIPLY(tmp10, - FIX_0_899976223);    /*  c7-c3 */
    tmp11 = MULTIPLY(tmp11, - FIX_2_562915447);    /* -c1-c3 */
    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);    /*  c5-c3 */
    tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);    /* -c3-c5 */

    tmp12 += z1;
    tmp13 += z1;

    dataptr[DCTSIZE*1] = (DCTELEM)
      RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*3] = (DCTELEM)
      RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*5] = (DCTELEM)
      RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*7] = (DCTELEM)
      RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS+PASS1_BITS);

    dataptr++;			/* advance pointer to next column */
  }
}


/*
 * Perform the forward DCT on a 3x6 sample block.
 *
 * 3-point FDCT in pass 1 (rows), 6-point in pass 2 (columns).
 */

GLOBAL(void)
jpeg_fdct_3x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
{
  INT32 tmp0, tmp1, tmp2;
  INT32 tmp10, tmp11, tmp12;
  DCTELEM *dataptr;
  JSAMPROW elemptr;
  int ctr;
  SHIFT_TEMPS

  /* Pre-zero output coefficient block. */
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);

  /* Pass 1: process rows. */
  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  /* furthermore, we scale the results by 2**PASS1_BITS. */
  /* We scale the results further by 2 as part of output adaption */
  /* scaling for different DCT size. */
  /* 3-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/6). */

  dataptr = data;
  for (ctr = 0; ctr < 6; ctr++) {
    elemptr = sample_data[ctr] + start_col;

    /* Even part */

    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]);
    tmp1 = GETJSAMPLE(elemptr[1]);

    tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]);

    /* Apply unsigned->signed conversion */
    dataptr[0] = (DCTELEM)
      ((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+1));
    dataptr[2] = (DCTELEM)
      DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */
	      CONST_BITS-PASS1_BITS-1);

    /* Odd part */

    dataptr[1] = (DCTELEM)
      DESCALE(MULTIPLY(tmp2, FIX(1.224744871)),               /* c1 */
	      CONST_BITS-PASS1_BITS-1);

    dataptr += DCTSIZE;		/* advance pointer to next row */
  }

  /* Pass 2: process columns.
   * We remove the PASS1_BITS scaling, but leave the results scaled up
   * by an overall factor of 8.
   * We must also scale the output by (8/6)*(8/3) = 32/9, which we partially
   * fold into the constant multipliers (other part was done in pass 1):
   * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12) * 16/9.
   */

  dataptr = data;
  for (ctr = 0; ctr < 3; ctr++) {
    /* Even part */

    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5];
    tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4];
    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];

    tmp10 = tmp0 + tmp2;
    tmp12 = tmp0 - tmp2;

    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5];
    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4];
    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];

    dataptr[DCTSIZE*0] = (DCTELEM)
      DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)),         /* 16/9 */
	      CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*2] = (DCTELEM)
      DESCALE(MULTIPLY(tmp12, FIX(2.177324216)),                 /* c2 */
	      CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*4] = (DCTELEM)
      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
	      CONST_BITS+PASS1_BITS);

    /* Odd part */

    tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829));             /* c5 */

    dataptr[DCTSIZE*1] = (DCTELEM)
      DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)),   /* 16/9 */
	      CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*3] = (DCTELEM)
      DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)),    /* 16/9 */
	      CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*5] = (DCTELEM)
      DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)),   /* 16/9 */
	      CONST_BITS+PASS1_BITS);

    dataptr++;			/* advance pointer to next column */
  }
}


/*
 * Perform the forward DCT on a 2x4 sample block.
 *
 * 2-point FDCT in pass 1 (rows), 4-point in pass 2 (columns).
 */

GLOBAL(void)
jpeg_fdct_2x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
{
  INT32 tmp0, tmp1;
  INT32 tmp10, tmp11;
  DCTELEM *dataptr;
  JSAMPROW elemptr;
  int ctr;
  SHIFT_TEMPS

  /* Pre-zero output coefficient block. */
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);

  /* Pass 1: process rows. */
  /* Note results are scaled up by sqrt(8) compared to a true DCT. */
  /* We must also scale the output by (8/2)*(8/4) = 2**3, which we add here. */

  dataptr = data;
  for (ctr = 0; ctr < 4; ctr++) {
    elemptr = sample_data[ctr] + start_col;

    /* Even part */

    tmp0 = GETJSAMPLE(elemptr[0]);
    tmp1 = GETJSAMPLE(elemptr[1]);

    /* Apply unsigned->signed conversion */
    dataptr[0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 3);

    /* Odd part */

    dataptr[1] = (DCTELEM) ((tmp0 - tmp1) << 3);

    dataptr += DCTSIZE;		/* advance pointer to next row */
  }

  /* Pass 2: process columns.
   * We leave the results scaled up by an overall factor of 8.
   * 4-point FDCT kernel,
   * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
   */

  dataptr = data;
  for (ctr = 0; ctr < 2; ctr++) {
    /* Even part */

    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3];
    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];

    tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
    tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];

    dataptr[DCTSIZE*0] = (DCTELEM) (tmp0 + tmp1);
    dataptr[DCTSIZE*2] = (DCTELEM) (tmp0 - tmp1);

    /* Odd part */

    tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
    /* Add fudge factor here for final descale. */
    tmp0 += ONE << (CONST_BITS-1);

    dataptr[DCTSIZE*1] = (DCTELEM)
      RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
		  CONST_BITS);
    dataptr[DCTSIZE*3] = (DCTELEM)
      RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
		  CONST_BITS);

    dataptr++;			/* advance pointer to next column */
  }
}


/*
 * Perform the forward DCT on a 1x2 sample block.
 *
 * 1-point FDCT in pass 1 (rows), 2-point in pass 2 (columns).
 */

GLOBAL(void)
jpeg_fdct_1x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
{
  INT32 tmp0, tmp1;

  /* Pre-zero output coefficient block. */
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);

  tmp0 = GETJSAMPLE(sample_data[0][start_col]);
  tmp1 = GETJSAMPLE(sample_data[1][start_col]);

  /* We leave the results scaled up by an overall factor of 8.
   * We must also scale the output by (8/1)*(8/2) = 2**5.
   */

  /* Even part */
  /* Apply unsigned->signed conversion */
  data[DCTSIZE*0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 5);

  /* Odd part */
  data[DCTSIZE*1] = (DCTELEM) ((tmp0 - tmp1) << 5);
}

#endif /* DCT_SCALING_SUPPORTED */
#endif /* DCT_ISLOW_SUPPORTED */

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)

Written By

Eduardo Tucci

Software Developer

Brazil

I live in Matão, a small city in Brazil. I studied as Programmer in a College for Software Development in Database.
After finishing the College I have been working with java, c# and Computer Graphics with searches for OpenGL.

OAG Library (OpenGL) Part 1 - Setting Up the Library for an MFC Application

License

Comments and Discussions