From: John Koleszar Date: Wed, 7 Nov 2012 00:59:01 +0000 (-0800) Subject: Rough merge of master into experimental X-Git-Tag: v1.3.0~1217^2~124 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=7b8dfcb5a2cfb01ee7a6009d945d06559b564d06;p=libvpx Rough merge of master into experimental Creates a merge between the master and experimental branches. Fixes a number of conflicts in the build system to allow *either* VP8 or VP9 to be built. Specifically either: $ configure --disable-vp9 $ configure --disable-vp8 --disable-unit-tests VP9 still exports its symbols and files as VP8, so that will be resolved in the next commit. Unit tests are broken in VP9, but this isn't a new issue. They are fixed upstream on origin/experimental as of this writing, but rebasing this merge proved difficult, so will tackle that in a second merge commit. Change-Id: I2b7d852c18efd58d1ebc621b8041fe0260442c21 --- 7b8dfcb5a2cfb01ee7a6009d945d06559b564d06 diff --cc configure index 638d0df1e,b3c5fe90d..bd3bf1641 --- a/configure +++ b/configure @@@ -31,7 -33,7 +33,8 @@@ Advanced options ${toggle_debug_libs} in/exclude debug version of libraries ${toggle_md5} support for output of checksum data ${toggle_static_msvcrt} use static MSVCRT (VS builds only) + ${toggle_vp8} VP8 codec support + ${toggle_vp9} VP9 codec support ${toggle_internal_stats} output of encoder internal stats for debug, if supported (encoders) ${toggle_mem_tracker} track memory usage ${toggle_postproc} postprocessing @@@ -158,20 -171,24 +172,29 @@@ enable optimization enable fast_unaligned #allow unaligned accesses, if supported by hw enable md5 enable spatial_resampling + enable multithread enable os_support + enable temporal_denoising [ -d ${source_path}/../include ] && enable alt_tree_layout - for d in vp9; do -for d in vp8; do ++for d in vp8 vp9; do [ -d ${source_path}/${d} ] && disable alt_tree_layout; done if ! 
enabled alt_tree_layout; then # development environment + [ -d ${source_path}/vp8 ] && CODECS="${CODECS} vp8_encoder vp8_decoder" +[ -d ${source_path}/vp9 ] && CODECS="${CODECS} vp9_encoder vp9_decoder" else # customer environment - [ -f ${source_path}/../include/vpx/vp8cx.h ] && CODECS="${CODECS} vp9_encoder" - [ -f ${source_path}/../include/vpx/vp8dx.h ] && CODECS="${CODECS} vp9_decoder" + [ -f ${source_path}/../include/vpx/vp8cx.h ] && CODECS="${CODECS} vp8_encoder" + [ -f ${source_path}/../include/vpx/vp8dx.h ] && CODECS="${CODECS} vp8_decoder" ++[ -f ${source_path}/../include/vpx/vp9cx.h ] && CODECS="${CODECS} vp9_encoder" ++[ -f ${source_path}/../include/vpx/vp9dx.h ] && CODECS="${CODECS} vp9_decoder" + [ -f ${source_path}/../include/vpx/vp8cx.h ] || disable vp8_encoder + [ -f ${source_path}/../include/vpx/vp8dx.h ] || disable vp8_decoder ++[ -f ${source_path}/../include/vpx/vp9cx.h ] || disable vp9_encoder ++[ -f ${source_path}/../include/vpx/vp9dx.h ] || disable vp9_decoder [ -f ${source_path}/../lib/*/*mt.lib ] && soft_enable static_msvcrt fi @@@ -265,8 -274,8 +291,10 @@@ CONFIG_LIST= postproc_visualizer os_support unit_tests + multi_res_encoding + temporal_denoising + experimental + ${EXPERIMENT_LIST} " CMDLINE_SELECT=" extra_warnings @@@ -306,7 -320,8 +339,9 @@@ small postproc_visualizer unit_tests + multi_res_encoding + temporal_denoising + experimental " process_cmdline() { @@@ -512,8 -537,10 +569,11 @@@ process_toolchain() check_add_cflags -Wpointer-arith check_add_cflags -Wtype-limits check_add_cflags -Wcast-qual - check_add_cflags -Wundef + check_add_cflags -Wvla + check_add_cflags -Wimplicit-function-declaration + check_add_cflags -Wuninitialized + check_add_cflags -Wunused-variable + check_add_cflags -Wunused-but-set-variable enabled extra_warnings || check_add_cflags -Wno-unused-function fi diff --cc examples.mk index 74fb68156,90913e67e..0d4b4d5a9 --- a/examples.mk +++ b/examples.mk @@@ -97,10 -114,10 +114,12 @@@ vp8_multi_resolution_encoder.DESCRIPTIO # We should not link to math library (libm) on RVCT # when building for bare-metal targets ifeq ($(CONFIG_OS_SUPPORT), yes) + CODEC_EXTRA_LIBS-$(CONFIG_VP8) += m +CODEC_EXTRA_LIBS-$(CONFIG_VP9) += m else ifeq ($(CONFIG_GCC), yes) + CODEC_EXTRA_LIBS-$(CONFIG_VP8) += m + CODEC_EXTRA_LIBS-$(CONFIG_VP9) += m endif endif # @@@ -117,8 -134,8 +136,10 @@@ ifeq ($(HAVE_ALT_TREE_LAYOUT),yes INC_PATH := $(SRC_PATH_BARE)/../include else LIB_PATH-yes += $(if $(BUILD_PFX),$(BUILD_PFX),.) 
+ INC_PATH-$(CONFIG_VP8_DECODER) += $(SRC_PATH_BARE)/vp8 + INC_PATH-$(CONFIG_VP8_ENCODER) += $(SRC_PATH_BARE)/vp8 + INC_PATH-$(CONFIG_VP9_DECODER) += $(SRC_PATH_BARE)/vp9 + INC_PATH-$(CONFIG_VP9_ENCODER) += $(SRC_PATH_BARE)/vp9 LIB_PATH := $(call enabled,LIB_PATH) INC_PATH := $(call enabled,INC_PATH) endif diff --cc libmkv/EbmlIDs.h index c6c4a6960,e3ce5856f..4920bf9ba --- a/libmkv/EbmlIDs.h +++ b/libmkv/EbmlIDs.h @@@ -10,221 -10,222 +10,221 @@@ #ifndef MKV_DEFS_HPP #define MKV_DEFS_HPP 1 - // Commenting out values not available in webm, but available in matroska + /* Commenting out values not available in webm, but available in matroska */ -enum mkv -{ - EBML = 0x1A45DFA3, - EBMLVersion = 0x4286, - EBMLReadVersion = 0x42F7, - EBMLMaxIDLength = 0x42F2, - EBMLMaxSizeLength = 0x42F3, - DocType = 0x4282, - DocTypeVersion = 0x4287, - DocTypeReadVersion = 0x4285, +enum mkv { + EBML = 0x1A45DFA3, + EBMLVersion = 0x4286, + EBMLReadVersion = 0x42F7, + EBMLMaxIDLength = 0x42F2, + EBMLMaxSizeLength = 0x42F3, + DocType = 0x4282, + DocTypeVersion = 0x4287, + DocTypeReadVersion = 0x4285, - // CRC_32 = 0xBF, + /* CRC_32 = 0xBF, */ - Void = 0xEC, - SignatureSlot = 0x1B538667, - SignatureAlgo = 0x7E8A, - SignatureHash = 0x7E9A, - SignaturePublicKey = 0x7EA5, - Signature = 0x7EB5, - SignatureElements = 0x7E5B, - SignatureElementList = 0x7E7B, - SignedElement = 0x6532, - /* segment */ - Segment = 0x18538067, - /* Meta Seek Information */ - SeekHead = 0x114D9B74, - Seek = 0x4DBB, - SeekID = 0x53AB, - SeekPosition = 0x53AC, - /* Segment Information */ - Info = 0x1549A966, + Void = 0xEC, + SignatureSlot = 0x1B538667, + SignatureAlgo = 0x7E8A, + SignatureHash = 0x7E9A, + SignaturePublicKey = 0x7EA5, + Signature = 0x7EB5, + SignatureElements = 0x7E5B, + SignatureElementList = 0x7E7B, + SignedElement = 0x6532, - // segment ++ /* segment */ + Segment = 0x18538067, - // Meta Seek Information ++ /* Meta Seek Information */ + SeekHead = 0x114D9B74, + Seek = 0x4DBB, + SeekID = 0x53AB, + SeekPosition = 0x53AC, - // Segment Information ++ /* Segment Information */ + Info = 0x1549A966, - // SegmentUID = 0x73A4, - // SegmentFilename = 0x7384, - // PrevUID = 0x3CB923, - // PrevFilename = 0x3C83AB, - // NextUID = 0x3EB923, - // NextFilename = 0x3E83BB, - // SegmentFamily = 0x4444, - // ChapterTranslate = 0x6924, - // ChapterTranslateEditionUID = 0x69FC, - // ChapterTranslateCodec = 0x69BF, - // ChapterTranslateID = 0x69A5, + /* SegmentUID = 0x73A4, */ + /* SegmentFilename = 0x7384, */ + /* PrevUID = 0x3CB923, */ + /* PrevFilename = 0x3C83AB, */ + /* NextUID = 0x3EB923, */ + /* NextFilename = 0x3E83BB, */ + /* SegmentFamily = 0x4444, */ + /* ChapterTranslate = 0x6924, */ + /* ChapterTranslateEditionUID = 0x69FC, */ + /* ChapterTranslateCodec = 0x69BF, */ + /* ChapterTranslateID = 0x69A5, */ - TimecodeScale = 0x2AD7B1, - Segment_Duration = 0x4489, - DateUTC = 0x4461, + TimecodeScale = 0x2AD7B1, + Segment_Duration = 0x4489, + DateUTC = 0x4461, - // Title = 0x7BA9, + /* Title = 0x7BA9, */ - MuxingApp = 0x4D80, - WritingApp = 0x5741, - /* Cluster */ - Cluster = 0x1F43B675, - Timecode = 0xE7, + MuxingApp = 0x4D80, + WritingApp = 0x5741, - // Cluster ++ /* Cluster */ + Cluster = 0x1F43B675, + Timecode = 0xE7, - // SilentTracks = 0x5854, - // SilentTrackNumber = 0x58D7, - // Position = 0xA7, + /* SilentTracks = 0x5854, */ + /* SilentTrackNumber = 0x58D7, */ + /* Position = 0xA7, */ - PrevSize = 0xAB, - BlockGroup = 0xA0, - Block = 0xA1, + PrevSize = 0xAB, + BlockGroup = 0xA0, + Block = 0xA1, - // BlockVirtual = 0xA2, - // 
BlockAdditions = 0x75A1, - // BlockMore = 0xA6, - // BlockAddID = 0xEE, - // BlockAdditional = 0xA5, + /* BlockVirtual = 0xA2, */ + /* BlockAdditions = 0x75A1, */ + /* BlockMore = 0xA6, */ + /* BlockAddID = 0xEE, */ + /* BlockAdditional = 0xA5, */ - BlockDuration = 0x9B, + BlockDuration = 0x9B, - // ReferencePriority = 0xFA, + /* ReferencePriority = 0xFA, */ - ReferenceBlock = 0xFB, + ReferenceBlock = 0xFB, - // ReferenceVirtual = 0xFD, - // CodecState = 0xA4, - // Slices = 0x8E, - // TimeSlice = 0xE8, + /* ReferenceVirtual = 0xFD, */ + /* CodecState = 0xA4, */ + /* Slices = 0x8E, */ + /* TimeSlice = 0xE8, */ - LaceNumber = 0xCC, + LaceNumber = 0xCC, - // FrameNumber = 0xCD, - // BlockAdditionID = 0xCB, - // MkvDelay = 0xCE, - // Cluster_Duration = 0xCF, + /* FrameNumber = 0xCD, */ + /* BlockAdditionID = 0xCB, */ + /* MkvDelay = 0xCE, */ + /* Cluster_Duration = 0xCF, */ - SimpleBlock = 0xA3, + SimpleBlock = 0xA3, - // EncryptedBlock = 0xAF, - // Track + /* EncryptedBlock = 0xAF, */ - /* Track */ - Tracks = 0x1654AE6B, - TrackEntry = 0xAE, - TrackNumber = 0xD7, - TrackUID = 0x73C5, - TrackType = 0x83, - FlagEnabled = 0xB9, - FlagDefault = 0x88, - FlagForced = 0x55AA, - FlagLacing = 0x9C, ++ /* Track */ + Tracks = 0x1654AE6B, + TrackEntry = 0xAE, + TrackNumber = 0xD7, + TrackUID = 0x73C5, + TrackType = 0x83, + FlagEnabled = 0xB9, + FlagDefault = 0x88, + FlagForced = 0x55AA, + FlagLacing = 0x9C, - // MinCache = 0x6DE7, - // MaxCache = 0x6DF8, + /* MinCache = 0x6DE7, */ + /* MaxCache = 0x6DF8, */ - DefaultDuration = 0x23E383, + DefaultDuration = 0x23E383, - // TrackTimecodeScale = 0x23314F, - // TrackOffset = 0x537F, - // MaxBlockAdditionID = 0x55EE, + /* TrackTimecodeScale = 0x23314F, */ + /* TrackOffset = 0x537F, */ + /* MaxBlockAdditionID = 0x55EE, */ - Name = 0x536E, - Language = 0x22B59C, - CodecID = 0x86, - CodecPrivate = 0x63A2, - CodecName = 0x258688, + Name = 0x536E, + Language = 0x22B59C, + CodecID = 0x86, + CodecPrivate = 0x63A2, + CodecName = 0x258688, - // AttachmentLink = 0x7446, - // CodecSettings = 0x3A9697, - // CodecInfoURL = 0x3B4040, - // CodecDownloadURL = 0x26B240, - // CodecDecodeAll = 0xAA, - // TrackOverlay = 0x6FAB, - // TrackTranslate = 0x6624, - // TrackTranslateEditionUID = 0x66FC, - // TrackTranslateCodec = 0x66BF, - // TrackTranslateTrackID = 0x66A5, - // video + /* AttachmentLink = 0x7446, */ + /* CodecSettings = 0x3A9697, */ + /* CodecInfoURL = 0x3B4040, */ + /* CodecDownloadURL = 0x26B240, */ + /* CodecDecodeAll = 0xAA, */ + /* TrackOverlay = 0x6FAB, */ + /* TrackTranslate = 0x6624, */ + /* TrackTranslateEditionUID = 0x66FC, */ + /* TrackTranslateCodec = 0x66BF, */ + /* TrackTranslateTrackID = 0x66A5, */ - /* video */ - Video = 0xE0, - FlagInterlaced = 0x9A, - StereoMode = 0x53B8, - PixelWidth = 0xB0, - PixelHeight = 0xBA, - PixelCropBottom = 0x54AA, - PixelCropTop = 0x54BB, - PixelCropLeft = 0x54CC, - PixelCropRight = 0x54DD, - DisplayWidth = 0x54B0, - DisplayHeight = 0x54BA, - DisplayUnit = 0x54B2, - AspectRatioType = 0x54B3, ++ /* video */ + Video = 0xE0, + FlagInterlaced = 0x9A, + StereoMode = 0x53B8, + PixelWidth = 0xB0, + PixelHeight = 0xBA, + PixelCropBottom = 0x54AA, + PixelCropTop = 0x54BB, + PixelCropLeft = 0x54CC, + PixelCropRight = 0x54DD, + DisplayWidth = 0x54B0, + DisplayHeight = 0x54BA, + DisplayUnit = 0x54B2, + AspectRatioType = 0x54B3, - // ColourSpace = 0x2EB524, - // GammaValue = 0x2FB523, + /* ColourSpace = 0x2EB524, */ + /* GammaValue = 0x2FB523, */ - FrameRate = 0x2383E3, - /* end video */ - /* audio */ - Audio = 0xE1, - SamplingFrequency 
= 0xB5, - OutputSamplingFrequency = 0x78B5, - Channels = 0x9F, + FrameRate = 0x2383E3, - // end video - // audio ++ /* end video */ ++ /* audio */ + Audio = 0xE1, + SamplingFrequency = 0xB5, + OutputSamplingFrequency = 0x78B5, + Channels = 0x9F, - // ChannelPositions = 0x7D7B, + /* ChannelPositions = 0x7D7B, */ - BitDepth = 0x6264, - /* end audio */ - /* content encoding */ + BitDepth = 0x6264, - // end audio - // content encoding - // ContentEncodings = 0x6d80, - // ContentEncoding = 0x6240, - // ContentEncodingOrder = 0x5031, - // ContentEncodingScope = 0x5032, - // ContentEncodingType = 0x5033, - // ContentCompression = 0x5034, - // ContentCompAlgo = 0x4254, - // ContentCompSettings = 0x4255, - // ContentEncryption = 0x5035, - // ContentEncAlgo = 0x47e1, - // ContentEncKeyID = 0x47e2, - // ContentSignature = 0x47e3, - // ContentSigKeyID = 0x47e4, - // ContentSigAlgo = 0x47e5, - // ContentSigHashAlgo = 0x47e6, - // end content encoding - // Cueing Data ++ /* end audio */ ++ /* content encoding */ + /* ContentEncodings = 0x6d80, */ + /* ContentEncoding = 0x6240, */ + /* ContentEncodingOrder = 0x5031, */ + /* ContentEncodingScope = 0x5032, */ + /* ContentEncodingType = 0x5033, */ + /* ContentCompression = 0x5034, */ + /* ContentCompAlgo = 0x4254, */ + /* ContentCompSettings = 0x4255, */ + /* ContentEncryption = 0x5035, */ + /* ContentEncAlgo = 0x47e1, */ + /* ContentEncKeyID = 0x47e2, */ + /* ContentSignature = 0x47e3, */ + /* ContentSigKeyID = 0x47e4, */ + /* ContentSigAlgo = 0x47e5, */ + /* ContentSigHashAlgo = 0x47e6, */ - /* end content encoding */ - /* Cueing Data */ - Cues = 0x1C53BB6B, - CuePoint = 0xBB, - CueTime = 0xB3, - CueTrackPositions = 0xB7, - CueTrack = 0xF7, - CueClusterPosition = 0xF1, - CueBlockNumber = 0x5378 ++ /* end content encoding */ ++ /* Cueing Data */ + Cues = 0x1C53BB6B, + CuePoint = 0xBB, + CueTime = 0xB3, + CueTrackPositions = 0xB7, + CueTrack = 0xF7, + CueClusterPosition = 0xF1, - CueBlockNumber = 0x5378, - // CueCodecState = 0xEA, - // CueReference = 0xDB, - // CueRefTime = 0x96, - // CueRefCluster = 0x97, - // CueRefNumber = 0x535F, - // CueRefCodecState = 0xEB, - // Attachment - // Attachments = 0x1941A469, - // AttachedFile = 0x61A7, - // FileDescription = 0x467E, - // FileName = 0x466E, - // FileMimeType = 0x4660, - // FileData = 0x465C, - // FileUID = 0x46AE, - // FileReferral = 0x4675, - // Chapters - // Chapters = 0x1043A770, - // EditionEntry = 0x45B9, - // EditionUID = 0x45BC, - // EditionFlagHidden = 0x45BD, - // EditionFlagDefault = 0x45DB, - // EditionFlagOrdered = 0x45DD, - // ChapterAtom = 0xB6, - // ChapterUID = 0x73C4, - // ChapterTimeStart = 0x91, - // ChapterTimeEnd = 0x92, - // ChapterFlagHidden = 0x98, - // ChapterFlagEnabled = 0x4598, - // ChapterSegmentUID = 0x6E67, - // ChapterSegmentEditionUID = 0x6EBC, - // ChapterPhysicalEquiv = 0x63C3, - // ChapterTrack = 0x8F, - // ChapterTrackNumber = 0x89, - // ChapterDisplay = 0x80, - // ChapString = 0x85, - // ChapLanguage = 0x437C, - // ChapCountry = 0x437E, - // ChapProcess = 0x6944, - // ChapProcessCodecID = 0x6955, - // ChapProcessPrivate = 0x450D, - // ChapProcessCommand = 0x6911, - // ChapProcessTime = 0x6922, - // ChapProcessData = 0x6933, - // Tagging - // Tags = 0x1254C367, - // Tag = 0x7373, - // Targets = 0x63C0, - // TargetTypeValue = 0x68CA, - // TargetType = 0x63CA, - // Tagging_TrackUID = 0x63C5, - // Tagging_EditionUID = 0x63C9, - // Tagging_ChapterUID = 0x63C4, - // AttachmentUID = 0x63C6, - // SimpleTag = 0x67C8, - // TagName = 0x45A3, - // TagLanguage = 0x447A, - // 
TagDefault = 0x4484, - // TagString = 0x4487, - // TagBinary = 0x4485, ++ CueBlockNumber = 0x5378 + /* CueCodecState = 0xEA, */ + /* CueReference = 0xDB, */ + /* CueRefTime = 0x96, */ + /* CueRefCluster = 0x97, */ + /* CueRefNumber = 0x535F, */ + /* CueRefCodecState = 0xEB, */ - /* Attachment */ ++ /* Attachment */ + /* Attachments = 0x1941A469, */ + /* AttachedFile = 0x61A7, */ + /* FileDescription = 0x467E, */ + /* FileName = 0x466E, */ + /* FileMimeType = 0x4660, */ + /* FileData = 0x465C, */ + /* FileUID = 0x46AE, */ + /* FileReferral = 0x4675, */ - /* Chapters */ ++ /* Chapters */ + /* Chapters = 0x1043A770, */ + /* EditionEntry = 0x45B9, */ + /* EditionUID = 0x45BC, */ + /* EditionFlagHidden = 0x45BD, */ + /* EditionFlagDefault = 0x45DB, */ + /* EditionFlagOrdered = 0x45DD, */ + /* ChapterAtom = 0xB6, */ + /* ChapterUID = 0x73C4, */ + /* ChapterTimeStart = 0x91, */ + /* ChapterTimeEnd = 0x92, */ + /* ChapterFlagHidden = 0x98, */ + /* ChapterFlagEnabled = 0x4598, */ + /* ChapterSegmentUID = 0x6E67, */ + /* ChapterSegmentEditionUID = 0x6EBC, */ + /* ChapterPhysicalEquiv = 0x63C3, */ + /* ChapterTrack = 0x8F, */ + /* ChapterTrackNumber = 0x89, */ + /* ChapterDisplay = 0x80, */ + /* ChapString = 0x85, */ + /* ChapLanguage = 0x437C, */ + /* ChapCountry = 0x437E, */ + /* ChapProcess = 0x6944, */ + /* ChapProcessCodecID = 0x6955, */ + /* ChapProcessPrivate = 0x450D, */ + /* ChapProcessCommand = 0x6911, */ + /* ChapProcessTime = 0x6922, */ + /* ChapProcessData = 0x6933, */ - /* Tagging */ ++ /* Tagging */ + /* Tags = 0x1254C367, */ + /* Tag = 0x7373, */ + /* Targets = 0x63C0, */ + /* TargetTypeValue = 0x68CA, */ + /* TargetType = 0x63CA, */ + /* Tagging_TrackUID = 0x63C5, */ + /* Tagging_EditionUID = 0x63C9, */ + /* Tagging_ChapterUID = 0x63C4, */ + /* AttachmentUID = 0x63C6, */ + /* SimpleTag = 0x67C8, */ + /* TagName = 0x45A3, */ + /* TagLanguage = 0x447A, */ + /* TagDefault = 0x4484, */ + /* TagString = 0x4487, */ + /* TagBinary = 0x4485, */ }; #endif diff --cc libmkv/EbmlWriter.c index 69039e1bf,d70f06e43..5fc5ed2a3 --- a/libmkv/EbmlWriter.c +++ b/libmkv/EbmlWriter.c @@@ -18,136 -18,158 +18,140 @@@ #define LITERALU64(n) n##LLU #endif - void Ebml_WriteLen(EbmlGlobal *glob, long long val) { - // TODO check and make sure we are not > than 0x0100000000000000LLU - unsigned char size = 8; // size in bytes to output - unsigned long long minVal = LITERALU64(0x00000000000000ff); // mask to compare for byte size -void Ebml_WriteLen(EbmlGlobal *glob, int64_t val) -{ - /* TODO check and make sure we are not > than 0x0100000000000000LLU */ - unsigned char size = 8; /* size in bytes to output */ ++void Ebml_WriteLen(EbmlGlobal *glob, int64_t val) { ++ /* TODO check and make sure we are not > than 0x0100000000000000LLU */ ++ unsigned char size = 8; /* size in bytes to output */ + - /* mask to compare for byte size */ - int64_t minVal = 0xff; ++ /* mask to compare for byte size */ ++ int64_t minVal = 0xff; - for (size = 1; size < 8; size ++) - { - if (val < minVal) - break; + for (size = 1; size < 8; size ++) { + if (val < minVal) + break; - minVal = (minVal << 7); - } + minVal = (minVal << 7); + } - val |= (LITERALU64(0x000000000000080) << ((size - 1) * 7)); - val |= (((uint64_t)0x80) << ((size - 1) * 7)); ++ val |= (((uint64_t)0x80) << ((size - 1) * 7)); - Ebml_Serialize(glob, (void *) &val, sizeof(val), size); + Ebml_Serialize(glob, (void *) &val, sizeof(val), size); } -void Ebml_WriteString(EbmlGlobal *glob, const char *str) -{ - const size_t size_ = strlen(str); - const uint64_t size = size_; - 
Ebml_WriteLen(glob, size); - /* TODO: it's not clear from the spec whether the nul terminator - * should be serialized too. For now we omit the null terminator. - */ - Ebml_Write(glob, str, (unsigned long)size); +void Ebml_WriteString(EbmlGlobal *glob, const char *str) { + const size_t size_ = strlen(str); - const unsigned long long size = size_; ++ const uint64_t size = size_; + Ebml_WriteLen(glob, size); - // TODO: it's not clear from the spec whether the nul terminator - // should be serialized too. For now we omit the null terminator. - Ebml_Write(glob, str, size); ++ /* TODO: it's not clear from the spec whether the nul terminator ++ * should be serialized too. For now we omit the null terminator. ++ */ ++ Ebml_Write(glob, str, (unsigned long)size); } -void Ebml_WriteUTF8(EbmlGlobal *glob, const wchar_t *wstr) -{ - const size_t strlen = wcslen(wstr); +void Ebml_WriteUTF8(EbmlGlobal *glob, const wchar_t *wstr) { + const size_t strlen = wcslen(wstr); - // TODO: it's not clear from the spec whether the nul terminator - // should be serialized too. For now we include it. - const unsigned long long size = strlen; - /* TODO: it's not clear from the spec whether the nul terminator - * should be serialized too. For now we include it. - */ - const uint64_t size = strlen; ++ /* TODO: it's not clear from the spec whether the nul terminator ++ * should be serialized too. For now we include it. ++ */ ++ const uint64_t size = strlen; - Ebml_WriteLen(glob, size); - Ebml_Write(glob, wstr, (unsigned long)size); + Ebml_WriteLen(glob, size); - Ebml_Write(glob, wstr, size); ++ Ebml_Write(glob, wstr, (unsigned long)size); } -void Ebml_WriteID(EbmlGlobal *glob, unsigned long class_id) -{ - int len; +void Ebml_WriteID(EbmlGlobal *glob, unsigned long class_id) { + int len; - if (class_id >= 0x01000000) - len = 4; - else if (class_id >= 0x00010000) - len = 3; - else if (class_id >= 0x00000100) - len = 2; - else - len = 1; + if (class_id >= 0x01000000) + len = 4; + else if (class_id >= 0x00010000) + len = 3; + else if (class_id >= 0x00000100) + len = 2; + else + len = 1; - Ebml_Serialize(glob, (void *)&class_id, sizeof(class_id), len); + Ebml_Serialize(glob, (void *)&class_id, sizeof(class_id), len); } -void Ebml_SerializeUnsigned64(EbmlGlobal *glob, unsigned long class_id, uint64_t ui) -{ - unsigned char sizeSerialized = 8 | 0x80; - Ebml_WriteID(glob, class_id); - Ebml_Serialize(glob, &sizeSerialized, sizeof(sizeSerialized), 1); - Ebml_Serialize(glob, &ui, sizeof(ui), 8); +void Ebml_SerializeUnsigned64(EbmlGlobal *glob, unsigned long class_id, uint64_t ui) { + unsigned char sizeSerialized = 8 | 0x80; + Ebml_WriteID(glob, class_id); + Ebml_Serialize(glob, &sizeSerialized, sizeof(sizeSerialized), 1); + Ebml_Serialize(glob, &ui, sizeof(ui), 8); } -void Ebml_SerializeUnsigned(EbmlGlobal *glob, unsigned long class_id, unsigned long ui) -{ - unsigned char size = 8; /* size in bytes to output */ - unsigned char sizeSerialized = 0; - unsigned long minVal; - - Ebml_WriteID(glob, class_id); - minVal = 0x7fLU; /* mask to compare for byte size */ +void Ebml_SerializeUnsigned(EbmlGlobal *glob, unsigned long class_id, unsigned long ui) { - unsigned char size = 8; // size in bytes to output ++ unsigned char size = 8; /* size in bytes to output */ + unsigned char sizeSerialized = 0; + unsigned long minVal; - for (size = 1; size < 4; size ++) - { - if (ui < minVal) - { - break; - } + Ebml_WriteID(glob, class_id); - minVal = 0x7fLU; // mask to compare for byte size ++ minVal = 0x7fLU; /* mask to compare for byte size */ - 
minVal <<= 7; + for (size = 1; size < 4; size ++) { + if (ui < minVal) { + break; } - sizeSerialized = 0x80 | size; - Ebml_Serialize(glob, &sizeSerialized, sizeof(sizeSerialized), 1); - Ebml_Serialize(glob, &ui, sizeof(ui), size); + minVal <<= 7; + } + + sizeSerialized = 0x80 | size; + Ebml_Serialize(glob, &sizeSerialized, sizeof(sizeSerialized), 1); + Ebml_Serialize(glob, &ui, sizeof(ui), size); } - // TODO: perhaps this is a poor name for this id serializer helper function + /* TODO: perhaps this is a poor name for this id serializer helper function */ -void Ebml_SerializeBinary(EbmlGlobal *glob, unsigned long class_id, unsigned long bin) -{ - int size; - for (size=4; size > 1; size--) - { - if (bin & 0x000000ff << ((size-1) * 8)) - break; - } - Ebml_WriteID(glob, class_id); - Ebml_WriteLen(glob, size); - Ebml_WriteID(glob, bin); +void Ebml_SerializeBinary(EbmlGlobal *glob, unsigned long class_id, unsigned long bin) { + int size; + for (size = 4; size > 1; size--) { + if (bin & 0x000000ff << ((size - 1) * 8)) + break; + } + Ebml_WriteID(glob, class_id); + Ebml_WriteLen(glob, size); + Ebml_WriteID(glob, bin); } -void Ebml_SerializeFloat(EbmlGlobal *glob, unsigned long class_id, double d) -{ - unsigned char len = 0x88; +void Ebml_SerializeFloat(EbmlGlobal *glob, unsigned long class_id, double d) { + unsigned char len = 0x88; - Ebml_WriteID(glob, class_id); - Ebml_Serialize(glob, &len, sizeof(len), 1); - Ebml_Serialize(glob, &d, sizeof(d), 8); + Ebml_WriteID(glob, class_id); + Ebml_Serialize(glob, &len, sizeof(len), 1); + Ebml_Serialize(glob, &d, sizeof(d), 8); } -void Ebml_WriteSigned16(EbmlGlobal *glob, short val) -{ - signed long out = ((val & 0x003FFFFF) | 0x00200000) << 8; - Ebml_Serialize(glob, &out, sizeof(out), 3); +void Ebml_WriteSigned16(EbmlGlobal *glob, short val) { + signed long out = ((val & 0x003FFFFF) | 0x00200000) << 8; + Ebml_Serialize(glob, &out, sizeof(out), 3); } -void Ebml_SerializeString(EbmlGlobal *glob, unsigned long class_id, const char *s) -{ - Ebml_WriteID(glob, class_id); - Ebml_WriteString(glob, s); +void Ebml_SerializeString(EbmlGlobal *glob, unsigned long class_id, const char *s) { + Ebml_WriteID(glob, class_id); + Ebml_WriteString(glob, s); } -void Ebml_SerializeUTF8(EbmlGlobal *glob, unsigned long class_id, wchar_t *s) -{ - Ebml_WriteID(glob, class_id); - Ebml_WriteUTF8(glob, s); +void Ebml_SerializeUTF8(EbmlGlobal *glob, unsigned long class_id, wchar_t *s) { + Ebml_WriteID(glob, class_id); + Ebml_WriteUTF8(glob, s); } -void Ebml_SerializeData(EbmlGlobal *glob, unsigned long class_id, unsigned char *data, unsigned long data_length) -{ - Ebml_WriteID(glob, class_id); - Ebml_WriteLen(glob, data_length); - Ebml_Write(glob, data, data_length); +void Ebml_SerializeData(EbmlGlobal *glob, unsigned long class_id, unsigned char *data, unsigned long data_length) { + Ebml_WriteID(glob, class_id); + Ebml_WriteLen(glob, data_length); + Ebml_Write(glob, data, data_length); } -void Ebml_WriteVoid(EbmlGlobal *glob, unsigned long vSize) -{ - unsigned char tmp = 0; - unsigned long i = 0; +void Ebml_WriteVoid(EbmlGlobal *glob, unsigned long vSize) { + unsigned char tmp = 0; + unsigned long i = 0; - Ebml_WriteID(glob, 0xEC); - Ebml_WriteLen(glob, vSize); + Ebml_WriteID(glob, 0xEC); + Ebml_WriteLen(glob, vSize); - for (i = 0; i < vSize; i++) - { - Ebml_Write(glob, &tmp, 1); - } + for (i = 0; i < vSize; i++) { + Ebml_Write(glob, &tmp, 1); + } } - // TODO Serialize Date + /* TODO Serialize Date */ diff --cc libs.mk index abb7a8e3d,373c1cd44..9af6a35c7 --- a/libs.mk +++ b/libs.mk 
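
An aside on the EbmlWriter.c hunk above, before the build-system changes
continue: the loop in Ebml_WriteLen() computes a standard EBML
variable-length integer, picking the smallest byte count whose length
marker bit still sits above the payload bits. A minimal standalone C
sketch of that scheme (the helper name and signature are illustrative,
not part of this patch):

    #include <stdint.h>

    /* Illustrative only. Returns val with the EBML length-descriptor bit
     * set and reports in *size how many bytes to serialize (big-endian),
     * mirroring what Ebml_WriteLen() computes before Ebml_Serialize().
     */
    static uint64_t ebml_mark_length(int64_t val, unsigned char *size) {
      int64_t min_val = 0xff;      /* capacity threshold at one byte */
      for (*size = 1; *size < 8; ++*size) {
        if (val < min_val)
          break;
        min_val <<= 7;             /* each extra byte adds 7 payload bits */
      }
      /* The marker sits just above the payload: 1xxxxxxx for one byte,
       * 01xxxxxx xxxxxxxx for two bytes, and so on.
       */
      return (uint64_t)val | ((uint64_t)0x80 << ((*size - 1) * 7));
    }
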
@@@ -17,6 -17,6 +17,34 @@@ els ASM:=.asm endif ++ ++# ++# Calculate platform- and compiler-specific offsets for hand coded assembly ++# ++ifeq ($(filter icc gcc,$(TGT_CC)), $(TGT_CC)) ++OFFSET_PATTERN:='^[a-zA-Z0-9_]* EQU' ++define asm_offsets_template ++$$(BUILD_PFX)$(1): $$(BUILD_PFX)$(2).S ++ @echo " [CREATE] $$@" ++ $$(qexec)LC_ALL=C grep $$(OFFSET_PATTERN) $$< | tr -d '$$$$\#' $$(ADS2GAS) > $$@ ++$$(BUILD_PFX)$(2).S: $(2) ++CLEAN-OBJS += $$(BUILD_PFX)$(1) $(2).S ++endef ++else ++ ifeq ($(filter rvct,$(TGT_CC)), $(TGT_CC)) ++define asm_offsets_template ++$$(BUILD_PFX)$(1): obj_int_extract ++$$(BUILD_PFX)$(1): $$(BUILD_PFX)$(2).o ++ @echo " [CREATE] $$@" ++ $$(qexec)./obj_int_extract rvds $$< $$(ADS2GAS) > $$@ ++OBJS-yes += $$(BUILD_PFX)$(2).o ++CLEAN-OBJS += $$(BUILD_PFX)$(1) ++$$(filter %$$(ASM).o,$$(OBJS-yes)): $$(BUILD_PFX)$(1) ++endef ++endif # rvct ++endif # !gcc ++ ++ CODEC_SRCS-yes += CHANGELOG CODEC_SRCS-yes += libs.mk @@@ -29,32 -29,32 +57,64 @@@ CODEC_SRCS-yes += $(addprefix vpx_mem/, include $(SRC_PATH_BARE)/vpx_scale/vpx_scale.mk CODEC_SRCS-yes += $(addprefix vpx_scale/,$(call enabled,SCALE_SRCS)) ++ifneq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),) ++ VP8_PREFIX=vp8/ ++ include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8_common.mk ++endif + + ifeq ($(CONFIG_VP8_ENCODER),yes) - VP8_PREFIX=vp8/ + include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8cx.mk + CODEC_SRCS-yes += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_CX_SRCS)) + CODEC_EXPORTS-yes += $(addprefix $(VP8_PREFIX),$(VP8_CX_EXPORTS)) + CODEC_SRCS-yes += $(VP8_PREFIX)vp8cx.mk vpx/vp8.h vpx/vp8cx.h - CODEC_SRCS-$(ARCH_ARM) += $(VP8_PREFIX)vp8cx_arm.mk ++ CODEC_SRCS-$(ARCH_ARM) += $(VP8_PREFIX)vp88cx_arm.mk + INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8cx.h + INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP8_PREFIX)/% + CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h + CODEC_DOC_SECTIONS += vp8 vp8_encoder + endif + + ifeq ($(CONFIG_VP8_DECODER),yes) - VP8_PREFIX=vp8/ + include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8dx.mk + CODEC_SRCS-yes += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_DX_SRCS)) + CODEC_EXPORTS-yes += $(addprefix $(VP8_PREFIX),$(VP8_DX_EXPORTS)) + CODEC_SRCS-yes += $(VP8_PREFIX)vp8dx.mk vpx/vp8.h vpx/vp8dx.h + INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8dx.h + INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP8_PREFIX)/% + CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8dx.h + CODEC_DOC_SECTIONS += vp8 vp8_decoder + endif + ++ifneq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_DECODER),) ++ VP9_PREFIX=vp9/ ++ include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9_common.mk ++endif + +ifeq ($(CONFIG_VP9_ENCODER),yes) + VP9_PREFIX=vp9/ + include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9cx.mk + CODEC_SRCS-yes += $(addprefix $(VP9_PREFIX),$(call enabled,VP9_CX_SRCS)) + CODEC_EXPORTS-yes += $(addprefix $(VP9_PREFIX),$(VP9_CX_EXPORTS)) - CODEC_SRCS-yes += $(VP9_PREFIX)vp9cx.mk vpx/vp8.h vpx/vp8cx.h vpx/vp8e.h ++ CODEC_SRCS-yes += $(VP9_PREFIX)vp9cx.mk vpx/vp8.h vpx/vp8cx.h + CODEC_SRCS-$(ARCH_ARM) += $(VP9_PREFIX)vp98cx_arm.mk - INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8e.h include/vpx/vp8cx.h ++ INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8cx.h + INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP9_PREFIX)/% + CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h + CODEC_DOC_SECTIONS += vp9 vp9_encoder +endif + +ifeq ($(CONFIG_VP9_DECODER),yes) + VP9_PREFIX=vp9/ + include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9dx.mk + CODEC_SRCS-yes += $(addprefix $(VP9_PREFIX),$(call enabled,VP9_DX_SRCS)) + CODEC_EXPORTS-yes += $(addprefix $(VP9_PREFIX),$(VP9_DX_EXPORTS)) + 
CODEC_SRCS-yes += $(VP9_PREFIX)vp9dx.mk vpx/vp8.h vpx/vp8dx.h + INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8dx.h + INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP9_PREFIX)/% + CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8dx.h + CODEC_DOC_SECTIONS += vp9 vp9_decoder +endif + ifeq ($(CONFIG_ENCODERS),yes) CODEC_DOC_SECTIONS += encoder @@@ -172,9 -170,9 +231,9 @@@ CLEAN-OBJS += vpx.de vpx.vcproj: $(CODEC_SRCS) vpx.def @echo " [CREATE] $@" - $(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \ + $(qexec)$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \ - --lib \ - --target=$(TOOLCHAIN) \ + $(if $(CONFIG_SHARED),--dll,--lib) \ + --target=$(TOOLCHAIN) \ $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \ --name=vpx \ --proj-guid=DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74 \ @@@ -263,7 -242,8 +322,8 @@@ vpx.pc: config.mk libs.m $(qexec)echo 'Version: $(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_PATCH)' >> $@ $(qexec)echo 'Requires:' >> $@ $(qexec)echo 'Conflicts:' >> $@ - $(qexec)echo 'Libs: -L$${libdir} -lvpx' >> $@ + $(qexec)echo 'Libs: -L$${libdir} -lvpx -lm' >> $@ + $(qexec)echo 'Libs.private: -lm -lpthread' >> $@ $(qexec)echo 'Cflags: -I$${includedir}' >> $@ INSTALL-LIBS-yes += $(LIBSUBDIR)/pkgconfig/vpx.pc INSTALL_MAPS += $(LIBSUBDIR)/pkgconfig/%.pc %.pc @@@ -298,57 -278,57 +358,6 @@@ endi $(filter %.s.o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm --# --# Calculate platform- and compiler-specific offsets for hand coded assembly --# -- --OFFSET_PATTERN:='^[a-zA-Z0-9_]* EQU' -- --ifeq ($(filter icc gcc,$(TGT_CC)), $(TGT_CC)) - $(BUILD_PFX)asm_com_offsets.asm: $(BUILD_PFX)$(VP9_PREFIX)common/asm_com_offsets.c.S - $(BUILD_PFX)asm_com_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S -- @echo " [CREATE] $@" -- $(qexec)LC_ALL=C grep $(OFFSET_PATTERN) $< | tr -d '$$\#' $(ADS2GAS) > $@ - $(BUILD_PFX)$(VP9_PREFIX)common/asm_com_offsets.c.S: $(VP9_PREFIX)common/asm_com_offsets.c - CLEAN-OBJS += $(BUILD_PFX)asm_com_offsets.asm $(BUILD_PFX)$(VP9_PREFIX)common/asm_com_offsets.c.S - $(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S: $(VP8_PREFIX)common/asm_com_offsets.c - CLEAN-OBJS += $(BUILD_PFX)asm_com_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S -- - $(BUILD_PFX)asm_enc_offsets.asm: $(BUILD_PFX)$(VP9_PREFIX)encoder/asm_enc_offsets.c.S - $(BUILD_PFX)asm_enc_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S -- @echo " [CREATE] $@" -- $(qexec)LC_ALL=C grep $(OFFSET_PATTERN) $< | tr -d '$$\#' $(ADS2GAS) > $@ - $(BUILD_PFX)$(VP9_PREFIX)encoder/asm_enc_offsets.c.S: $(VP9_PREFIX)encoder/asm_enc_offsets.c - CLEAN-OBJS += $(BUILD_PFX)asm_enc_offsets.asm $(BUILD_PFX)$(VP9_PREFIX)encoder/asm_enc_offsets.c.S - $(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S: $(VP8_PREFIX)encoder/asm_enc_offsets.c - CLEAN-OBJS += $(BUILD_PFX)asm_enc_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S -- - $(BUILD_PFX)asm_dec_offsets.asm: $(BUILD_PFX)$(VP9_PREFIX)decoder/asm_dec_offsets.c.S - $(BUILD_PFX)asm_dec_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S -- @echo " [CREATE] $@" -- $(qexec)LC_ALL=C grep $(OFFSET_PATTERN) $< | tr -d '$$\#' $(ADS2GAS) > $@ - $(BUILD_PFX)$(VP9_PREFIX)decoder/asm_dec_offsets.c.S: $(VP9_PREFIX)decoder/asm_dec_offsets.c - CLEAN-OBJS += $(BUILD_PFX)asm_dec_offsets.asm $(BUILD_PFX)$(VP9_PREFIX)decoder/asm_dec_offsets.c.S - $(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S: $(VP8_PREFIX)decoder/asm_dec_offsets.c - CLEAN-OBJS += $(BUILD_PFX)asm_dec_offsets.asm 
$(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S --else -- ifeq ($(filter rvct,$(TGT_CC)), $(TGT_CC)) -- asm_com_offsets.asm: obj_int_extract - asm_com_offsets.asm: $(VP9_PREFIX)common/asm_com_offsets.c.o - asm_com_offsets.asm: $(VP8_PREFIX)common/asm_com_offsets.c.o -- @echo " [CREATE] $@" -- $(qexec)./obj_int_extract rvds $< $(ADS2GAS) > $@ - OBJS-yes += $(VP9_PREFIX)common/asm_com_offsets.c.o - OBJS-yes += $(VP8_PREFIX)common/asm_com_offsets.c.o -- CLEAN-OBJS += asm_com_offsets.asm -- $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_com_offsets.asm -- -- asm_enc_offsets.asm: obj_int_extract - asm_enc_offsets.asm: $(VP9_PREFIX)encoder/asm_enc_offsets.c.o - asm_enc_offsets.asm: $(VP8_PREFIX)encoder/asm_enc_offsets.c.o -- @echo " [CREATE] $@" -- $(qexec)./obj_int_extract rvds $< $(ADS2GAS) > $@ - OBJS-yes += $(VP9_PREFIX)encoder/asm_enc_offsets.c.o - OBJS-yes += $(VP8_PREFIX)encoder/asm_enc_offsets.c.o -- CLEAN-OBJS += asm_enc_offsets.asm -- $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_enc_offsets.asm -- -- asm_dec_offsets.asm: obj_int_extract - asm_dec_offsets.asm: $(VP9_PREFIX)decoder/asm_dec_offsets.c.o - asm_dec_offsets.asm: $(VP8_PREFIX)decoder/asm_dec_offsets.c.o -- @echo " [CREATE] $@" -- $(qexec)./obj_int_extract rvds $< $(ADS2GAS) > $@ - OBJS-yes += $(VP9_PREFIX)decoder/asm_dec_offsets.c.o - OBJS-yes += $(VP8_PREFIX)decoder/asm_dec_offsets.c.o -- CLEAN-OBJS += asm_dec_offsets.asm -- $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_dec_offsets.asm -- endif --endif $(shell $(SRC_PATH_BARE)/build/make/version.sh "$(SRC_PATH_BARE)" $(BUILD_PFX)vpx_version.h) CLEAN-OBJS += $(BUILD_PFX)vpx_version.h @@@ -356,15 -336,14 +365,15 @@@ # # Rule to generate runtime cpu detection files # - $(OBJS-yes:.o=.d): vpx_rtcd.h - vpx_rtcd.h: $(sort $(filter %rtcd_defs.sh,$(CODEC_SRCS))) + $(BUILD_PFX)vpx_rtcd.h: $(SRC_PATH_BARE)/$(sort $(filter %rtcd_defs.sh,$(CODEC_SRCS))) @echo " [CREATE] $@" $(qexec)$(SRC_PATH_BARE)/build/make/rtcd.sh --arch=$(TGT_ISA) \ - --sym=vpx_rtcd \ - --config=$(target)$(if $(FAT_ARCHS),,-$(TOOLCHAIN)).mk \ - $(RTCD_OPTIONS) $^ > $@ + --sym=vpx_rtcd \ + --config=$(target)$(if $(FAT_ARCHS),,-$(TOOLCHAIN)).mk \ + $(RTCD_OPTIONS) $^ > $@ CLEAN-OBJS += $(BUILD_PFX)vpx_rtcd.h ++ CODEC_DOC_SRCS += vpx/vpx_codec.h \ vpx/vpx_decoder.h \ vpx/vpx_encoder.h \ diff --cc test/test.mk index 129c18862,7a11a2793..3c6d44c97 --- a/test/test.mk +++ b/test/test.mk @@@ -1,10 -1,178 +1,186 @@@ - LIBVPX_TEST_SRCS-yes += test.mk LIBVPX_TEST_SRCS-yes += acm_random.h - LIBVPX_TEST_SRCS-yes += boolcoder_test.cc - LIBVPX_TEST_SRCS-yes += dct16x16_test.cc - LIBVPX_TEST_SRCS-yes += fdct4x4_test.cc - LIBVPX_TEST_SRCS-yes += fdct8x8_test.cc - LIBVPX_TEST_SRCS-yes += idct8x8_test.cc + LIBVPX_TEST_SRCS-yes += test.mk LIBVPX_TEST_SRCS-yes += test_libvpx.cc + LIBVPX_TEST_SRCS-yes += util.h + LIBVPX_TEST_SRCS-yes += video_source.h + + ## + ## BLACK BOX TESTS + ## + ## Black box tests only use the public API. 
+ ## + LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += altref_test.cc + LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += config_test.cc + LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += cq_test.cc + LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += datarate_test.cc + LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += encode_test_driver.cc + LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += encode_test_driver.h + LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += error_resilience_test.cc + LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += i420_video_source.h + LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc + LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += resize_test.cc + + LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += ../md5_utils.h ../md5_utils.c + LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += decode_test_driver.cc + LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += decode_test_driver.h + LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += ivf_video_source.h + LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += test_vector_test.cc + ## + ## WHITE BOX TESTS + ## + ## Whitebox tests invoke functions not exposed via the public API. Certain + ## shared library builds don't make these functions accessible. + ## + ifeq ($(CONFIG_SHARED),) + + # These tests require both the encoder and decoder to be built. + ifeq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),yesyes) + LIBVPX_TEST_SRCS-yes += boolcoder_test.cc + endif + -LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += fdct4x4_test.cc + LIBVPX_TEST_SRCS-yes += idctllm_test.cc + LIBVPX_TEST_SRCS-yes += intrapred_test.cc + LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC) += pp_filter_test.cc + LIBVPX_TEST_SRCS-yes += sad_test.cc + LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += set_roi.cc + LIBVPX_TEST_SRCS-yes += sixtap_predict_test.cc + LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += subtract_test.cc ++LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc ++ ++# VP9 tests ++LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc ++LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc ++LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct16x16_test.cc ++ifneq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_DECODER),) ++LIBVPX_TEST_SRCS-yes += idct8x8_test.cc ++endif + + endif + - LIBVPX_TEST_DATA-yes += hantro_collage_w352h288.yuv + ## + ## TEST DATA + ## + LIBVPX_TEST_DATA-$(CONFIG_VP8_ENCODER) += hantro_collage_w352h288.yuv + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-002.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-003.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-004.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-005.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-006.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-007.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-008.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-009.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-010.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-011.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-012.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-013.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-014.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-015.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-016.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-017.ivf + 
LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-018.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1400.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1411.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1416.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1417.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1402.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1412.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1418.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1424.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-01.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-02.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-03.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-04.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1401.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1403.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1407.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1408.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1409.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1410.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1413.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1414.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1415.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1425.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1426.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1427.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1432.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1435.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1436.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1437.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1441.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1442.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1404.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1405.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1406.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1428.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1429.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1430.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1431.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1433.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1434.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1438.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1439.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1440.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1443.ivf + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-002.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-003.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-004.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-005.ivf.md5 + 
LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-006.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-007.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-008.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-009.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-010.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-011.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-012.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-013.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-014.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-015.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-016.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-017.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-018.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1400.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1411.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1416.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1417.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1402.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1412.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1418.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1424.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1401.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1403.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1407.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1408.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1409.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1410.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1413.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1414.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1415.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1425.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1426.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1427.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1432.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1435.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1436.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1437.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1441.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1442.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-01.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-02.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-03.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-04.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1404.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1405.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1406.ivf.md5 + 
LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1428.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1429.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1430.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1431.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1433.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1434.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1438.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1439.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1440.ivf.md5 + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1443.ivf.md5 diff --cc test/test_libvpx.cc index 924aa2e2e,cfd5d2807..2b9b0c21f --- a/test/test_libvpx.cc +++ b/test/test_libvpx.cc @@@ -26,18 -26,18 +26,18 @@@ int main(int argc, char **argv) ::testing::InitGoogleTest(&argc, argv); #if ARCH_X86 || ARCH_X86_64 - int simd_caps = x86_simd_caps(); + const int simd_caps = x86_simd_caps(); - if(!(simd_caps & HAS_MMX)) + if (!(simd_caps & HAS_MMX)) append_gtest_filter(":-MMX/*"); - if(!(simd_caps & HAS_SSE)) + if (!(simd_caps & HAS_SSE)) append_gtest_filter(":-SSE/*"); - if(!(simd_caps & HAS_SSE2)) + if (!(simd_caps & HAS_SSE2)) append_gtest_filter(":-SSE2/*"); - if(!(simd_caps & HAS_SSE3)) + if (!(simd_caps & HAS_SSE3)) append_gtest_filter(":-SSE3/*"); - if(!(simd_caps & HAS_SSSE3)) + if (!(simd_caps & HAS_SSSE3)) append_gtest_filter(":-SSSE3/*"); - if(!(simd_caps & HAS_SSE4_1)) + if (!(simd_caps & HAS_SSE4_1)) append_gtest_filter(":-SSE4_1/*"); #endif diff --cc test/vp8_fdct4x4_test.cc index 000000000,000000000..619b23d22 new file mode 100644 --- /dev/null +++ b/test/vp8_fdct4x4_test.cc @@@ -1,0 -1,0 +1,169 @@@ ++/* ++* Copyright (c) 2012 The WebM project authors. All Rights Reserved. ++* ++* Use of this source code is governed by a BSD-style license ++* that can be found in the LICENSE file in the root of the source ++* tree. An additional intellectual property rights grant can be found ++* in the file PATENTS. All contributing project authors may ++* be found in the AUTHORS file in the root of the source tree. 
++*/ ++ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++extern "C" { ++#include "vpx_rtcd.h" ++} ++ ++#include "test/acm_random.h" ++#include "third_party/googletest/src/include/gtest/gtest.h" ++#include "vpx/vpx_integer.h" ++ ++ ++namespace { ++ ++const int cospi8sqrt2minus1 = 20091; ++const int sinpi8sqrt2 = 35468; ++ ++void reference_idct4x4(const int16_t *input, int16_t *output) { ++ const int16_t *ip = input; ++ int16_t *op = output; ++ ++ for (int i = 0; i < 4; ++i) { ++ const int a1 = ip[0] + ip[8]; ++ const int b1 = ip[0] - ip[8]; ++ const int temp1 = (ip[4] * sinpi8sqrt2) >> 16; ++ const int temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16); ++ const int c1 = temp1 - temp2; ++ const int temp3 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16); ++ const int temp4 = (ip[12] * sinpi8sqrt2) >> 16; ++ const int d1 = temp3 + temp4; ++ op[0] = a1 + d1; ++ op[12] = a1 - d1; ++ op[4] = b1 + c1; ++ op[8] = b1 - c1; ++ ++ip; ++ ++op; ++ } ++ ip = output; ++ op = output; ++ for (int i = 0; i < 4; ++i) { ++ const int a1 = ip[0] + ip[2]; ++ const int b1 = ip[0] - ip[2]; ++ const int temp1 = (ip[1] * sinpi8sqrt2) >> 16; ++ const int temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16); ++ const int c1 = temp1 - temp2; ++ const int temp3 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16); ++ const int temp4 = (ip[3] * sinpi8sqrt2) >> 16; ++ const int d1 = temp3 + temp4; ++ op[0] = (a1 + d1 + 4) >> 3; ++ op[3] = (a1 - d1 + 4) >> 3; ++ op[1] = (b1 + c1 + 4) >> 3; ++ op[2] = (b1 - c1 + 4) >> 3; ++ ip += 4; ++ op += 4; ++ } ++} ++ ++using libvpx_test::ACMRandom; ++ ++TEST(Vp8FdctTest, SignBiasCheck) { ++ ACMRandom rnd(ACMRandom::DeterministicSeed()); ++ int16_t test_input_block[16]; ++ int16_t test_output_block[16]; ++ const int pitch = 8; ++ int count_sign_block[16][2]; ++ const int count_test_block = 1000000; ++ ++ memset(count_sign_block, 0, sizeof(count_sign_block)); ++ ++ for (int i = 0; i < count_test_block; ++i) { ++ // Initialize a test block with input range [-255, 255]. ++ for (int j = 0; j < 16; ++j) ++ test_input_block[j] = rnd.Rand8() - rnd.Rand8(); ++ ++ vp8_short_fdct4x4_c(test_input_block, test_output_block, pitch); ++ ++ for (int j = 0; j < 16; ++j) { ++ if (test_output_block[j] < 0) ++ ++count_sign_block[j][0]; ++ else if (test_output_block[j] > 0) ++ ++count_sign_block[j][1]; ++ } ++ } ++ ++ bool bias_acceptable = true; ++ for (int j = 0; j < 16; ++j) ++ bias_acceptable = bias_acceptable && ++ (abs(count_sign_block[j][0] - count_sign_block[j][1]) < 10000); ++ ++ EXPECT_EQ(true, bias_acceptable) ++ << "Error: 4x4 FDCT has a sign bias > 1% for input range [-255, 255]"; ++ ++ memset(count_sign_block, 0, sizeof(count_sign_block)); ++ ++ for (int i = 0; i < count_test_block; ++i) { ++ // Initialize a test block with input range [-15, 15]. 
++ for (int j = 0; j < 16; ++j) ++ test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4); ++ ++ vp8_short_fdct4x4_c(test_input_block, test_output_block, pitch); ++ ++ for (int j = 0; j < 16; ++j) { ++ if (test_output_block[j] < 0) ++ ++count_sign_block[j][0]; ++ else if (test_output_block[j] > 0) ++ ++count_sign_block[j][1]; ++ } ++ } ++ ++ bias_acceptable = true; ++ for (int j = 0; j < 16; ++j) ++ bias_acceptable = bias_acceptable && ++ (abs(count_sign_block[j][0] - count_sign_block[j][1]) < 100000); ++ ++ EXPECT_EQ(true, bias_acceptable) ++ << "Error: 4x4 FDCT has a sign bias > 10% for input range [-15, 15]"; ++}; ++ ++TEST(Vp8FdctTest, RoundTripErrorCheck) { ++ ACMRandom rnd(ACMRandom::DeterministicSeed()); ++ int max_error = 0; ++ double total_error = 0; ++ const int count_test_block = 1000000; ++ for (int i = 0; i < count_test_block; ++i) { ++ int16_t test_input_block[16]; ++ int16_t test_temp_block[16]; ++ int16_t test_output_block[16]; ++ ++ // Initialize a test block with input range [-255, 255]. ++ for (int j = 0; j < 16; ++j) ++ test_input_block[j] = rnd.Rand8() - rnd.Rand8(); ++ ++ const int pitch = 8; ++ vp8_short_fdct4x4_c(test_input_block, test_temp_block, pitch); ++ reference_idct4x4(test_temp_block, test_output_block); ++ ++ for (int j = 0; j < 16; ++j) { ++ const int diff = test_input_block[j] - test_output_block[j]; ++ const int error = diff * diff; ++ if (max_error < error) ++ max_error = error; ++ total_error += error; ++ } ++ } ++ ++ EXPECT_GE(1, max_error ) ++ << "Error: FDCT/IDCT has an individual roundtrip error > 1"; ++ ++ EXPECT_GE(count_test_block, total_error) ++ << "Error: FDCT/IDCT has average roundtrip error > 1 per block"; ++}; ++ ++} // namespace diff --cc tools/all_builds.py index d1f0c80c0,000000000..78581d9f0 mode 100755,000000..100755 --- a/tools/all_builds.py +++ b/tools/all_builds.py @@@ -1,72 -1,0 +1,72 @@@ +#!/usr/bin/python + +import getopt +import subprocess +import sys + +LONG_OPTIONS = ["shard=", "shards="] - BASE_COMMAND = "./configure --enable-internal-stats --enable-experimental" ++BASE_COMMAND = "./configure --disable-vp8 --disable-unit-tests --enable-internal-stats --enable-experimental" + +def RunCommand(command): + run = subprocess.Popen(command, shell=True) + output = run.communicate() + if run.returncode: + print "Non-zero return code: " + str(run.returncode) + " => exiting!" 
+ sys.exit(1) + +def list_of_experiments(): + experiments = [] + configure_file = open("configure") + list_start = False + for line in configure_file.read().split("\n"): + if line == 'EXPERIMENT_LIST="': + list_start = True + elif line == '"': + list_start = False + elif list_start: + currently_broken = ["csm"] + experiment = line[4:] + if experiment not in currently_broken: + experiments.append(experiment) + return experiments + +def main(argv): + # Parse arguments + options = {"--shard": 0, "--shards": 1} + if "--" in argv: + opt_end_index = argv.index("--") + else: + opt_end_index = len(argv) + try: + o, _ = getopt.getopt(argv[1:opt_end_index], None, LONG_OPTIONS) + except getopt.GetoptError, err: + print str(err) + print "Usage: %s [--shard= --shards=] -- [configure flag ...]"%argv[0] + sys.exit(2) + + options.update(o) + extra_args = argv[opt_end_index + 1:] + + # Shard experiment list + shard = int(options["--shard"]) + shards = int(options["--shards"]) + experiments = list_of_experiments() + base_command = " ".join([BASE_COMMAND] + extra_args) + configs = [base_command] + configs += ["%s --enable-%s" % (base_command, e) for e in experiments] + my_configs = zip(configs, range(len(configs))) + my_configs = filter(lambda x: x[1] % shards == shard, my_configs) + my_configs = [e[0] for e in my_configs] + + # Run configs for this shard + for config in my_configs: + test_build(config) + +def test_build(configure_command): + print "\033[34m\033[47mTesting %s\033[0m" % (configure_command) + RunCommand(configure_command) + RunCommand("make clean") + RunCommand("make") + +if __name__ == "__main__": + main(sys.argv) diff --cc vp8/encoder/arm/armv5te/boolhuff_armv5te.asm index 000000000,a644a004c..4abe818f1 mode 000000,100644..100644 --- a/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm +++ b/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm @@@ -1,0 -1,310 +1,310 @@@ + ; + ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. + ; + ; Use of this source code is governed by a BSD-style license + ; that can be found in the LICENSE file in the root of the source + ; tree. An additional intellectual property rights grant can be found + ; in the file PATENTS. All contributing project authors may + ; be found in the AUTHORS file in the root of the source tree. 
+ ; + + + EXPORT |vp8_start_encode| + EXPORT |vp8_encode_bool| + EXPORT |vp8_stop_encode| + EXPORT |vp8_encode_value| + IMPORT |vp8_validate_buffer_arm| + - INCLUDE asm_enc_offsets.asm ++ INCLUDE vp8_asm_enc_offsets.asm + + ARM + REQUIRE8 + PRESERVE8 + + AREA |.text|, CODE, READONLY + + ; macro for validating write buffer position + ; needs vp8_writer in r0 + ; start shall not be in r1 + MACRO + VALIDATE_POS $start, $pos + push {r0-r3, r12, lr} ; rest of regs are preserved by subroutine call + ldr r2, [r0, #vp8_writer_buffer_end] + ldr r3, [r0, #vp8_writer_error] + mov r1, $pos + mov r0, $start + bl vp8_validate_buffer_arm + pop {r0-r3, r12, lr} + MEND + + ; r0 BOOL_CODER *br + ; r1 unsigned char *source + ; r2 unsigned char *source_end + |vp8_start_encode| PROC + str r2, [r0, #vp8_writer_buffer_end] + mov r12, #0 + mov r3, #255 + mvn r2, #23 + str r12, [r0, #vp8_writer_lowvalue] + str r3, [r0, #vp8_writer_range] + str r2, [r0, #vp8_writer_count] + str r12, [r0, #vp8_writer_pos] + str r1, [r0, #vp8_writer_buffer] + bx lr + ENDP + + ; r0 BOOL_CODER *br + ; r1 int bit + ; r2 int probability + |vp8_encode_bool| PROC + push {r4-r10, lr} + + mov r4, r2 + + ldr r2, [r0, #vp8_writer_lowvalue] + ldr r5, [r0, #vp8_writer_range] + ldr r3, [r0, #vp8_writer_count] + + sub r7, r5, #1 ; range-1 + + cmp r1, #0 + mul r6, r4, r7 ; ((range-1) * probability) + + mov r7, #1 + add r4, r7, r6, lsr #8 ; 1 + (((range-1) * probability) >> 8) + + addne r2, r2, r4 ; if (bit) lowvalue += split + subne r4, r5, r4 ; if (bit) range = range-split + + ; Counting the leading zeros is used to normalize range. + clz r6, r4 + sub r6, r6, #24 ; shift + + ; Flag is set on the sum of count. This flag is used later + ; to determine if count >= 0 + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi token_count_lt_zero ; if(count >= 0) + + sub r6, r6, r3 ; offset = shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl token_high_bit_not_set + + ldr r4, [r0, #vp8_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos-1 + b token_zero_while_start + token_zero_while_loop + mov r9, #0 + strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- + token_zero_while_start + cmp r4, #0 + ldrge r7, [r0, #vp8_writer_buffer] + ldrb r1, [r7, r4] + cmpge r1, #0xff + beq token_zero_while_loop + + ldr r7, [r0, #vp8_writer_buffer] + ldrb r9, [r7, r4] ; w->buffer[x] + add r9, r9, #1 + strb r9, [r7, r4] ; w->buffer[x] + 1 + token_high_bit_not_set + rsb r4, r6, #24 ; 24-offset + ldr r9, [r0, #vp8_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp8_writer_pos] ; w->pos + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r1, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r1, [r0, #vp8_writer_pos] + sub r3, r3, #8 ; count -= 8 + + VALIDATE_POS r9, r1 ; validate_buffer at pos + + strb r7, [r9, r4] ; w->buffer[w->pos++] + + token_count_lt_zero + lsl r2, r2, r6 ; lowvalue <<= shift + + str r2, [r0, #vp8_writer_lowvalue] + str r5, [r0, #vp8_writer_range] + str r3, [r0, #vp8_writer_count] + pop {r4-r10, pc} + ENDP + + ; r0 BOOL_CODER *br + |vp8_stop_encode| PROC + push {r4-r10, lr} + + ldr r2, [r0, #vp8_writer_lowvalue] + ldr r5, [r0, #vp8_writer_range] + ldr r3, [r0, #vp8_writer_count] + + mov r10, #32 + + stop_encode_loop + sub r7, r5, #1 ; range-1 + + mov r4, r7, lsl #7 ; ((range-1) * 128) + + mov r7, #1 + add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8) + + ; Counting the leading zeros is used 
to normalize range. + clz r6, r4 + sub r6, r6, #24 ; shift + + ; Flag is set on the sum of count. This flag is used later + ; to determine if count >= 0 + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi token_count_lt_zero_se ; if(count >= 0) + + sub r6, r6, r3 ; offset = shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl token_high_bit_not_set_se + + ldr r4, [r0, #vp8_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos-1 + b token_zero_while_start_se + token_zero_while_loop_se + mov r9, #0 + strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- + token_zero_while_start_se + cmp r4, #0 + ldrge r7, [r0, #vp8_writer_buffer] + ldrb r1, [r7, r4] + cmpge r1, #0xff + beq token_zero_while_loop_se + + ldr r7, [r0, #vp8_writer_buffer] + ldrb r9, [r7, r4] ; w->buffer[x] + add r9, r9, #1 + strb r9, [r7, r4] ; w->buffer[x] + 1 + token_high_bit_not_set_se + rsb r4, r6, #24 ; 24-offset + ldr r9, [r0, #vp8_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp8_writer_pos] ; w->pos + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r1, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r1, [r0, #vp8_writer_pos] + sub r3, r3, #8 ; count -= 8 + + VALIDATE_POS r9, r1 ; validate_buffer at pos + + strb r7, [r9, r4] ; w->buffer[w->pos++] + + token_count_lt_zero_se + lsl r2, r2, r6 ; lowvalue <<= shift + + subs r10, r10, #1 + bne stop_encode_loop + + str r2, [r0, #vp8_writer_lowvalue] + str r5, [r0, #vp8_writer_range] + str r3, [r0, #vp8_writer_count] + pop {r4-r10, pc} + + ENDP + + ; r0 BOOL_CODER *br + ; r1 int data + ; r2 int bits + |vp8_encode_value| PROC + push {r4-r12, lr} + + mov r10, r2 + + ldr r2, [r0, #vp8_writer_lowvalue] + ldr r5, [r0, #vp8_writer_range] + ldr r3, [r0, #vp8_writer_count] + + rsb r4, r10, #32 ; 32-n + + ; v is kept in r1 during the token pack loop + lsl r1, r1, r4 ; r1 = v << 32 - n + + encode_value_loop + sub r7, r5, #1 ; range-1 + + ; Decisions are made based on the bit value shifted + ; off of v, so set a flag here based on this. + ; This value is referred to as "bb" + lsls r1, r1, #1 ; bit = v >> n + mov r4, r7, lsl #7 ; ((range-1) * 128) + + mov r7, #1 + add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8) + + addcs r2, r2, r4 ; if (bit) lowvalue += split + subcs r4, r5, r4 ; if (bit) range = range-split + + ; Counting the leading zeros is used to normalize range. + clz r6, r4 + sub r6, r6, #24 ; shift + + ; Flag is set on the sum of count. 
This flag is used later + ; to determine if count >= 0 + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi token_count_lt_zero_ev ; if(count >= 0) + + sub r6, r6, r3 ; offset = shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl token_high_bit_not_set_ev + + ldr r4, [r0, #vp8_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos-1 + b token_zero_while_start_ev + token_zero_while_loop_ev + mov r9, #0 + strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- + token_zero_while_start_ev + cmp r4, #0 + ldrge r7, [r0, #vp8_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq token_zero_while_loop_ev + + ldr r7, [r0, #vp8_writer_buffer] + ldrb r9, [r7, r4] ; w->buffer[x] + add r9, r9, #1 + strb r9, [r7, r4] ; w->buffer[x] + 1 + token_high_bit_not_set_ev + rsb r4, r6, #24 ; 24-offset + ldr r9, [r0, #vp8_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp8_writer_pos] ; w->pos + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp8_writer_pos] + sub r3, r3, #8 ; count -= 8 + + VALIDATE_POS r9, r11 ; validate_buffer at pos + + strb r7, [r9, r4] ; w->buffer[w->pos++] + + token_count_lt_zero_ev + lsl r2, r2, r6 ; lowvalue <<= shift + + subs r10, r10, #1 + bne encode_value_loop + + str r2, [r0, #vp8_writer_lowvalue] + str r5, [r0, #vp8_writer_range] + str r3, [r0, #vp8_writer_count] + pop {r4-r12, pc} + ENDP + + END diff --cc vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm index 000000000,a1cd46704..90a141c62 mode 000000,100644..100644 --- a/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm +++ b/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm @@@ -1,0 -1,317 +1,317 @@@ + ; + ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. + ; + ; Use of this source code is governed by a BSD-style license + ; that can be found in the LICENSE file in the root of the source + ; tree. An additional intellectual property rights grant can be found + ; in the file PATENTS. All contributing project authors may + ; be found in the AUTHORS file in the root of the source tree. 
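For readers following the register comments, the four boolhuff routines above are straight ports of the C boolean encoder. The per-bit step corresponds to roughly the following C sketch: field names mirror the vp8_writer_* offsets used in the assembly, __builtin_clz stands in for the clz instruction, and the VALIDATE_POS bounds check is elided. This is an illustration, not the upstream C.

    static void encode_bool_sketch(vp8_writer *w, int bit, int probability)
    {
        unsigned int split = 1 + (((w->range - 1) * probability) >> 8);
        int shift;

        if (bit) {
            w->lowvalue += split;                  /* addne/addcs path       */
            w->range    -= split;                  /* subne/subcs path       */
        } else {
            w->range = split;
        }

        shift      = __builtin_clz(w->range) - 24; /* clz r6, r4             */
        w->range <<= shift;
        w->count  += shift;

        if (w->count >= 0) {                       /* a whole byte is ready  */
            int offset = shift - w->count;

            if ((w->lowvalue << (offset - 1)) & 0x80000000) {
                int x = w->pos - 1;                /* propagate the carry    */
                while (x >= 0 && w->buffer[x] == 0xff) {
                    w->buffer[x] = 0;
                    x--;
                }
                w->buffer[x] += 1;
            }

            /* VALIDATE_POS checks pos against buffer_end here */
            w->buffer[w->pos++] = (w->lowvalue >> (24 - offset)) & 0xff;

            w->lowvalue <<= offset;
            shift         = w->count;
            w->lowvalue  &= 0xffffff;              /* bic ... #0xff000000    */
            w->count     -= 8;
        }

        w->lowvalue <<= shift;
    }

vp8_stop_encode is this step run 32 times with probability 128 (the lsl #7 of range-1), and vp8_encode_value feeds its argument MSB-first by pre-shifting v left by 32-n and peeling off one bit per iteration.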
+ ; + + + EXPORT |vp8cx_pack_tokens_armv5| + IMPORT |vp8_validate_buffer_arm| + - INCLUDE asm_enc_offsets.asm ++ INCLUDE vp8_asm_enc_offsets.asm + + ARM + REQUIRE8 + PRESERVE8 + + AREA |.text|, CODE, READONLY + + + ; macro for validating write buffer position + ; needs vp8_writer in r0 + ; start shall not be in r1 + MACRO + VALIDATE_POS $start, $pos + push {r0-r3, r12, lr} ; rest of regs are preserved by subroutine call + ldr r2, [r0, #vp8_writer_buffer_end] + ldr r3, [r0, #vp8_writer_error] + mov r1, $pos + mov r0, $start + bl vp8_validate_buffer_arm + pop {r0-r3, r12, lr} + MEND + + + ; r0 vp8_writer *w + ; r1 const TOKENEXTRA *p + ; r2 int xcount + ; r3 vp8_coef_encodings + ; s0 vp8_extra_bits + ; s1 vp8_coef_tree + |vp8cx_pack_tokens_armv5| PROC + push {r4-r12, lr} + sub sp, sp, #16 + + ; Add size of xcount * sizeof (TOKENEXTRA) to get stop + ; sizeof (TOKENEXTRA) is 8 + add r2, r1, r2, lsl #3 ; stop = p + xcount*sizeof(TOKENEXTRA) + str r2, [sp, #0] + str r3, [sp, #8] ; save vp8_coef_encodings + ldr r2, [r0, #vp8_writer_lowvalue] + ldr r5, [r0, #vp8_writer_range] + ldr r3, [r0, #vp8_writer_count] + b check_p_lt_stop + + while_p_lt_stop + ldrb r6, [r1, #tokenextra_token] ; t + ldr r4, [sp, #8] ; vp8_coef_encodings + mov lr, #0 + add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t + ldr r9, [r1, #tokenextra_context_tree] ; pp + + ldrb r7, [r1, #tokenextra_skip_eob_node] + + ldr r6, [r4, #vp8_token_value] ; v + ldr r8, [r4, #vp8_token_len] ; n + + ; vp8 specific skip_eob_node + cmp r7, #0 + movne lr, #2 ; i = 2 + subne r8, r8, #1 ; --n + + rsb r4, r8, #32 ; 32-n + ldr r10, [sp, #60] ; vp8_coef_tree + + ; v is kept in r12 during the token pack loop + lsl r12, r6, r4 ; r12 = v << 32 - n + + ; loop start + token_loop + ldrb r4, [r9, lr, asr #1] ; pp [i>>1] + sub r7, r5, #1 ; range-1 + + ; Decisions are made based on the bit value shifted + ; off of v, so set a flag here based on this. + ; This value is referred to as "bb" + lsls r12, r12, #1 ; bb = v >> n + mul r6, r4, r7 ; ((range-1) * pp[i>>1]) + + ; bb can only be 0 or 1. So only execute this statement + ; if bb == 1, otherwise it will act like i + 0 + addcs lr, lr, #1 ; i + bb + + mov r7, #1 + ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb] + add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8) + + addcs r2, r2, r4 ; if (bb) lowvalue += split + subcs r4, r5, r4 ; if (bb) range = range-split + + ; Counting the leading zeros is used to normalize range. + clz r6, r4 + sub r6, r6, #24 ; shift + + ; Flag is set on the sum of count. 
This flag is used later + ; to determine if count >= 0 + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi token_count_lt_zero ; if(count >= 0) + + sub r6, r6, r3 ; offset = shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl token_high_bit_not_set + + ldr r4, [r0, #vp8_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos-1 + b token_zero_while_start + token_zero_while_loop + mov r10, #0 + strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- + token_zero_while_start + cmp r4, #0 + ldrge r7, [r0, #vp8_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq token_zero_while_loop + + ldr r7, [r0, #vp8_writer_buffer] + ldrb r10, [r7, r4] ; w->buffer[x] + add r10, r10, #1 + strb r10, [r7, r4] ; w->buffer[x] + 1 + token_high_bit_not_set + rsb r4, r6, #24 ; 24-offset + ldr r10, [r0, #vp8_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp8_writer_pos] ; w->pos + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp8_writer_pos] + sub r3, r3, #8 ; count -= 8 + + VALIDATE_POS r10, r11 ; validate_buffer at pos + + strb r7, [r10, r4] ; w->buffer[w->pos++] + + ; r10 is used earlier in the loop, but r10 is used as + ; temp variable here. So after r10 is used, reload + ; vp8_coef_tree_dcd into r10 + ldr r10, [sp, #60] ; vp8_coef_tree + + token_count_lt_zero + lsl r2, r2, r6 ; lowvalue <<= shift + + subs r8, r8, #1 ; --n + bne token_loop + + ldrb r6, [r1, #tokenextra_token] ; t + ldr r7, [sp, #56] ; vp8_extra_bits + ; Add t * sizeof (vp8_extra_bit_struct) to get the desired + ; element. Here vp8_extra_bit_struct == 16 + add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t + + ldr r4, [r12, #vp8_extra_bit_struct_base_val] + cmp r4, #0 + beq skip_extra_bits + + ; if( b->base_val) + ldr r8, [r12, #vp8_extra_bit_struct_len] ; L + ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra + cmp r8, #0 ; if( L) + beq no_extra_bits + + ldr r9, [r12, #vp8_extra_bit_struct_prob] + asr r7, lr, #1 ; v=e>>1 + + ldr r10, [r12, #vp8_extra_bit_struct_tree] + str r10, [sp, #4] ; b->tree + + rsb r4, r8, #32 + lsl r12, r7, r4 + + mov lr, #0 ; i = 0 + + extra_bits_loop + ldrb r4, [r9, lr, asr #1] ; pp[i>>1] + sub r7, r5, #1 ; range-1 + lsls r12, r12, #1 ; v >> n + mul r6, r4, r7 ; (range-1) * pp[i>>1] + addcs lr, lr, #1 ; i + bb + + mov r7, #1 + ldrsb lr, [r10, lr] ; i = b->tree[i+bb] + add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8) + + addcs r2, r2, r4 ; if (bb) lowvalue += split + subcs r4, r5, r4 ; if (bb) range = range-split + + clz r6, r4 + sub r6, r6, #24 + + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi extra_count_lt_zero ; if(count >= 0) + + sub r6, r6, r3 ; offset= shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl extra_high_bit_not_set + + ldr r4, [r0, #vp8_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos - 1 + b extra_zero_while_start + extra_zero_while_loop + mov r10, #0 + strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- + extra_zero_while_start + cmp r4, #0 + ldrge r7, [r0, #vp8_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq extra_zero_while_loop + + ldr r7, [r0, #vp8_writer_buffer] + ldrb r10, [r7, r4] + add r10, r10, #1 + strb r10, [r7, r4] + extra_high_bit_not_set + rsb r4, r6, #24 ; 24-offset + ldr r10, [r0, #vp8_writer_buffer] + lsr r7, r2, r4 ; 
lowvalue >> (24-offset) + ldr r4, [r0, #vp8_writer_pos] + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp8_writer_pos] + sub r3, r3, #8 ; count -= 8 + + VALIDATE_POS r10, r11 ; validate_buffer at pos + + strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset)) + ldr r10, [sp, #4] ; b->tree + extra_count_lt_zero + lsl r2, r2, r6 + + subs r8, r8, #1 ; --n + bne extra_bits_loop ; while (n) + + no_extra_bits + ldr lr, [r1, #4] ; e = p->Extra + add r4, r5, #1 ; range + 1 + tst lr, #1 + lsr r4, r4, #1 ; split = (range + 1) >> 1 + addne r2, r2, r4 ; lowvalue += split + subne r4, r5, r4 ; range = range-split + tst r2, #0x80000000 ; lowvalue & 0x80000000 + lsl r5, r4, #1 ; range <<= 1 + beq end_high_bit_not_set + + ldr r4, [r0, #vp8_writer_pos] + mov r7, #0 + sub r4, r4, #1 + b end_zero_while_start + end_zero_while_loop + strb r7, [r6, r4] + sub r4, r4, #1 ; x-- + end_zero_while_start + cmp r4, #0 + ldrge r6, [r0, #vp8_writer_buffer] + ldrb r12, [r6, r4] + cmpge r12, #0xff + beq end_zero_while_loop + + ldr r6, [r0, #vp8_writer_buffer] + ldrb r7, [r6, r4] + add r7, r7, #1 + strb r7, [r6, r4] + end_high_bit_not_set + adds r3, r3, #1 ; ++count + lsl r2, r2, #1 ; lowvalue <<= 1 + bne end_count_zero + + ldr r4, [r0, #vp8_writer_pos] + mvn r3, #7 + ldr r7, [r0, #vp8_writer_buffer] + lsr r6, r2, #24 ; lowvalue >> 24 + add r12, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r12, [r0, #vp8_writer_pos] + + VALIDATE_POS r7, r12 ; validate_buffer at pos + + strb r6, [r7, r4] + end_count_zero + skip_extra_bits + add r1, r1, #TOKENEXTRA_SZ ; ++p + check_p_lt_stop + ldr r4, [sp, #0] ; stop + cmp r1, r4 ; while( p < stop) + bcc while_p_lt_stop + + str r2, [r0, #vp8_writer_lowvalue] + str r5, [r0, #vp8_writer_range] + str r3, [r0, #vp8_writer_count] + add sp, sp, #16 + pop {r4-r12, pc} + ENDP + + END diff --cc vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm index 000000000,1fa5e6c22..3a8d17a81 mode 000000,100644..100644 --- a/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm +++ b/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm @@@ -1,0 -1,352 +1,352 @@@ + ; + ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. + ; + ; Use of this source code is governed by a BSD-style license + ; that can be found in the LICENSE file in the root of the source + ; tree. An additional intellectual property rights grant can be found + ; in the file PATENTS. All contributing project authors may + ; be found in the AUTHORS file in the root of the source tree. 
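vp8cx_pack_tokens_armv5 above is the assembly counterpart of the C token packer. As a sketch of the control flow (struct member spellings approximate the tokenextra_*/vp8_token_*/vp8_extra_bit_struct offsets named in the comments; encode_bool_sketch() is the routine sketched earlier):

    static void pack_tokens_sketch(vp8_writer *w, const TOKENEXTRA *p,
                                   int xcount)
    {
        const TOKENEXTRA *stop = p + xcount;    /* sizeof(TOKENEXTRA) == 8 */

        while (p < stop) {
            const int t = p->Token;
            const vp8_token *a = vp8_coef_encodings + t;
            const vp8_extra_bit_struct *b = vp8_extra_bits + t;
            const vp8_prob *pp = p->context_tree;
            unsigned int v = a->value;
            int n = a->Len;
            int i = 0;

            if (p->skip_eob_node) {             /* vp8-specific: start at  */
                n--;                            /* node 2, below the EOB   */
                i = 2;
            }

            do {                                /* walk vp8_coef_tree,     */
                const int bb = (v >> --n) & 1;  /* one bit of v per node   */
                encode_bool_sketch(w, bb, pp[i >> 1]);
                i = vp8_coef_tree[i + bb];
            } while (n);

            if (b->base_val) {                  /* extra bits, then sign   */
                const int e = p->Extra;

                if (b->Len) {
                    unsigned int v2 = e >> 1;
                    int n2 = b->Len, i2 = 0;
                    do {
                        const int bb = (v2 >> --n2) & 1;
                        encode_bool_sketch(w, bb, b->prob[i2 >> 1]);
                        i2 = b->tree[i2 + bb];
                    } while (n2);
                }
                encode_bool_sketch(w, e & 1, 128);  /* sign bit, p = 1/2   */
            }
            ++p;
        }
    }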
+ ; + + + EXPORT |vp8cx_pack_mb_row_tokens_armv5| + IMPORT |vp8_validate_buffer_arm| + - INCLUDE asm_enc_offsets.asm ++ INCLUDE vp8_asm_enc_offsets.asm + + ARM + REQUIRE8 + PRESERVE8 + + AREA |.text|, CODE, READONLY + + + ; macro for validating write buffer position + ; needs vp8_writer in r0 + ; start shall not be in r1 + MACRO + VALIDATE_POS $start, $pos + push {r0-r3, r12, lr} ; rest of regs are preserved by subroutine call + ldr r2, [r0, #vp8_writer_buffer_end] + ldr r3, [r0, #vp8_writer_error] + mov r1, $pos + mov r0, $start + bl vp8_validate_buffer_arm + pop {r0-r3, r12, lr} + MEND + + ; r0 VP8_COMP *cpi + ; r1 vp8_writer *w + ; r2 vp8_coef_encodings + ; r3 vp8_extra_bits + ; s0 vp8_coef_tree + + |vp8cx_pack_mb_row_tokens_armv5| PROC + push {r4-r12, lr} + sub sp, sp, #24 + + ; Compute address of cpi->common.mb_rows + ldr r4, _VP8_COMP_common_ + ldr r6, _VP8_COMMON_MBrows_ + add r4, r0, r4 + + ldr r5, [r4, r6] ; load up mb_rows + + str r2, [sp, #20] ; save vp8_coef_encodings + str r5, [sp, #12] ; save mb_rows + str r3, [sp, #8] ; save vp8_extra_bits + + ldr r4, _VP8_COMP_tplist_ + add r4, r0, r4 + ldr r7, [r4, #0] ; dereference cpi->tp_list + + mov r0, r1 ; keep same as other loops + + ldr r2, [r0, #vp8_writer_lowvalue] + ldr r5, [r0, #vp8_writer_range] + ldr r3, [r0, #vp8_writer_count] + + mb_row_loop + + ldr r1, [r7, #tokenlist_start] + ldr r9, [r7, #tokenlist_stop] + str r9, [sp, #0] ; save stop for later comparison + str r7, [sp, #16] ; tokenlist address for next time + + b check_p_lt_stop + + ; actual work gets done here! + + while_p_lt_stop + ldrb r6, [r1, #tokenextra_token] ; t + ldr r4, [sp, #20] ; vp8_coef_encodings + mov lr, #0 + add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t + ldr r9, [r1, #tokenextra_context_tree] ; pp + + ldrb r7, [r1, #tokenextra_skip_eob_node] + + ldr r6, [r4, #vp8_token_value] ; v + ldr r8, [r4, #vp8_token_len] ; n + + ; vp8 specific skip_eob_node + cmp r7, #0 + movne lr, #2 ; i = 2 + subne r8, r8, #1 ; --n + + rsb r4, r8, #32 ; 32-n + ldr r10, [sp, #64] ; vp8_coef_tree + + ; v is kept in r12 during the token pack loop + lsl r12, r6, r4 ; r12 = v << 32 - n + + ; loop start + token_loop + ldrb r4, [r9, lr, asr #1] ; pp [i>>1] + sub r7, r5, #1 ; range-1 + + ; Decisions are made based on the bit value shifted + ; off of v, so set a flag here based on this. + ; This value is referred to as "bb" + lsls r12, r12, #1 ; bb = v >> n + mul r6, r4, r7 ; ((range-1) * pp[i>>1]) + + ; bb can only be 0 or 1. So only execute this statement + ; if bb == 1, otherwise it will act like i + 0 + addcs lr, lr, #1 ; i + bb + + mov r7, #1 + ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb] + add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8) + + addcs r2, r2, r4 ; if (bb) lowvalue += split + subcs r4, r5, r4 ; if (bb) range = range-split + + ; Counting the leading zeros is used to normalize range. + clz r6, r4 + sub r6, r6, #24 ; shift + + ; Flag is set on the sum of count. 
This flag is used later + ; to determine if count >= 0 + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi token_count_lt_zero ; if(count >= 0) + + sub r6, r6, r3 ; offset = shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl token_high_bit_not_set + + ldr r4, [r0, #vp8_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos-1 + b token_zero_while_start + token_zero_while_loop + mov r10, #0 + strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- + token_zero_while_start + cmp r4, #0 + ldrge r7, [r0, #vp8_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq token_zero_while_loop + + ldr r7, [r0, #vp8_writer_buffer] + ldrb r10, [r7, r4] ; w->buffer[x] + add r10, r10, #1 + strb r10, [r7, r4] ; w->buffer[x] + 1 + token_high_bit_not_set + rsb r4, r6, #24 ; 24-offset + ldr r10, [r0, #vp8_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp8_writer_pos] ; w->pos + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp8_writer_pos] + sub r3, r3, #8 ; count -= 8 + + VALIDATE_POS r10, r11 ; validate_buffer at pos + + strb r7, [r10, r4] ; w->buffer[w->pos++] + + ; r10 is used earlier in the loop, but r10 is used as + ; temp variable here. So after r10 is used, reload + ; vp8_coef_tree_dcd into r10 + ldr r10, [sp, #64] ; vp8_coef_tree + + token_count_lt_zero + lsl r2, r2, r6 ; lowvalue <<= shift + + subs r8, r8, #1 ; --n + bne token_loop + + ldrb r6, [r1, #tokenextra_token] ; t + ldr r7, [sp, #8] ; vp8_extra_bits + ; Add t * sizeof (vp8_extra_bit_struct) to get the desired + ; element. Here vp8_extra_bit_struct == 16 + add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t + + ldr r4, [r12, #vp8_extra_bit_struct_base_val] + cmp r4, #0 + beq skip_extra_bits + + ; if( b->base_val) + ldr r8, [r12, #vp8_extra_bit_struct_len] ; L + ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra + cmp r8, #0 ; if( L) + beq no_extra_bits + + ldr r9, [r12, #vp8_extra_bit_struct_prob] + asr r7, lr, #1 ; v=e>>1 + + ldr r10, [r12, #vp8_extra_bit_struct_tree] + str r10, [sp, #4] ; b->tree + + rsb r4, r8, #32 + lsl r12, r7, r4 + + mov lr, #0 ; i = 0 + + extra_bits_loop + ldrb r4, [r9, lr, asr #1] ; pp[i>>1] + sub r7, r5, #1 ; range-1 + lsls r12, r12, #1 ; v >> n + mul r6, r4, r7 ; (range-1) * pp[i>>1] + addcs lr, lr, #1 ; i + bb + + mov r7, #1 + ldrsb lr, [r10, lr] ; i = b->tree[i+bb] + add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8) + + addcs r2, r2, r4 ; if (bb) lowvalue += split + subcs r4, r5, r4 ; if (bb) range = range-split + + clz r6, r4 + sub r6, r6, #24 + + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi extra_count_lt_zero ; if(count >= 0) + + sub r6, r6, r3 ; offset= shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl extra_high_bit_not_set + + ldr r4, [r0, #vp8_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos - 1 + b extra_zero_while_start + extra_zero_while_loop + mov r10, #0 + strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- + extra_zero_while_start + cmp r4, #0 + ldrge r7, [r0, #vp8_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq extra_zero_while_loop + + ldr r7, [r0, #vp8_writer_buffer] + ldrb r10, [r7, r4] + add r10, r10, #1 + strb r10, [r7, r4] + extra_high_bit_not_set + rsb r4, r6, #24 ; 24-offset + ldr r10, [r0, #vp8_writer_buffer] + lsr r7, r2, r4 ; 
lowvalue >> (24-offset) + ldr r4, [r0, #vp8_writer_pos] + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp8_writer_pos] + sub r3, r3, #8 ; count -= 8 + + VALIDATE_POS r10, r11 ; validate_buffer at pos + + strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset)) + ldr r10, [sp, #4] ; b->tree + extra_count_lt_zero + lsl r2, r2, r6 + + subs r8, r8, #1 ; --n + bne extra_bits_loop ; while (n) + + no_extra_bits + ldr lr, [r1, #4] ; e = p->Extra + add r4, r5, #1 ; range + 1 + tst lr, #1 + lsr r4, r4, #1 ; split = (range + 1) >> 1 + addne r2, r2, r4 ; lowvalue += split + subne r4, r5, r4 ; range = range-split + tst r2, #0x80000000 ; lowvalue & 0x80000000 + lsl r5, r4, #1 ; range <<= 1 + beq end_high_bit_not_set + + ldr r4, [r0, #vp8_writer_pos] + mov r7, #0 + sub r4, r4, #1 + b end_zero_while_start + end_zero_while_loop + strb r7, [r6, r4] + sub r4, r4, #1 ; x-- + end_zero_while_start + cmp r4, #0 + ldrge r6, [r0, #vp8_writer_buffer] + ldrb r12, [r6, r4] + cmpge r12, #0xff + beq end_zero_while_loop + + ldr r6, [r0, #vp8_writer_buffer] + ldrb r7, [r6, r4] + add r7, r7, #1 + strb r7, [r6, r4] + end_high_bit_not_set + adds r3, r3, #1 ; ++count + lsl r2, r2, #1 ; lowvalue <<= 1 + bne end_count_zero + + ldr r4, [r0, #vp8_writer_pos] + mvn r3, #7 + ldr r7, [r0, #vp8_writer_buffer] + lsr r6, r2, #24 ; lowvalue >> 24 + add r12, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r12, [r0, #vp8_writer_pos] + + VALIDATE_POS r7, r12 ; validate_buffer at pos + + strb r6, [r7, r4] + end_count_zero + skip_extra_bits + add r1, r1, #TOKENEXTRA_SZ ; ++p + check_p_lt_stop + ldr r4, [sp, #0] ; stop + cmp r1, r4 ; while( p < stop) + bcc while_p_lt_stop + + ldr r6, [sp, #12] ; mb_rows + ldr r7, [sp, #16] ; tokenlist address + subs r6, r6, #1 + add r7, r7, #TOKENLIST_SZ ; next element in the array + str r6, [sp, #12] + bne mb_row_loop + + str r2, [r0, #vp8_writer_lowvalue] + str r5, [r0, #vp8_writer_range] + str r3, [r0, #vp8_writer_count] + add sp, sp, #24 + pop {r4-r12, pc} + ENDP + + _VP8_COMP_common_ + DCD vp8_comp_common + _VP8_COMMON_MBrows_ + DCD vp8_common_mb_rows + _VP8_COMP_tplist_ + DCD vp8_comp_tplist + + END diff --cc vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm index 000000000,90a98fe8d..e9aa4958f mode 000000,100644..100644 --- a/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm +++ b/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm @@@ -1,0 -1,471 +1,471 @@@ + ; + ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. + ; + ; Use of this source code is governed by a BSD-style license + ; that can be found in the LICENSE file in the root of the source + ; tree. An additional intellectual property rights grant can be found + ; in the file PATENTS. All contributing project authors may + ; be found in the AUTHORS file in the root of the source tree. 
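vp8cx_pack_mb_row_tokens_armv5 wraps the same inner loop in a pass over cpi->tplist, one entry per macroblock row. In C, approximately (a sketch; the tplist member names follow the tokenlist_start/tokenlist_stop offsets above):

    static void pack_mb_row_tokens_sketch(VP8_COMP *cpi, vp8_writer *w)
    {
        int mb_row;

        for (mb_row = 0; mb_row < cpi->common.mb_rows; mb_row++) {
            const TOKENEXTRA *p    = cpi->tplist[mb_row].start;
            const TOKENEXTRA *stop = cpi->tplist[mb_row].stop;

            pack_tokens_sketch(w, p, (int)(stop - p));
        }
    }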
+ ; + + + EXPORT |vp8cx_pack_tokens_into_partitions_armv5| + IMPORT |vp8_validate_buffer_arm| + - INCLUDE asm_enc_offsets.asm ++ INCLUDE vp8_asm_enc_offsets.asm + + ARM + REQUIRE8 + PRESERVE8 + + AREA |.text|, CODE, READONLY + + ; macro for validating write buffer position + ; needs vp8_writer in r0 + ; start shall not be in r1 + MACRO + VALIDATE_POS $start, $pos + push {r0-r3, r12, lr} ; rest of regs are preserved by subroutine call + ldr r2, [r0, #vp8_writer_buffer_end] + ldr r3, [r0, #vp8_writer_error] + mov r1, $pos + mov r0, $start + bl vp8_validate_buffer_arm + pop {r0-r3, r12, lr} + MEND + + ; r0 VP8_COMP *cpi + ; r1 unsigned char *cx_data + ; r2 const unsigned char *cx_data_end + ; r3 int num_part + ; s0 vp8_coef_encodings + ; s1 vp8_extra_bits, + ; s2 const vp8_tree_index * + + |vp8cx_pack_tokens_into_partitions_armv5| PROC + push {r4-r12, lr} + sub sp, sp, #40 + + ; Compute address of cpi->common.mb_rows + ldr r4, _VP8_COMP_common_ + ldr r6, _VP8_COMMON_MBrows_ + add r4, r0, r4 + + ldr r5, [r4, r6] ; load up mb_rows + + str r5, [sp, #36] ; save mb_rows + str r1, [sp, #24] ; save ptr = cx_data + str r3, [sp, #20] ; save num_part + str r2, [sp, #8] ; save cx_data_end + + ldr r4, _VP8_COMP_tplist_ + add r4, r0, r4 + ldr r7, [r4, #0] ; dereference cpi->tp_list + str r7, [sp, #32] ; store start of cpi->tp_list + + ldr r11, _VP8_COMP_bc_ ; load up vp8_writer out of cpi + add r0, r0, r11 + + mov r11, #0 + str r11, [sp, #28] ; i + + numparts_loop + ldr r2, _vp8_writer_sz_ ; load up sizeof(vp8_writer) + add r0, r2 ; bc[i + 1] + + ldr r10, [sp, #24] ; ptr + ldr r5, [sp, #36] ; move mb_rows to the counting section + subs r5, r5, r11 ; move start point with each partition + ; mb_rows starts at i + str r5, [sp, #12] + + ; Reset all of the VP8 Writer data for each partition that + ; is processed. + ; start_encode + + ldr r3, [sp, #8] + str r3, [r0, #vp8_writer_buffer_end] + + mov r2, #0 ; vp8_writer_lowvalue + mov r5, #255 ; vp8_writer_range + mvn r3, #23 ; vp8_writer_count + + str r2, [r0, #vp8_writer_pos] + str r10, [r0, #vp8_writer_buffer] + + ble end_partition ; if (mb_rows <= 0) end partition + + mb_row_loop + + ldr r1, [r7, #tokenlist_start] + ldr r9, [r7, #tokenlist_stop] + str r9, [sp, #0] ; save stop for later comparison + str r7, [sp, #16] ; tokenlist address for next time + + b check_p_lt_stop + + ; actual work gets done here! + + while_p_lt_stop + ldrb r6, [r1, #tokenextra_token] ; t + ldr r4, [sp, #80] ; vp8_coef_encodings + mov lr, #0 + add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t + ldr r9, [r1, #tokenextra_context_tree] ; pp + + ldrb r7, [r1, #tokenextra_skip_eob_node] + + ldr r6, [r4, #vp8_token_value] ; v + ldr r8, [r4, #vp8_token_len] ; n + + ; vp8 specific skip_eob_node + cmp r7, #0 + movne lr, #2 ; i = 2 + subne r8, r8, #1 ; --n + + rsb r4, r8, #32 ; 32-n + ldr r10, [sp, #88] ; vp8_coef_tree + + ; v is kept in r12 during the token pack loop + lsl r12, r6, r4 ; r12 = v << 32 - n + + ; loop start + token_loop + ldrb r4, [r9, lr, asr #1] ; pp [i>>1] + sub r7, r5, #1 ; range-1 + + ; Decisions are made based on the bit value shifted + ; off of v, so set a flag here based on this. + ; This value is referred to as "bb" + lsls r12, r12, #1 ; bb = v >> n + mul r6, r4, r7 ; ((range-1) * pp[i>>1]) + + ; bb can only be 0 or 1. 
So only execute this statement + ; if bb == 1, otherwise it will act like i + 0 + addcs lr, lr, #1 ; i + bb + + mov r7, #1 + ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb] + add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8) + + addcs r2, r2, r4 ; if (bb) lowvalue += split + subcs r4, r5, r4 ; if (bb) range = range-split + + ; Counting the leading zeros is used to normalize range. + clz r6, r4 + sub r6, r6, #24 ; shift + + ; Flag is set on the sum of count. This flag is used later + ; to determine if count >= 0 + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi token_count_lt_zero ; if(count >= 0) + + sub r6, r6, r3 ; offset = shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl token_high_bit_not_set + + ldr r4, [r0, #vp8_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos-1 + b token_zero_while_start + token_zero_while_loop + mov r10, #0 + strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- + token_zero_while_start + cmp r4, #0 + ldrge r7, [r0, #vp8_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq token_zero_while_loop + + ldr r7, [r0, #vp8_writer_buffer] + ldrb r10, [r7, r4] ; w->buffer[x] + add r10, r10, #1 + strb r10, [r7, r4] ; w->buffer[x] + 1 + token_high_bit_not_set + rsb r4, r6, #24 ; 24-offset + ldr r10, [r0, #vp8_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp8_writer_pos] ; w->pos + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp8_writer_pos] + sub r3, r3, #8 ; count -= 8 + + VALIDATE_POS r10, r11 ; validate_buffer at pos + + strb r7, [r10, r4] ; w->buffer[w->pos++] + + ; r10 is used earlier in the loop, but r10 is used as + ; temp variable here. So after r10 is used, reload + ; vp8_coef_tree_dcd into r10 + ldr r10, [sp, #88] ; vp8_coef_tree + + token_count_lt_zero + lsl r2, r2, r6 ; lowvalue <<= shift + + subs r8, r8, #1 ; --n + bne token_loop + + ldrb r6, [r1, #tokenextra_token] ; t + ldr r7, [sp, #84] ; vp8_extra_bits + ; Add t * sizeof (vp8_extra_bit_struct) to get the desired + ; element. 
Here vp8_extra_bit_struct == 16 + add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t + + ldr r4, [r12, #vp8_extra_bit_struct_base_val] + cmp r4, #0 + beq skip_extra_bits + + ; if( b->base_val) + ldr r8, [r12, #vp8_extra_bit_struct_len] ; L + ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra + cmp r8, #0 ; if( L) + beq no_extra_bits + + ldr r9, [r12, #vp8_extra_bit_struct_prob] + asr r7, lr, #1 ; v=e>>1 + + ldr r10, [r12, #vp8_extra_bit_struct_tree] + str r10, [sp, #4] ; b->tree + + rsb r4, r8, #32 + lsl r12, r7, r4 + + mov lr, #0 ; i = 0 + + extra_bits_loop + ldrb r4, [r9, lr, asr #1] ; pp[i>>1] + sub r7, r5, #1 ; range-1 + lsls r12, r12, #1 ; v >> n + mul r6, r4, r7 ; (range-1) * pp[i>>1] + addcs lr, lr, #1 ; i + bb + + mov r7, #1 + ldrsb lr, [r10, lr] ; i = b->tree[i+bb] + add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8) + + addcs r2, r2, r4 ; if (bb) lowvalue += split + subcs r4, r5, r4 ; if (bb) range = range-split + + clz r6, r4 + sub r6, r6, #24 + + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi extra_count_lt_zero ; if(count >= 0) + + sub r6, r6, r3 ; offset= shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl extra_high_bit_not_set + + ldr r4, [r0, #vp8_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos - 1 + b extra_zero_while_start + extra_zero_while_loop + mov r10, #0 + strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- + extra_zero_while_start + cmp r4, #0 + ldrge r7, [r0, #vp8_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq extra_zero_while_loop + + ldr r7, [r0, #vp8_writer_buffer] + ldrb r10, [r7, r4] + add r10, r10, #1 + strb r10, [r7, r4] + extra_high_bit_not_set + rsb r4, r6, #24 ; 24-offset + ldr r10, [r0, #vp8_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp8_writer_pos] + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp8_writer_pos] + sub r3, r3, #8 ; count -= 8 + + VALIDATE_POS r10, r11 ; validate_buffer at pos + + strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset)) + ldr r10, [sp, #4] ; b->tree + extra_count_lt_zero + lsl r2, r2, r6 + + subs r8, r8, #1 ; --n + bne extra_bits_loop ; while (n) + + no_extra_bits + ldr lr, [r1, #4] ; e = p->Extra + add r4, r5, #1 ; range + 1 + tst lr, #1 + lsr r4, r4, #1 ; split = (range + 1) >> 1 + addne r2, r2, r4 ; lowvalue += split + subne r4, r5, r4 ; range = range-split + tst r2, #0x80000000 ; lowvalue & 0x80000000 + lsl r5, r4, #1 ; range <<= 1 + beq end_high_bit_not_set + + ldr r4, [r0, #vp8_writer_pos] + mov r7, #0 + sub r4, r4, #1 + b end_zero_while_start + end_zero_while_loop + strb r7, [r6, r4] + sub r4, r4, #1 ; x-- + end_zero_while_start + cmp r4, #0 + ldrge r6, [r0, #vp8_writer_buffer] + ldrb r12, [r6, r4] + cmpge r12, #0xff + beq end_zero_while_loop + + ldr r6, [r0, #vp8_writer_buffer] + ldrb r7, [r6, r4] + add r7, r7, #1 + strb r7, [r6, r4] + end_high_bit_not_set + adds r3, r3, #1 ; ++count + lsl r2, r2, #1 ; lowvalue <<= 1 + bne end_count_zero + + ldr r4, [r0, #vp8_writer_pos] + mvn r3, #7 ; count = -8 + ldr r7, [r0, #vp8_writer_buffer] + lsr r6, r2, #24 ; lowvalue >> 24 + add r12, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r12, [r0, #vp8_writer_pos] + + VALIDATE_POS r7, r12 ; validate_buffer at pos + + strb r6, [r7, r4] + end_count_zero + skip_extra_bits + add r1, r1, #TOKENEXTRA_SZ ; ++p + check_p_lt_stop 
+ ldr r4, [sp, #0] ; stop + cmp r1, r4 ; while( p < stop) + bcc while_p_lt_stop + + ldr r10, [sp, #20] ; num_parts + mov r1, #TOKENLIST_SZ + mul r1, r10, r1 + + ldr r6, [sp, #12] ; mb_rows + ldr r7, [sp, #16] ; tokenlist address + subs r6, r6, r10 + add r7, r7, r1 ; next element in the array + str r6, [sp, #12] + bgt mb_row_loop + + end_partition + mov r12, #32 + + stop_encode_loop + sub r7, r5, #1 ; range-1 + + mov r4, r7, lsl #7 ; ((range-1) * 128) + + mov r7, #1 + add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8) + + ; Counting the leading zeros is used to normalize range. + clz r6, r4 + sub r6, r6, #24 ; shift + + ; Flag is set on the sum of count. This flag is used later + ; to determine if count >= 0 + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi token_count_lt_zero_se ; if(count >= 0) + + sub r6, r6, r3 ; offset = shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl token_high_bit_not_set_se + + ldr r4, [r0, #vp8_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos-1 + b token_zero_while_start_se + token_zero_while_loop_se + mov r10, #0 + strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- + token_zero_while_start_se + cmp r4, #0 + ldrge r7, [r0, #vp8_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq token_zero_while_loop_se + + ldr r7, [r0, #vp8_writer_buffer] + ldrb r10, [r7, r4] ; w->buffer[x] + add r10, r10, #1 + strb r10, [r7, r4] ; w->buffer[x] + 1 + token_high_bit_not_set_se + rsb r4, r6, #24 ; 24-offset + ldr r10, [r0, #vp8_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp8_writer_pos] ; w->pos + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp8_writer_pos] + sub r3, r3, #8 ; count -= 8 + + VALIDATE_POS r10, r11 ; validate_buffer at pos + + strb r7, [r10, r4] ; w->buffer[w->pos++] + + token_count_lt_zero_se + lsl r2, r2, r6 ; lowvalue <<= shift + + subs r12, r12, #1 + bne stop_encode_loop + + ldr r4, [r0, #vp8_writer_pos] ; w->pos + ldr r12, [sp, #24] ; ptr + add r12, r12, r4 ; ptr += w->pos + str r12, [sp, #24] + + ldr r11, [sp, #28] ; i + ldr r10, [sp, #20] ; num_parts + + add r11, r11, #1 ; i++ + str r11, [sp, #28] + + ldr r7, [sp, #32] ; cpi->tp_list[i] + mov r1, #TOKENLIST_SZ + add r7, r7, r1 ; next element in cpi->tp_list + str r7, [sp, #32] ; cpi->tp_list[i+1] + + cmp r10, r11 + bgt numparts_loop + + add sp, sp, #40 + pop {r4-r12, pc} + ENDP + + _VP8_COMP_common_ + DCD vp8_comp_common + _VP8_COMMON_MBrows_ + DCD vp8_common_mb_rows + _VP8_COMP_tplist_ + DCD vp8_comp_tplist + _VP8_COMP_bc_ + DCD vp8_comp_bc + _vp8_writer_sz_ + DCD vp8_writer_sz + + END diff --cc vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm index 000000000,d61f5d94d..de35a1e13 mode 000000,100644..100644 --- a/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm +++ b/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm @@@ -1,0 -1,225 +1,225 @@@ + ; + ; Copyright (c) 2011 The WebM project authors. All Rights Reserved. + ; + ; Use of this source code is governed by a BSD-style license + ; that can be found in the LICENSE file in the root of the source + ; tree. An additional intellectual property rights grant can be found + ; in the file PATENTS. All contributing project authors may + ; be found in the AUTHORS file in the root of the source tree. 
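vp8cx_pack_tokens_into_partitions_armv5 combines the two previous routines: writer i+1 is re-initialized in place (the inlined start_encode above: lowvalue 0, range 255, count -24), rows i, i+num_part, i+2*num_part, ... are packed into it, and the partition is closed with 32 stop bits before the output pointer advances by w->pos. A sketch, assuming cpi->bc is the writer array that the _VP8_COMP_bc_ and _vp8_writer_sz_ constants index:

    static void pack_partitions_sketch(VP8_COMP *cpi, unsigned char *cx_data,
                                       unsigned char *cx_data_end, int num_part)
    {
        unsigned char *ptr = cx_data;
        int i;

        for (i = 0; i < num_part; i++) {
            vp8_writer *w = &cpi->bc[i + 1];
            int mb_row;

            vp8_start_encode(w, ptr, cx_data_end);
            for (mb_row = i; mb_row < cpi->common.mb_rows;
                 mb_row += num_part) {
                const TOKENEXTRA *p    = cpi->tplist[mb_row].start;
                const TOKENEXTRA *stop = cpi->tplist[mb_row].stop;

                pack_tokens_sketch(w, p, (int)(stop - p));
            }
            vp8_stop_encode(w);              /* the 32-iteration flush loop */
            ptr += w->pos;
        }
    }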
+ ; + + + EXPORT |vp8_fast_quantize_b_armv6| + - INCLUDE asm_enc_offsets.asm ++ INCLUDE vp8_asm_enc_offsets.asm + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + ; r0 BLOCK *b + ; r1 BLOCKD *d + |vp8_fast_quantize_b_armv6| PROC + stmfd sp!, {r1, r4-r11, lr} + + ldr r3, [r0, #vp8_block_coeff] ; coeff + ldr r4, [r0, #vp8_block_quant_fast] ; quant_fast + ldr r5, [r0, #vp8_block_round] ; round + ldr r6, [r1, #vp8_blockd_qcoeff] ; qcoeff + ldr r7, [r1, #vp8_blockd_dqcoeff] ; dqcoeff + ldr r8, [r1, #vp8_blockd_dequant] ; dequant + + ldr r2, loop_count ; loop_count=0x1000000. 'lsls' instruction + ; is used to update the counter so that + ; it can be used to mark nonzero + ; quantized coefficient pairs. + + mov r1, #0 ; flags for quantized coeffs + + ; PART 1: quantization and dequantization loop + loop + ldr r9, [r3], #4 ; [z1 | z0] + ldr r10, [r5], #4 ; [r1 | r0] + ldr r11, [r4], #4 ; [q1 | q0] + + ssat16 lr, #1, r9 ; [sz1 | sz0] + eor r9, r9, lr ; [z1 ^ sz1 | z0 ^ sz0] + ssub16 r9, r9, lr ; x = (z ^ sz) - sz + sadd16 r9, r9, r10 ; [x1+r1 | x0+r0] + + ldr r12, [r3], #4 ; [z3 | z2] + + smulbb r0, r9, r11 ; [(x0+r0)*q0] + smultt r9, r9, r11 ; [(x1+r1)*q1] + + ldr r10, [r5], #4 ; [r3 | r2] + + ssat16 r11, #1, r12 ; [sz3 | sz2] + eor r12, r12, r11 ; [z3 ^ sz3 | z2 ^ sz2] + pkhtb r0, r9, r0, asr #16 ; [y1 | y0] + ldr r9, [r4], #4 ; [q3 | q2] + ssub16 r12, r12, r11 ; x = (z ^ sz) - sz + + sadd16 r12, r12, r10 ; [x3+r3 | x2+r2] + + eor r0, r0, lr ; [(y1 ^ sz1) | (y0 ^ sz0)] + + smulbb r10, r12, r9 ; [(x2+r2)*q2] + smultt r12, r12, r9 ; [(x3+r3)*q3] + + ssub16 r0, r0, lr ; x = (y ^ sz) - sz + + cmp r0, #0 ; check if zero + orrne r1, r1, r2, lsr #24 ; add flag for nonzero coeffs + + str r0, [r6], #4 ; *qcoeff++ = x + ldr r9, [r8], #4 ; [dq1 | dq0] + + pkhtb r10, r12, r10, asr #16 ; [y3 | y2] + eor r10, r10, r11 ; [(y3 ^ sz3) | (y2 ^ sz2)] + ssub16 r10, r10, r11 ; x = (y ^ sz) - sz + + cmp r10, #0 ; check if zero + orrne r1, r1, r2, lsr #23 ; add flag for nonzero coeffs + + str r10, [r6], #4 ; *qcoeff++ = x + ldr r11, [r8], #4 ; [dq3 | dq2] + + smulbb r12, r0, r9 ; [x0*dq0] + smultt r0, r0, r9 ; [x1*dq1] + + smulbb r9, r10, r11 ; [x2*dq2] + smultt r10, r10, r11 ; [x3*dq3] + + lsls r2, r2, #2 ; update loop counter + strh r12, [r7, #0] ; dqcoeff[0] = [x0*dq0] + strh r0, [r7, #2] ; dqcoeff[1] = [x1*dq1] + strh r9, [r7, #4] ; dqcoeff[2] = [x2*dq2] + strh r10, [r7, #6] ; dqcoeff[3] = [x3*dq3] + add r7, r7, #8 ; dqcoeff += 8 + bne loop + + ; PART 2: check position for eob... + ldr r11, [sp, #0] ; restore BLOCKD pointer + mov lr, #0 ; init eob + cmp r1, #0 ; coeffs after quantization? 
+ ldr r12, [r11, #vp8_blockd_eob] + beq end ; skip eob calculations if all zero + + ldr r0, [r11, #vp8_blockd_qcoeff] + + ; check shortcut for nonzero qcoeffs + tst r1, #0x80 + bne quant_coeff_15_14 + tst r1, #0x20 + bne quant_coeff_13_11 + tst r1, #0x8 + bne quant_coeff_12_7 + tst r1, #0x40 + bne quant_coeff_10_9 + tst r1, #0x10 + bne quant_coeff_8_3 + tst r1, #0x2 + bne quant_coeff_6_5 + tst r1, #0x4 + bne quant_coeff_4_2 + b quant_coeff_1_0 + + quant_coeff_15_14 + ldrh r2, [r0, #30] ; rc=15, i=15 + mov lr, #16 + cmp r2, #0 + bne end + + ldrh r3, [r0, #28] ; rc=14, i=14 + mov lr, #15 + cmp r3, #0 + bne end + + quant_coeff_13_11 + ldrh r2, [r0, #22] ; rc=11, i=13 + mov lr, #14 + cmp r2, #0 + bne end + + quant_coeff_12_7 + ldrh r3, [r0, #14] ; rc=7, i=12 + mov lr, #13 + cmp r3, #0 + bne end + + ldrh r2, [r0, #20] ; rc=10, i=11 + mov lr, #12 + cmp r2, #0 + bne end + + quant_coeff_10_9 + ldrh r3, [r0, #26] ; rc=13, i=10 + mov lr, #11 + cmp r3, #0 + bne end + + ldrh r2, [r0, #24] ; rc=12, i=9 + mov lr, #10 + cmp r2, #0 + bne end + + quant_coeff_8_3 + ldrh r3, [r0, #18] ; rc=9, i=8 + mov lr, #9 + cmp r3, #0 + bne end + + ldrh r2, [r0, #12] ; rc=6, i=7 + mov lr, #8 + cmp r2, #0 + bne end + + quant_coeff_6_5 + ldrh r3, [r0, #6] ; rc=3, i=6 + mov lr, #7 + cmp r3, #0 + bne end + + ldrh r2, [r0, #4] ; rc=2, i=5 + mov lr, #6 + cmp r2, #0 + bne end + + quant_coeff_4_2 + ldrh r3, [r0, #10] ; rc=5, i=4 + mov lr, #5 + cmp r3, #0 + bne end + + ldrh r2, [r0, #16] ; rc=8, i=3 + mov lr, #4 + cmp r2, #0 + bne end + + ldrh r3, [r0, #8] ; rc=4, i=2 + mov lr, #3 + cmp r3, #0 + bne end + + quant_coeff_1_0 + ldrh r2, [r0, #2] ; rc=1, i=1 + mov lr, #2 + cmp r2, #0 + bne end + + mov lr, #1 ; rc=0, i=0 + + end + strb lr, [r12] + ldmfd sp!, {r1, r4-r11, pc} + + ENDP + + loop_count + DCD 0x1000000 + + END + diff --cc vp8/encoder/arm/armv6/vp8_subtract_armv6.asm index 000000000,f329f8f73..05746cf7f mode 000000,100644..100644 --- a/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm +++ b/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm @@@ -1,0 -1,272 +1,272 @@@ + ; + ; Copyright (c) 2011 The WebM project authors. All Rights Reserved. + ; + ; Use of this source code is governed by a BSD-style license + ; that can be found in the LICENSE file in the root of the source + ; tree. An additional intellectual property rights grant can be found + ; in the file PATENTS. All contributing project authors may + ; be found in the AUTHORS file in the root of the source tree. 
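For reference, vp8_fast_quantize_b_armv6 above implements the following C (a sketch: the assembly quantizes in raster order four coefficients at a time, then uses the nonzero-pair flags accumulated in r1 to shortcut the end-of-block scan):

    static void fast_quantize_b_sketch(BLOCK *b, BLOCKD *d)
    {
        /* vp8_default_zig_zag1d: scan position -> raster index */
        static const int zig_zag[16] = { 0, 1,  4,  8,  5, 2,  3,  6,
                                         9, 12, 13, 10, 7, 11, 14, 15 };
        int i, eob = 0;

        for (i = 0; i < 16; i++) {
            const int rc = zig_zag[i];
            const int z  = b->coeff[rc];
            const int sz = z >> 31;                       /* 0 or -1       */
            int x = (z ^ sz) - sz;                        /* abs(z)        */
            const int y = ((x + b->round[rc]) * b->quant_fast[rc]) >> 16;

            x = (y ^ sz) - sz;                            /* restore sign  */
            d->qcoeff[rc]  = (short)x;
            d->dqcoeff[rc] = (short)(x * d->dequant[rc]);
            if (y)
                eob = i + 1;           /* 1-based, as the strb stores it   */
        }
        *d->eob = (char)eob;
    }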
+ ; + + + EXPORT |vp8_subtract_mby_armv6| + EXPORT |vp8_subtract_mbuv_armv6| + EXPORT |vp8_subtract_b_armv6| + - INCLUDE asm_enc_offsets.asm ++ INCLUDE vp8_asm_enc_offsets.asm + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + ; r0 BLOCK *be + ; r1 BLOCKD *bd + ; r2 int pitch + |vp8_subtract_b_armv6| PROC + + stmfd sp!, {r4-r9} + + ldr r4, [r0, #vp8_block_base_src] + ldr r5, [r0, #vp8_block_src] + ldr r6, [r0, #vp8_block_src_diff] + + ldr r3, [r4] + ldr r7, [r0, #vp8_block_src_stride] + add r3, r3, r5 ; src = *base_src + src + ldr r8, [r1, #vp8_blockd_predictor] + + mov r9, #4 ; loop count + + loop_block + + ldr r0, [r3], r7 ; src + ldr r1, [r8], r2 ; pred + + uxtb16 r4, r0 ; [s2 | s0] + uxtb16 r5, r1 ; [p2 | p0] + uxtb16 r0, r0, ror #8 ; [s3 | s1] + uxtb16 r1, r1, ror #8 ; [p3 | p1] + + usub16 r4, r4, r5 ; [d2 | d0] + usub16 r5, r0, r1 ; [d3 | d1] + + subs r9, r9, #1 ; decrement loop counter + + pkhbt r0, r4, r5, lsl #16 ; [d1 | d0] + pkhtb r1, r5, r4, asr #16 ; [d3 | d2] + + str r0, [r6, #0] ; diff + str r1, [r6, #4] ; diff + + add r6, r6, r2, lsl #1 ; update diff pointer + bne loop_block + + ldmfd sp!, {r4-r9} + mov pc, lr + + ENDP + + + ; r0 short *diff + ; r1 unsigned char *usrc + ; r2 unsigned char *vsrc + ; r3 int src_stride + ; sp unsigned char *upred + ; sp unsigned char *vpred + ; sp int pred_stride + |vp8_subtract_mbuv_armv6| PROC + + stmfd sp!, {r4-r11} + + add r0, r0, #512 ; set *diff point to Cb + mov r4, #8 ; loop count + ldr r5, [sp, #32] ; upred + ldr r12, [sp, #40] ; pred_stride + + ; Subtract U block + loop_u + ldr r6, [r1] ; usrc (A) + ldr r7, [r5] ; upred (A) + + uxtb16 r8, r6 ; [s2 | s0] (A) + uxtb16 r9, r7 ; [p2 | p0] (A) + uxtb16 r10, r6, ror #8 ; [s3 | s1] (A) + uxtb16 r11, r7, ror #8 ; [p3 | p1] (A) + + usub16 r6, r8, r9 ; [d2 | d0] (A) + usub16 r7, r10, r11 ; [d3 | d1] (A) + + ldr r10, [r1, #4] ; usrc (B) + ldr r11, [r5, #4] ; upred (B) + + pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A) + pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A) + + str r8, [r0], #4 ; diff (A) + uxtb16 r8, r10 ; [s2 | s0] (B) + str r9, [r0], #4 ; diff (A) + + uxtb16 r9, r11 ; [p2 | p0] (B) + uxtb16 r10, r10, ror #8 ; [s3 | s1] (B) + uxtb16 r11, r11, ror #8 ; [p3 | p1] (B) + + usub16 r6, r8, r9 ; [d2 | d0] (B) + usub16 r7, r10, r11 ; [d3 | d1] (B) + + add r1, r1, r3 ; update usrc pointer + add r5, r5, r12 ; update upred pointer + + pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B) + pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B) + + str r8, [r0], #4 ; diff (B) + subs r4, r4, #1 ; update loop counter + str r9, [r0], #4 ; diff (B) + + bne loop_u + + ldr r5, [sp, #36] ; vpred + mov r4, #8 ; loop count + + ; Subtract V block + loop_v + ldr r6, [r2] ; vsrc (A) + ldr r7, [r5] ; vpred (A) + + uxtb16 r8, r6 ; [s2 | s0] (A) + uxtb16 r9, r7 ; [p2 | p0] (A) + uxtb16 r10, r6, ror #8 ; [s3 | s1] (A) + uxtb16 r11, r7, ror #8 ; [p3 | p1] (A) + + usub16 r6, r8, r9 ; [d2 | d0] (A) + usub16 r7, r10, r11 ; [d3 | d1] (A) + + ldr r10, [r2, #4] ; vsrc (B) + ldr r11, [r5, #4] ; vpred (B) + + pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A) + pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A) + + str r8, [r0], #4 ; diff (A) + uxtb16 r8, r10 ; [s2 | s0] (B) + str r9, [r0], #4 ; diff (A) + + uxtb16 r9, r11 ; [p2 | p0] (B) + uxtb16 r10, r10, ror #8 ; [s3 | s1] (B) + uxtb16 r11, r11, ror #8 ; [p3 | p1] (B) + + usub16 r6, r8, r9 ; [d2 | d0] (B) + usub16 r7, r10, r11 ; [d3 | d1] (B) + + add r2, r2, r3 ; update vsrc pointer + add r5, r5, r12 ; update vpred pointer + + pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B) + pkhtb r9, r7, r6, asr 
#16 ; [d3 | d2] (B) + + str r8, [r0], #4 ; diff (B) + subs r4, r4, #1 ; update loop counter + str r9, [r0], #4 ; diff (B) + + bne loop_v + + ldmfd sp!, {r4-r11} + bx lr + + ENDP + + + ; r0 short *diff + ; r1 unsigned char *src + ; r2 int src_stride + ; r3 unsigned char *pred + ; sp int pred_stride + |vp8_subtract_mby_armv6| PROC + + stmfd sp!, {r4-r11} + ldr r12, [sp, #32] ; pred_stride + mov r4, #16 + loop + ldr r6, [r1] ; src (A) + ldr r7, [r3] ; pred (A) + + uxtb16 r8, r6 ; [s2 | s0] (A) + uxtb16 r9, r7 ; [p2 | p0] (A) + uxtb16 r10, r6, ror #8 ; [s3 | s1] (A) + uxtb16 r11, r7, ror #8 ; [p3 | p1] (A) + + usub16 r6, r8, r9 ; [d2 | d0] (A) + usub16 r7, r10, r11 ; [d3 | d1] (A) + + ldr r10, [r1, #4] ; src (B) + ldr r11, [r3, #4] ; pred (B) + + pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A) + pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A) + + str r8, [r0], #4 ; diff (A) + uxtb16 r8, r10 ; [s2 | s0] (B) + str r9, [r0], #4 ; diff (A) + + uxtb16 r9, r11 ; [p2 | p0] (B) + uxtb16 r10, r10, ror #8 ; [s3 | s1] (B) + uxtb16 r11, r11, ror #8 ; [p3 | p1] (B) + + usub16 r6, r8, r9 ; [d2 | d0] (B) + usub16 r7, r10, r11 ; [d3 | d1] (B) + + ldr r10, [r1, #8] ; src (C) + ldr r11, [r3, #8] ; pred (C) + + pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B) + pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B) + + str r8, [r0], #4 ; diff (B) + uxtb16 r8, r10 ; [s2 | s0] (C) + str r9, [r0], #4 ; diff (B) + + uxtb16 r9, r11 ; [p2 | p0] (C) + uxtb16 r10, r10, ror #8 ; [s3 | s1] (C) + uxtb16 r11, r11, ror #8 ; [p3 | p1] (C) + + usub16 r6, r8, r9 ; [d2 | d0] (C) + usub16 r7, r10, r11 ; [d3 | d1] (C) + + ldr r10, [r1, #12] ; src (D) + ldr r11, [r3, #12] ; pred (D) + + pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (C) + pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (C) + + str r8, [r0], #4 ; diff (C) + uxtb16 r8, r10 ; [s2 | s0] (D) + str r9, [r0], #4 ; diff (C) + + uxtb16 r9, r11 ; [p2 | p0] (D) + uxtb16 r10, r10, ror #8 ; [s3 | s1] (D) + uxtb16 r11, r11, ror #8 ; [p3 | p1] (D) + + usub16 r6, r8, r9 ; [d2 | d0] (D) + usub16 r7, r10, r11 ; [d3 | d1] (D) + + add r1, r1, r2 ; update src pointer + add r3, r3, r12 ; update pred pointer + + pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (D) + pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (D) + + str r8, [r0], #4 ; diff (D) + subs r4, r4, #1 ; update loop counter + str r9, [r0], #4 ; diff (D) + + bne loop + + ldmfd sp!, {r4-r11} + bx lr + + ENDP + + END + diff --cc vp8/encoder/arm/neon/fastquantizeb_neon.asm index 000000000,143058842..9374310e5 mode 000000,100644..100644 --- a/vp8/encoder/arm/neon/fastquantizeb_neon.asm +++ b/vp8/encoder/arm/neon/fastquantizeb_neon.asm @@@ -1,0 -1,258 +1,258 @@@ + ; + ; Copyright (c) 2011 The WebM project authors. All Rights Reserved. + ; + ; Use of this source code is governed by a BSD-style license + ; that can be found in the LICENSE file in the root of the source + ; tree. An additional intellectual property rights grant can be found + ; in the file PATENTS. All contributing project authors may + ; be found in the AUTHORS file in the root of the source tree. 
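The vp8_subtract_* routines above compute the residual, source minus prediction, into the diff buffer; the 4x4 block case is, in C (a sketch matching the register comments):

    static void subtract_b_sketch(BLOCK *be, BLOCKD *bd, int pitch)
    {
        unsigned char *src  = *be->base_src + be->src;  /* src = *base_src + src */
        unsigned char *pred = bd->predictor;
        short *diff = be->src_diff;
        int r, c;

        for (r = 0; r < 4; r++) {
            for (c = 0; c < 4; c++)
                diff[c] = (short)(src[c] - pred[c]);
            diff += pitch;              /* the 'add r6, r6, r2, lsl #1' above */
            pred += pitch;
            src  += be->src_stride;
        }
    }

The mby and mbuv variants are the 16x16 luma and paired 8x8 chroma analogues; mbuv starts writing at diff + 256 (the '#512' byte offset above, counted in shorts).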
+ ; + + + EXPORT |vp8_fast_quantize_b_neon| + EXPORT |vp8_fast_quantize_b_pair_neon| + - INCLUDE asm_enc_offsets.asm ++ INCLUDE vp8_asm_enc_offsets.asm + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=4 + + ;vp8_fast_quantize_b_pair_neon(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2); + |vp8_fast_quantize_b_pair_neon| PROC + + stmfd sp!, {r4-r9} + vstmdb sp!, {q4-q7} + + ldr r4, [r0, #vp8_block_coeff] + ldr r5, [r0, #vp8_block_quant_fast] + ldr r6, [r0, #vp8_block_round] + + vld1.16 {q0, q1}, [r4@128] ; load z + + ldr r7, [r2, #vp8_blockd_qcoeff] + + vabs.s16 q4, q0 ; calculate x = abs(z) + vabs.s16 q5, q1 + + ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative + vshr.s16 q2, q0, #15 ; sz + vshr.s16 q3, q1, #15 + + vld1.s16 {q6, q7}, [r6@128] ; load round_ptr [0-15] + vld1.s16 {q8, q9}, [r5@128] ; load quant_ptr [0-15] + + ldr r4, [r1, #vp8_block_coeff] + + vadd.s16 q4, q6 ; x + Round + vadd.s16 q5, q7 + + vld1.16 {q0, q1}, [r4@128] ; load z2 + + vqdmulh.s16 q4, q8 ; y = ((Round+abs(z)) * Quant) >> 16 + vqdmulh.s16 q5, q9 + + vabs.s16 q10, q0 ; calculate x2 = abs(z_2) + vabs.s16 q11, q1 + vshr.s16 q12, q0, #15 ; sz2 + vshr.s16 q13, q1, #15 + + ;modify data to have its original sign + veor.s16 q4, q2 ; y^sz + veor.s16 q5, q3 + + vadd.s16 q10, q6 ; x2 + Round + vadd.s16 q11, q7 + + ldr r8, [r2, #vp8_blockd_dequant] + + vqdmulh.s16 q10, q8 ; y2 = ((Round+abs(z)) * Quant) >> 16 + vqdmulh.s16 q11, q9 + + vshr.s16 q4, #1 ; right shift 1 after vqdmulh + vshr.s16 q5, #1 + + vld1.s16 {q6, q7}, [r8@128] ;load dequant_ptr[i] + + vsub.s16 q4, q2 ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement) + vsub.s16 q5, q3 + + vshr.s16 q10, #1 ; right shift 1 after vqdmulh + vshr.s16 q11, #1 + + ldr r9, [r2, #vp8_blockd_dqcoeff] + + veor.s16 q10, q12 ; y2^sz2 + veor.s16 q11, q13 + + vst1.s16 {q4, q5}, [r7] ; store: qcoeff = x1 + + + vsub.s16 q10, q12 ; x2=(y^sz)-sz = (y^sz)-(-1) (2's complement) + vsub.s16 q11, q13 + + ldr r6, [r3, #vp8_blockd_qcoeff] + + vmul.s16 q2, q6, q4 ; x * Dequant + vmul.s16 q3, q7, q5 + + adr r0, inv_zig_zag ; load ptr of inverse zigzag table + + vceq.s16 q8, q8 ; set q8 to all 1 + + vst1.s16 {q10, q11}, [r6] ; store: qcoeff = x2 + + vmul.s16 q12, q6, q10 ; x2 * Dequant + vmul.s16 q13, q7, q11 + + vld1.16 {q6, q7}, [r0@128] ; load inverse scan order + + vtst.16 q14, q4, q8 ; now find eob + vtst.16 q15, q5, q8 ; non-zero element is set to all 1 + + vst1.s16 {q2, q3}, [r9] ; store dqcoeff = x * Dequant + + ldr r7, [r3, #vp8_blockd_dqcoeff] + + vand q0, q6, q14 ; get all valid numbers from scan array + vand q1, q7, q15 + + vst1.s16 {q12, q13}, [r7] ; store dqcoeff = x * Dequant + + vtst.16 q2, q10, q8 ; now find eob + vtst.16 q3, q11, q8 ; non-zero element is set to all 1 + + vmax.u16 q0, q0, q1 ; find maximum value in q0, q1 + + vand q10, q6, q2 ; get all valid numbers from scan array + vand q11, q7, q3 + vmax.u16 q10, q10, q11 ; find maximum value in q10, q11 + + vmax.u16 d0, d0, d1 + vmax.u16 d20, d20, d21 + vmovl.u16 q0, d0 + vmovl.u16 q10, d20 + + vmax.u32 d0, d0, d1 + vmax.u32 d20, d20, d21 + vpmax.u32 d0, d0, d0 + vpmax.u32 d20, d20, d20 + + ldr r4, [r2, #vp8_blockd_eob] + ldr r5, [r3, #vp8_blockd_eob] + + vst1.8 {d0[0]}, [r4] ; store eob + vst1.8 {d20[0]}, [r5] ; store eob + + vldmia sp!, {q4-q7} + ldmfd sp!, {r4-r9} + bx lr + + ENDP + + ;void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d) + |vp8_fast_quantize_b_neon| PROC + + stmfd sp!, {r4-r7} + + ldr r3, [r0, #vp8_block_coeff] + ldr r4, [r0, #vp8_block_quant_fast] + ldr r5, [r0, 
#vp8_block_round] + + vld1.16 {q0, q1}, [r3@128] ; load z + vorr.s16 q14, q0, q1 ; check if all zero (step 1) + ldr r6, [r1, #vp8_blockd_qcoeff] + ldr r7, [r1, #vp8_blockd_dqcoeff] + vorr.s16 d28, d28, d29 ; check if all zero (step 2) + + vabs.s16 q12, q0 ; calculate x = abs(z) + vabs.s16 q13, q1 + + ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative + vshr.s16 q2, q0, #15 ; sz + vmov r2, r3, d28 ; check if all zero (step 3) + vshr.s16 q3, q1, #15 + + vld1.s16 {q14, q15}, [r5@128]; load round_ptr [0-15] + vld1.s16 {q8, q9}, [r4@128] ; load quant_ptr [0-15] + + vadd.s16 q12, q14 ; x + Round + vadd.s16 q13, q15 + + adr r0, inv_zig_zag ; load ptr of inverse zigzag table + + vqdmulh.s16 q12, q8 ; y = ((Round+abs(z)) * Quant) >> 16 + vqdmulh.s16 q13, q9 + + vld1.16 {q10, q11}, [r0@128]; load inverse scan order + + vceq.s16 q8, q8 ; set q8 to all 1 + + ldr r4, [r1, #vp8_blockd_dequant] + + vshr.s16 q12, #1 ; right shift 1 after vqdmulh + vshr.s16 q13, #1 + + ldr r5, [r1, #vp8_blockd_eob] + + orr r2, r2, r3 ; check if all zero (step 4) + cmp r2, #0 ; check if all zero (step 5) + beq zero_output ; check if all zero (step 6) + + ;modify data to have its original sign + veor.s16 q12, q2 ; y^sz + veor.s16 q13, q3 + + vsub.s16 q12, q2 ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement) + vsub.s16 q13, q3 + + vld1.s16 {q2, q3}, [r4@128] ; load dequant_ptr[i] + + vtst.16 q14, q12, q8 ; now find eob + vtst.16 q15, q13, q8 ; non-zero element is set to all 1 + + vst1.s16 {q12, q13}, [r6@128]; store: qcoeff = x1 + + vand q10, q10, q14 ; get all valid numbers from scan array + vand q11, q11, q15 + + + vmax.u16 q0, q10, q11 ; find maximum value in q0, q1 + vmax.u16 d0, d0, d1 + vmovl.u16 q0, d0 + + vmul.s16 q2, q12 ; x * Dequant + vmul.s16 q3, q13 + + vmax.u32 d0, d0, d1 + vpmax.u32 d0, d0, d0 + + vst1.s16 {q2, q3}, [r7@128] ; store dqcoeff = x * Dequant + + vst1.8 {d0[0]}, [r5] ; store eob + + ldmfd sp!, {r4-r7} + bx lr + + zero_output + strb r2, [r5] ; store eob + vst1.s16 {q0, q1}, [r6@128] ; qcoeff = 0 + vst1.s16 {q0, q1}, [r7@128] ; dqcoeff = 0 + + ldmfd sp!, {r4-r7} + bx lr + + ENDP + + ; default inverse zigzag table is defined in vp8/common/entropy.c + ALIGN 16 ; enable use of @128 bit aligned loads + inv_zig_zag + DCW 0x0001, 0x0002, 0x0006, 0x0007 + DCW 0x0003, 0x0005, 0x0008, 0x000d + DCW 0x0004, 0x0009, 0x000c, 0x000e + DCW 0x000a, 0x000b, 0x000f, 0x0010 + + END + diff --cc vp8/encoder/arm/neon/subtract_neon.asm index 000000000,91a328c29..5bda78678 mode 000000,100644..100644 --- a/vp8/encoder/arm/neon/subtract_neon.asm +++ b/vp8/encoder/arm/neon/subtract_neon.asm @@@ -1,0 -1,199 +1,199 @@@ + ; + ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. + ; + ; Use of this source code is governed by a BSD-style license + ; that can be found in the LICENSE file in the root of the source + ; tree. An additional intellectual property rights grant can be found + ; in the file PATENTS. All contributing project authors may + ; be found in the AUTHORS file in the root of the source tree. 
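Rather than scanning for the end of block, the NEON quantizers in fastquantizeb_neon.asm above mask the inverse zig-zag table (1-based scan positions) by which quantized coefficients are nonzero, then take the maximum. A self-contained scalar sketch of that trick:

    static int eob_from_inv_zigzag_sketch(const short *qcoeff)
    {
        /* inv_zig_zag[rc] = 1-based scan position of raster index rc,
           same values as the DCW table above */
        static const unsigned short inv_zig_zag[16] = {
            1, 2, 6, 7, 3, 5, 8, 13, 4, 9, 12, 14, 10, 11, 15, 16
        };
        unsigned int eob = 0;
        int rc;

        for (rc = 0; rc < 16; rc++) {
            /* vtst builds an all-ones mask per nonzero lane, vand keeps
               that lane's scan position, the vmax ladder folds to one value */
            const unsigned int pos = qcoeff[rc] ? inv_zig_zag[rc] : 0;
            if (pos > eob)
                eob = pos;
        }
        return (int)eob;
    }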
+ ; + + EXPORT |vp8_subtract_b_neon| + EXPORT |vp8_subtract_mby_neon| + EXPORT |vp8_subtract_mbuv_neon| + - INCLUDE asm_enc_offsets.asm ++ INCLUDE vp8_asm_enc_offsets.asm + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + ;void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch) + |vp8_subtract_b_neon| PROC + + stmfd sp!, {r4-r7} + + ldr r3, [r0, #vp8_block_base_src] + ldr r4, [r0, #vp8_block_src] + ldr r5, [r0, #vp8_block_src_diff] + ldr r3, [r3] + ldr r6, [r0, #vp8_block_src_stride] + add r3, r3, r4 ; src = *base_src + src + ldr r7, [r1, #vp8_blockd_predictor] + + vld1.8 {d0}, [r3], r6 ;load src + vld1.8 {d1}, [r7], r2 ;load pred + vld1.8 {d2}, [r3], r6 + vld1.8 {d3}, [r7], r2 + vld1.8 {d4}, [r3], r6 + vld1.8 {d5}, [r7], r2 + vld1.8 {d6}, [r3], r6 + vld1.8 {d7}, [r7], r2 + + vsubl.u8 q10, d0, d1 + vsubl.u8 q11, d2, d3 + vsubl.u8 q12, d4, d5 + vsubl.u8 q13, d6, d7 + + mov r2, r2, lsl #1 + + vst1.16 {d20}, [r5], r2 ;store diff + vst1.16 {d22}, [r5], r2 + vst1.16 {d24}, [r5], r2 + vst1.16 {d26}, [r5], r2 + + ldmfd sp!, {r4-r7} + bx lr + + ENDP + + + ;========================================== + ;void vp8_subtract_mby_neon(short *diff, unsigned char *src, int src_stride + ; unsigned char *pred, int pred_stride) + |vp8_subtract_mby_neon| PROC + push {r4-r7} + mov r12, #4 + ldr r4, [sp, #16] ; pred_stride + mov r6, #32 ; "diff" stride x2 + add r5, r0, #16 ; second diff pointer + + subtract_mby_loop + vld1.8 {q0}, [r1], r2 ;load src + vld1.8 {q1}, [r3], r4 ;load pred + vld1.8 {q2}, [r1], r2 + vld1.8 {q3}, [r3], r4 + vld1.8 {q4}, [r1], r2 + vld1.8 {q5}, [r3], r4 + vld1.8 {q6}, [r1], r2 + vld1.8 {q7}, [r3], r4 + + vsubl.u8 q8, d0, d2 + vsubl.u8 q9, d1, d3 + vsubl.u8 q10, d4, d6 + vsubl.u8 q11, d5, d7 + vsubl.u8 q12, d8, d10 + vsubl.u8 q13, d9, d11 + vsubl.u8 q14, d12, d14 + vsubl.u8 q15, d13, d15 + + vst1.16 {q8}, [r0], r6 ;store diff + vst1.16 {q9}, [r5], r6 + vst1.16 {q10}, [r0], r6 + vst1.16 {q11}, [r5], r6 + vst1.16 {q12}, [r0], r6 + vst1.16 {q13}, [r5], r6 + vst1.16 {q14}, [r0], r6 + vst1.16 {q15}, [r5], r6 + + subs r12, r12, #1 + bne subtract_mby_loop + + pop {r4-r7} + bx lr + ENDP + + ;================================= + ;void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, + ; int src_stride, unsigned char *upred, + ; unsigned char *vpred, int pred_stride) + + |vp8_subtract_mbuv_neon| PROC + push {r4-r7} + ldr r4, [sp, #16] ; upred + ldr r5, [sp, #20] ; vpred + ldr r6, [sp, #24] ; pred_stride + add r0, r0, #512 ; short *udiff = diff + 256; + mov r12, #32 ; "diff" stride x2 + add r7, r0, #16 ; second diff pointer + + ;u + vld1.8 {d0}, [r1], r3 ;load usrc + vld1.8 {d1}, [r4], r6 ;load upred + vld1.8 {d2}, [r1], r3 + vld1.8 {d3}, [r4], r6 + vld1.8 {d4}, [r1], r3 + vld1.8 {d5}, [r4], r6 + vld1.8 {d6}, [r1], r3 + vld1.8 {d7}, [r4], r6 + vld1.8 {d8}, [r1], r3 + vld1.8 {d9}, [r4], r6 + vld1.8 {d10}, [r1], r3 + vld1.8 {d11}, [r4], r6 + vld1.8 {d12}, [r1], r3 + vld1.8 {d13}, [r4], r6 + vld1.8 {d14}, [r1], r3 + vld1.8 {d15}, [r4], r6 + + vsubl.u8 q8, d0, d1 + vsubl.u8 q9, d2, d3 + vsubl.u8 q10, d4, d5 + vsubl.u8 q11, d6, d7 + vsubl.u8 q12, d8, d9 + vsubl.u8 q13, d10, d11 + vsubl.u8 q14, d12, d13 + vsubl.u8 q15, d14, d15 + + vst1.16 {q8}, [r0], r12 ;store diff + vst1.16 {q9}, [r7], r12 + vst1.16 {q10}, [r0], r12 + vst1.16 {q11}, [r7], r12 + vst1.16 {q12}, [r0], r12 + vst1.16 {q13}, [r7], r12 + vst1.16 {q14}, [r0], r12 + vst1.16 {q15}, [r7], r12 + + ;v + vld1.8 {d0}, [r2], r3 ;load vsrc + vld1.8 {d1}, [r5], r6 ;load vpred + vld1.8 {d2}, [r2], 
r3 + vld1.8 {d3}, [r5], r6 + vld1.8 {d4}, [r2], r3 + vld1.8 {d5}, [r5], r6 + vld1.8 {d6}, [r2], r3 + vld1.8 {d7}, [r5], r6 + vld1.8 {d8}, [r2], r3 + vld1.8 {d9}, [r5], r6 + vld1.8 {d10}, [r2], r3 + vld1.8 {d11}, [r5], r6 + vld1.8 {d12}, [r2], r3 + vld1.8 {d13}, [r5], r6 + vld1.8 {d14}, [r2], r3 + vld1.8 {d15}, [r5], r6 + + vsubl.u8 q8, d0, d1 + vsubl.u8 q9, d2, d3 + vsubl.u8 q10, d4, d5 + vsubl.u8 q11, d6, d7 + vsubl.u8 q12, d8, d9 + vsubl.u8 q13, d10, d11 + vsubl.u8 q14, d12, d13 + vsubl.u8 q15, d14, d15 + + vst1.16 {q8}, [r0], r12 ;store diff + vst1.16 {q9}, [r7], r12 + vst1.16 {q10}, [r0], r12 + vst1.16 {q11}, [r7], r12 + vst1.16 {q12}, [r0], r12 + vst1.16 {q13}, [r7], r12 + vst1.16 {q14}, [r0], r12 + vst1.16 {q15}, [r7], r12 + + pop {r4-r7} + bx lr + + ENDP + + END diff --cc vp8/encoder/x86/quantize_sse2.asm index 000000000,724e54c45..fe9464b3d mode 000000,100644..100644 --- a/vp8/encoder/x86/quantize_sse2.asm +++ b/vp8/encoder/x86/quantize_sse2.asm @@@ -1,0 -1,386 +1,386 @@@ + ; + ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. + ; + ; Use of this source code is governed by a BSD-style license and patent + ; grant that can be found in the LICENSE file in the root of the source + ; tree. All contributing project authors may be found in the AUTHORS + ; file in the root of the source tree. + ; + + + %include "vpx_ports/x86_abi_support.asm" -%include "asm_enc_offsets.asm" ++%include "vp8_asm_enc_offsets.asm" + + + ; void vp8_regular_quantize_b_sse2 | arg + ; (BLOCK *b, | 0 + ; BLOCKD *d) | 1 + + global sym(vp8_regular_quantize_b_sse2) PRIVATE + sym(vp8_regular_quantize_b_sse2): + push rbp + mov rbp, rsp + SAVE_XMM 7 + GET_GOT rbx + + %if ABI_IS_32BIT + push rdi + push rsi + %else + %ifidn __OUTPUT_FORMAT__,x64 + push rdi + push rsi + %endif + %endif + + ALIGN_STACK 16, rax + %define zrun_zbin_boost 0 ; 8 + %define abs_minus_zbin 8 ; 32 + %define temp_qcoeff 40 ; 32 + %define qcoeff 72 ; 32 + %define stack_size 104 + sub rsp, stack_size + ; end prolog + + %if ABI_IS_32BIT + mov rdi, arg(0) ; BLOCK *b + mov rsi, arg(1) ; BLOCKD *d + %else + %ifidn __OUTPUT_FORMAT__,x64 + mov rdi, rcx ; BLOCK *b + mov rsi, rdx ; BLOCKD *d + %else + ;mov rdi, rdi ; BLOCK *b + ;mov rsi, rsi ; BLOCKD *d + %endif + %endif + + mov rdx, [rdi + vp8_block_coeff] ; coeff_ptr + mov rcx, [rdi + vp8_block_zbin] ; zbin_ptr + movd xmm7, [rdi + vp8_block_zbin_extra] ; zbin_oq_value + + ; z + movdqa xmm0, [rdx] + movdqa xmm4, [rdx + 16] + mov rdx, [rdi + vp8_block_round] ; round_ptr + + pshuflw xmm7, xmm7, 0 + punpcklwd xmm7, xmm7 ; duplicated zbin_oq_value + + movdqa xmm1, xmm0 + movdqa xmm5, xmm4 + + ; sz + psraw xmm0, 15 + psraw xmm4, 15 + + ; (z ^ sz) + pxor xmm1, xmm0 + pxor xmm5, xmm4 + + ; x = abs(z) + psubw xmm1, xmm0 + psubw xmm5, xmm4 + + movdqa xmm2, [rcx] + movdqa xmm3, [rcx + 16] + mov rcx, [rdi + vp8_block_quant] ; quant_ptr + + ; *zbin_ptr + zbin_oq_value + paddw xmm2, xmm7 + paddw xmm3, xmm7 + + ; x - (*zbin_ptr + zbin_oq_value) + psubw xmm1, xmm2 + psubw xmm5, xmm3 + movdqa [rsp + abs_minus_zbin], xmm1 + movdqa [rsp + abs_minus_zbin + 16], xmm5 + + ; add (zbin_ptr + zbin_oq_value) back + paddw xmm1, xmm2 + paddw xmm5, xmm3 + + movdqa xmm2, [rdx] + movdqa xmm6, [rdx + 16] + + movdqa xmm3, [rcx] + movdqa xmm7, [rcx + 16] + + ; x + round + paddw xmm1, xmm2 + paddw xmm5, xmm6 + + ; y = x * quant_ptr >> 16 + pmulhw xmm3, xmm1 + pmulhw xmm7, xmm5 + + ; y += x + paddw xmm1, xmm3 + paddw xmm5, xmm7 + + movdqa [rsp + temp_qcoeff], xmm1 + movdqa [rsp + temp_qcoeff + 16], xmm5 + + pxor xmm6, xmm6 + 
; zero qcoeff + movdqa [rsp + qcoeff], xmm6 + movdqa [rsp + qcoeff + 16], xmm6 + + mov rdx, [rdi + vp8_block_zrun_zbin_boost] ; zbin_boost_ptr + mov rax, [rdi + vp8_block_quant_shift] ; quant_shift_ptr + mov [rsp + zrun_zbin_boost], rdx + + %macro ZIGZAG_LOOP 1 + ; x + movsx ecx, WORD PTR[rsp + abs_minus_zbin + %1 * 2] + + ; if (x >= zbin) + sub cx, WORD PTR[rdx] ; x - zbin + lea rdx, [rdx + 2] ; zbin_boost_ptr++ + jl .rq_zigzag_loop_%1 ; x < zbin + + movsx edi, WORD PTR[rsp + temp_qcoeff + %1 * 2] + + ; downshift by quant_shift[rc] + movsx cx, BYTE PTR[rax + %1] ; quant_shift_ptr[rc] + sar edi, cl ; also sets Z bit + je .rq_zigzag_loop_%1 ; !y + mov WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc] + mov rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost + .rq_zigzag_loop_%1: + %endmacro + ; in vp8_default_zig_zag1d order: see vp8/common/entropy.c + ZIGZAG_LOOP 0 + ZIGZAG_LOOP 1 + ZIGZAG_LOOP 4 + ZIGZAG_LOOP 8 + ZIGZAG_LOOP 5 + ZIGZAG_LOOP 2 + ZIGZAG_LOOP 3 + ZIGZAG_LOOP 6 + ZIGZAG_LOOP 9 + ZIGZAG_LOOP 12 + ZIGZAG_LOOP 13 + ZIGZAG_LOOP 10 + ZIGZAG_LOOP 7 + ZIGZAG_LOOP 11 + ZIGZAG_LOOP 14 + ZIGZAG_LOOP 15 + + movdqa xmm2, [rsp + qcoeff] + movdqa xmm3, [rsp + qcoeff + 16] + + mov rcx, [rsi + vp8_blockd_dequant] ; dequant_ptr + mov rdi, [rsi + vp8_blockd_dqcoeff] ; dqcoeff_ptr + + ; y ^ sz + pxor xmm2, xmm0 + pxor xmm3, xmm4 + ; x = (y ^ sz) - sz + psubw xmm2, xmm0 + psubw xmm3, xmm4 + + ; dequant + movdqa xmm0, [rcx] + movdqa xmm1, [rcx + 16] + + mov rcx, [rsi + vp8_blockd_qcoeff] ; qcoeff_ptr + + pmullw xmm0, xmm2 + pmullw xmm1, xmm3 + + movdqa [rcx], xmm2 ; store qcoeff + movdqa [rcx + 16], xmm3 + movdqa [rdi], xmm0 ; store dqcoeff + movdqa [rdi + 16], xmm1 + + mov rcx, [rsi + vp8_blockd_eob] + + ; select the last value (in zig_zag order) for EOB + pcmpeqw xmm2, xmm6 + pcmpeqw xmm3, xmm6 + ; ! 
+ pcmpeqw xmm6, xmm6 + pxor xmm2, xmm6 + pxor xmm3, xmm6 + ; mask inv_zig_zag + pand xmm2, [GLOBAL(inv_zig_zag)] + pand xmm3, [GLOBAL(inv_zig_zag + 16)] + ; select the max value + pmaxsw xmm2, xmm3 + pshufd xmm3, xmm2, 00001110b + pmaxsw xmm2, xmm3 + pshuflw xmm3, xmm2, 00001110b + pmaxsw xmm2, xmm3 + pshuflw xmm3, xmm2, 00000001b + pmaxsw xmm2, xmm3 + movd eax, xmm2 + and eax, 0xff + + mov BYTE PTR [rcx], al ; store eob + + ; begin epilog + add rsp, stack_size + pop rsp + %if ABI_IS_32BIT + pop rsi + pop rdi + %else + %ifidn __OUTPUT_FORMAT__,x64 + pop rsi + pop rdi + %endif + %endif + RESTORE_GOT + RESTORE_XMM + pop rbp + ret + + ; void vp8_fast_quantize_b_sse2 | arg + ; (BLOCK *b, | 0 + ; BLOCKD *d) | 1 + + global sym(vp8_fast_quantize_b_sse2) PRIVATE + sym(vp8_fast_quantize_b_sse2): + push rbp + mov rbp, rsp + GET_GOT rbx + + %if ABI_IS_32BIT + push rdi + push rsi + %else + %ifidn __OUTPUT_FORMAT__,x64 + push rdi + push rsi + %else + ; these registers are used for passing arguments + %endif + %endif + + ; end prolog + + %if ABI_IS_32BIT + mov rdi, arg(0) ; BLOCK *b + mov rsi, arg(1) ; BLOCKD *d + %else + %ifidn __OUTPUT_FORMAT__,x64 + mov rdi, rcx ; BLOCK *b + mov rsi, rdx ; BLOCKD *d + %else + ;mov rdi, rdi ; BLOCK *b + ;mov rsi, rsi ; BLOCKD *d + %endif + %endif + + mov rax, [rdi + vp8_block_coeff] + mov rcx, [rdi + vp8_block_round] + mov rdx, [rdi + vp8_block_quant_fast] + + ; z = coeff + movdqa xmm0, [rax] + movdqa xmm4, [rax + 16] + + ; dup z so we can save sz + movdqa xmm1, xmm0 + movdqa xmm5, xmm4 + + ; sz = z >> 15 + psraw xmm0, 15 + psraw xmm4, 15 + + ; x = abs(z) = (z ^ sz) - sz + pxor xmm1, xmm0 + pxor xmm5, xmm4 + psubw xmm1, xmm0 + psubw xmm5, xmm4 + + ; x += round + paddw xmm1, [rcx] + paddw xmm5, [rcx + 16] + + mov rax, [rsi + vp8_blockd_qcoeff] + mov rcx, [rsi + vp8_blockd_dequant] + mov rdi, [rsi + vp8_blockd_dqcoeff] + + ; y = x * quant >> 16 + pmulhw xmm1, [rdx] + pmulhw xmm5, [rdx + 16] + + ; x = (y ^ sz) - sz + pxor xmm1, xmm0 + pxor xmm5, xmm4 + psubw xmm1, xmm0 + psubw xmm5, xmm4 + + ; qcoeff = x + movdqa [rax], xmm1 + movdqa [rax + 16], xmm5 + + ; x * dequant + movdqa xmm2, xmm1 + movdqa xmm3, xmm5 + pmullw xmm2, [rcx] + pmullw xmm3, [rcx + 16] + + ; dqcoeff = x * dequant + movdqa [rdi], xmm2 + movdqa [rdi + 16], xmm3 + + pxor xmm4, xmm4 ;clear all bits + pcmpeqw xmm1, xmm4 + pcmpeqw xmm5, xmm4 + + pcmpeqw xmm4, xmm4 ;set all bits + pxor xmm1, xmm4 + pxor xmm5, xmm4 + + pand xmm1, [GLOBAL(inv_zig_zag)] + pand xmm5, [GLOBAL(inv_zig_zag + 16)] + + pmaxsw xmm1, xmm5 + + mov rcx, [rsi + vp8_blockd_eob] + + ; now down to 8 + pshufd xmm5, xmm1, 00001110b + + pmaxsw xmm1, xmm5 + + ; only 4 left + pshuflw xmm5, xmm1, 00001110b + + pmaxsw xmm1, xmm5 + + ; okay, just 2! + pshuflw xmm5, xmm1, 00000001b + + pmaxsw xmm1, xmm5 + + movd eax, xmm1 + and eax, 0xff + + mov BYTE PTR [rcx], al ; store eob + + ; begin epilog + %if ABI_IS_32BIT + pop rsi + pop rdi + %else + %ifidn __OUTPUT_FORMAT__,x64 + pop rsi + pop rdi + %endif + %endif + + RESTORE_GOT + pop rbp + ret + + SECTION_RODATA + align 16 + inv_zig_zag: + dw 0x0001, 0x0002, 0x0006, 0x0007 + dw 0x0003, 0x0005, 0x0008, 0x000d + dw 0x0004, 0x0009, 0x000c, 0x000e + dw 0x000a, 0x000b, 0x000f, 0x0010 diff --cc vp8/encoder/x86/quantize_sse4.asm index 000000000,f0e5d407e..f21146457 mode 000000,100644..100644 --- a/vp8/encoder/x86/quantize_sse4.asm +++ b/vp8/encoder/x86/quantize_sse4.asm @@@ -1,0 -1,256 +1,256 @@@ + ; + ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
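For reference, the regular (dead-zone) path that vp8_regular_quantize_b_sse2 above implements looks roughly like this in scalar terms; this is a pseudo-C sketch following the asm's own comments (cf. encoder/quantize.c), and ZIGZAG_LOOP unrolls the body sixteen times in vp8_default_zig_zag1d order:

    for (i = 0; i < 16; i++) {
        rc = vp8_default_zig_zag1d[i];
        sz = z[rc] >> 15;
        x  = (z[rc] ^ sz) - sz;                     /* abs(z)           */
        if (x - (zbin[rc] + zbin_extra) < *boost++)
            continue;                               /* inside dead zone */
        y  = ((((x + round[rc]) * quant[rc]) >> 16) + x + round[rc])
                 >> quant_shift[rc];
        if (!y)
            continue;                               /* quantized to 0   */
        qcoeff[rc]  = (y ^ sz) - sz;
        dqcoeff[rc] = qcoeff[rc] * dequant[rc];
        eob   = i + 1;                   /* last nonzero, zig-zag order */
        boost = b->zrun_zbin_boost;      /* zero run ended: reset boost */
    }

The fast path (vp8_fast_quantize_b_sse2) drops the zbin test and the boost bookkeeping and quantizes every coefficient unconditionally, which is why it vectorizes without a scalar zig-zag loop.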
+ ; + ; Use of this source code is governed by a BSD-style license and patent + ; grant that can be found in the LICENSE file in the root of the source + ; tree. All contributing project authors may be found in the AUTHORS + ; file in the root of the source tree. + ; + + + %include "vpx_ports/x86_abi_support.asm" -%include "asm_enc_offsets.asm" ++%include "vp8_asm_enc_offsets.asm" + + + ; void vp8_regular_quantize_b_sse4 | arg + ; (BLOCK *b, | 0 + ; BLOCKD *d) | 1 + + global sym(vp8_regular_quantize_b_sse4) PRIVATE + sym(vp8_regular_quantize_b_sse4): + + %if ABI_IS_32BIT + push rbp + mov rbp, rsp + GET_GOT rbx + push rdi + push rsi + + ALIGN_STACK 16, rax + %define qcoeff 0 ; 32 + %define stack_size 32 + sub rsp, stack_size + %else + %ifidn __OUTPUT_FORMAT__,x64 + SAVE_XMM 8, u + push rdi + push rsi + %endif + %endif + ; end prolog + + %if ABI_IS_32BIT + mov rdi, arg(0) ; BLOCK *b + mov rsi, arg(1) ; BLOCKD *d + %else + %ifidn __OUTPUT_FORMAT__,x64 + mov rdi, rcx ; BLOCK *b + mov rsi, rdx ; BLOCKD *d + %else + ;mov rdi, rdi ; BLOCK *b + ;mov rsi, rsi ; BLOCKD *d + %endif + %endif + + mov rax, [rdi + vp8_block_coeff] + mov rcx, [rdi + vp8_block_zbin] + mov rdx, [rdi + vp8_block_round] + movd xmm7, [rdi + vp8_block_zbin_extra] + + ; z + movdqa xmm0, [rax] + movdqa xmm1, [rax + 16] + + ; duplicate zbin_oq_value + pshuflw xmm7, xmm7, 0 + punpcklwd xmm7, xmm7 + + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + ; sz + psraw xmm0, 15 + psraw xmm1, 15 + + ; (z ^ sz) + pxor xmm2, xmm0 + pxor xmm3, xmm1 + + ; x = abs(z) + psubw xmm2, xmm0 + psubw xmm3, xmm1 + + ; zbin + movdqa xmm4, [rcx] + movdqa xmm5, [rcx + 16] + + ; *zbin_ptr + zbin_oq_value + paddw xmm4, xmm7 + paddw xmm5, xmm7 + + movdqa xmm6, xmm2 + movdqa xmm7, xmm3 + + ; x - (*zbin_ptr + zbin_oq_value) + psubw xmm6, xmm4 + psubw xmm7, xmm5 + + ; round + movdqa xmm4, [rdx] + movdqa xmm5, [rdx + 16] + + mov rax, [rdi + vp8_block_quant_shift] + mov rcx, [rdi + vp8_block_quant] + mov rdx, [rdi + vp8_block_zrun_zbin_boost] + + ; x + round + paddw xmm2, xmm4 + paddw xmm3, xmm5 + + ; quant + movdqa xmm4, [rcx] + movdqa xmm5, [rcx + 16] + + ; y = x * quant_ptr >> 16 + pmulhw xmm4, xmm2 + pmulhw xmm5, xmm3 + + ; y += x + paddw xmm2, xmm4 + paddw xmm3, xmm5 + + pxor xmm4, xmm4 + %if ABI_IS_32BIT + movdqa [rsp + qcoeff], xmm4 + movdqa [rsp + qcoeff + 16], xmm4 + %else + pxor xmm8, xmm8 + %endif + + ; quant_shift + movdqa xmm5, [rax] + + ; zrun_zbin_boost + mov rax, rdx + + %macro ZIGZAG_LOOP 5 + ; x + pextrw ecx, %4, %2 + + ; if (x >= zbin) + sub cx, WORD PTR[rdx] ; x - zbin + lea rdx, [rdx + 2] ; zbin_boost_ptr++ + jl .rq_zigzag_loop_%1 ; x < zbin + + pextrw edi, %3, %2 ; y + + ; downshift by quant_shift[rc] + pextrb ecx, xmm5, %1 ; quant_shift[rc] + sar edi, cl ; also sets Z bit + je .rq_zigzag_loop_%1 ; !y + %if ABI_IS_32BIT + mov WORD PTR[rsp + qcoeff + %1 *2], di + %else + pinsrw %5, edi, %2 ; qcoeff[rc] + %endif + mov rdx, rax ; reset to b->zrun_zbin_boost + .rq_zigzag_loop_%1: + %endmacro + ; in vp8_default_zig_zag1d order: see vp8/common/entropy.c + ZIGZAG_LOOP 0, 0, xmm2, xmm6, xmm4 + ZIGZAG_LOOP 1, 1, xmm2, xmm6, xmm4 + ZIGZAG_LOOP 4, 4, xmm2, xmm6, xmm4 + ZIGZAG_LOOP 8, 0, xmm3, xmm7, xmm8 + ZIGZAG_LOOP 5, 5, xmm2, xmm6, xmm4 + ZIGZAG_LOOP 2, 2, xmm2, xmm6, xmm4 + ZIGZAG_LOOP 3, 3, xmm2, xmm6, xmm4 + ZIGZAG_LOOP 6, 6, xmm2, xmm6, xmm4 + ZIGZAG_LOOP 9, 1, xmm3, xmm7, xmm8 + ZIGZAG_LOOP 12, 4, xmm3, xmm7, xmm8 + ZIGZAG_LOOP 13, 5, xmm3, xmm7, xmm8 + ZIGZAG_LOOP 10, 2, xmm3, xmm7, xmm8 + ZIGZAG_LOOP 7, 7, xmm2, xmm6, xmm4 + ZIGZAG_LOOP 11, 3, xmm3, 
xmm7, xmm8 + ZIGZAG_LOOP 14, 6, xmm3, xmm7, xmm8 + ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8 + + mov rcx, [rsi + vp8_blockd_dequant] + mov rdi, [rsi + vp8_blockd_dqcoeff] + + %if ABI_IS_32BIT + movdqa xmm4, [rsp + qcoeff] + movdqa xmm5, [rsp + qcoeff + 16] + %else + %define xmm5 xmm8 + %endif + + ; y ^ sz + pxor xmm4, xmm0 + pxor xmm5, xmm1 + ; x = (y ^ sz) - sz + psubw xmm4, xmm0 + psubw xmm5, xmm1 + + ; dequant + movdqa xmm0, [rcx] + movdqa xmm1, [rcx + 16] + + mov rcx, [rsi + vp8_blockd_qcoeff] + + pmullw xmm0, xmm4 + pmullw xmm1, xmm5 + + ; store qcoeff + movdqa [rcx], xmm4 + movdqa [rcx + 16], xmm5 + + ; store dqcoeff + movdqa [rdi], xmm0 + movdqa [rdi + 16], xmm1 + + mov rcx, [rsi + vp8_blockd_eob] + + ; select the last value (in zig_zag order) for EOB + pxor xmm6, xmm6 + pcmpeqw xmm4, xmm6 + pcmpeqw xmm5, xmm6 + + packsswb xmm4, xmm5 + pshufb xmm4, [GLOBAL(zig_zag1d)] + pmovmskb edx, xmm4 + xor rdi, rdi + mov eax, -1 + xor dx, ax + bsr eax, edx + sub edi, edx + sar edi, 31 + add eax, 1 + and eax, edi + + mov BYTE PTR [rcx], al ; store eob + + ; begin epilog + %if ABI_IS_32BIT + add rsp, stack_size + pop rsp + + pop rsi + pop rdi + RESTORE_GOT + pop rbp + %else + %undef xmm5 + %ifidn __OUTPUT_FORMAT__,x64 + pop rsi + pop rdi + RESTORE_XMM + %endif + %endif + + ret + + SECTION_RODATA + align 16 + ; vp8/common/entropy.c: vp8_default_zig_zag1d + zig_zag1d: + db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 diff --cc vp8/encoder/x86/quantize_ssse3.asm index 000000000,dd526f4f1..35368894d mode 000000,100644..100644 --- a/vp8/encoder/x86/quantize_ssse3.asm +++ b/vp8/encoder/x86/quantize_ssse3.asm @@@ -1,0 -1,138 +1,138 @@@ + ; + ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. + ; + ; Use of this source code is governed by a BSD-style license and patent + ; grant that can be found in the LICENSE file in the root of the source + ; tree. All contributing project authors may be found in the AUTHORS + ; file in the root of the source tree. 
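The EOB selection at the end of vp8_regular_quantize_b_sse4 above trades the SSE2 version's pmaxsw reduction for a byte shuffle and a bit scan: pcmpeqw/packsswb build a per-coefficient zero mask, pshufb reorders it into zig-zag order, pmovmskb collapses it into a 16-bit mask, and the xor/bsr/sar/and tail is a branchless form of the ternary below. The SSSE3 fast quantizer that follows uses the same trick. Semantically (a sketch with illustrative names):

    /* eob = 1 + index of the last nonzero coefficient, in zig-zag order */
    static int eob_sketch(const short *qcoeff, const unsigned char *zig_zag) {
        unsigned mask = 0;
        int i;
        for (i = 0; i < 16; i++)
            if (qcoeff[zig_zag[i]])
                mask |= 1u << i;
        return mask ? 32 - __builtin_clz(mask) : 0;   /* bsr(mask) + 1 */
    }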
+ ; + + + %include "vpx_ports/x86_abi_support.asm" -%include "asm_enc_offsets.asm" ++%include "vp8_asm_enc_offsets.asm" + + + ; void vp8_fast_quantize_b_ssse3 | arg + ; (BLOCK *b, | 0 + ; BLOCKD *d) | 1 + ; + + global sym(vp8_fast_quantize_b_ssse3) PRIVATE + sym(vp8_fast_quantize_b_ssse3): + push rbp + mov rbp, rsp + GET_GOT rbx + + %if ABI_IS_32BIT + push rdi + push rsi + %else + %ifidn __OUTPUT_FORMAT__,x64 + push rdi + push rsi + %endif + %endif + ; end prolog + + %if ABI_IS_32BIT + mov rdi, arg(0) ; BLOCK *b + mov rsi, arg(1) ; BLOCKD *d + %else + %ifidn __OUTPUT_FORMAT__,x64 + mov rdi, rcx ; BLOCK *b + mov rsi, rdx ; BLOCKD *d + %else + ;mov rdi, rdi ; BLOCK *b + ;mov rsi, rsi ; BLOCKD *d + %endif + %endif + + mov rax, [rdi + vp8_block_coeff] + mov rcx, [rdi + vp8_block_round] + mov rdx, [rdi + vp8_block_quant_fast] + + ; coeff + movdqa xmm0, [rax] + movdqa xmm4, [rax + 16] + + ; round + movdqa xmm2, [rcx] + movdqa xmm3, [rcx + 16] + + movdqa xmm1, xmm0 + movdqa xmm5, xmm4 + + ; sz = z >> 15 + psraw xmm0, 15 + psraw xmm4, 15 + + pabsw xmm1, xmm1 + pabsw xmm5, xmm5 + + paddw xmm1, xmm2 + paddw xmm5, xmm3 + + ; quant_fast + pmulhw xmm1, [rdx] + pmulhw xmm5, [rdx + 16] + + mov rax, [rsi + vp8_blockd_qcoeff] + mov rdi, [rsi + vp8_blockd_dequant] + mov rcx, [rsi + vp8_blockd_dqcoeff] + + movdqa xmm2, xmm1 ;store y for getting eob + movdqa xmm3, xmm5 + + pxor xmm1, xmm0 + pxor xmm5, xmm4 + psubw xmm1, xmm0 + psubw xmm5, xmm4 + + movdqa [rax], xmm1 + movdqa [rax + 16], xmm5 + + movdqa xmm0, [rdi] + movdqa xmm4, [rdi + 16] + + pmullw xmm0, xmm1 + pmullw xmm4, xmm5 + pxor xmm1, xmm1 + + pcmpgtw xmm2, xmm1 ;calculate eob + pcmpgtw xmm3, xmm1 + packsswb xmm2, xmm3 + pshufb xmm2, [GLOBAL(zz_shuf)] + + pmovmskb edx, xmm2 + + movdqa [rcx], xmm0 ;store dqcoeff + movdqa [rcx + 16], xmm4 ;store dqcoeff + mov rcx, [rsi + vp8_blockd_eob] + + bsr eax, edx ;count 0 + add eax, 1 + + cmp edx, 0 ;if all 0, eob=0 + cmove eax, edx + + mov BYTE PTR [rcx], al ;store eob + + ; begin epilog + %if ABI_IS_32BIT + pop rsi + pop rdi + %else + %ifidn __OUTPUT_FORMAT__,x64 + pop rsi + pop rdi + %endif + %endif + + RESTORE_GOT + pop rbp + ret + + SECTION_RODATA + align 16 + zz_shuf: + db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 diff --cc vp8/vp8_common.mk index 000000000,a328f46c2..d54c2330c mode 000000,100644..100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@@ -1,0 -1,193 +1,196 @@@ + ## + ## Copyright (c) 2010 The WebM project authors. All Rights Reserved. + ## + ## Use of this source code is governed by a BSD-style license + ## that can be found in the LICENSE file in the root of the source + ## tree. An additional intellectual property rights grant can be found + ## in the file PATENTS. All contributing project authors may + ## be found in the AUTHORS file in the root of the source tree. 
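vp8_common.mk below gathers sources into make lists whose suffix is a configure result: a guard that expands to yes adds the file to the list the build consumes, while a guard that expands to no parks it in a list nothing reads. Two lines from these makefiles show the whole pattern:

    # $(HAVE_SSE2) expands to yes or no at configure time
    VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_blk_sse2.c

    # per-target lists later drop anything a config put on a REMOVE list
    VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes))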
+ ## + + VP8_COMMON_SRCS-yes += vp8_common.mk + VP8_COMMON_SRCS-yes += common/pragmas.h + VP8_COMMON_SRCS-yes += common/ppflags.h + VP8_COMMON_SRCS-yes += common/onyx.h + VP8_COMMON_SRCS-yes += common/onyxd.h + VP8_COMMON_SRCS-yes += common/alloccommon.c + VP8_COMMON_SRCS-yes += common/asm_com_offsets.c + VP8_COMMON_SRCS-yes += common/blockd.c + VP8_COMMON_SRCS-yes += common/coefupdateprobs.h + VP8_COMMON_SRCS-yes += common/debugmodes.c + VP8_COMMON_SRCS-yes += common/default_coef_probs.h + VP8_COMMON_SRCS-yes += common/dequantize.c + VP8_COMMON_SRCS-yes += common/entropy.c + VP8_COMMON_SRCS-yes += common/entropymode.c + VP8_COMMON_SRCS-yes += common/entropymv.c + VP8_COMMON_SRCS-yes += common/extend.c + VP8_COMMON_SRCS-yes += common/filter.c + VP8_COMMON_SRCS-yes += common/filter.h + VP8_COMMON_SRCS-yes += common/findnearmv.c + VP8_COMMON_SRCS-yes += common/generic/systemdependent.c + VP8_COMMON_SRCS-yes += common/idct_blk.c + VP8_COMMON_SRCS-yes += common/idctllm.c + VP8_COMMON_SRCS-yes += common/alloccommon.h + VP8_COMMON_SRCS-yes += common/blockd.h + VP8_COMMON_SRCS-yes += common/common.h + VP8_COMMON_SRCS-yes += common/entropy.h + VP8_COMMON_SRCS-yes += common/entropymode.h + VP8_COMMON_SRCS-yes += common/entropymv.h + VP8_COMMON_SRCS-yes += common/extend.h + VP8_COMMON_SRCS-yes += common/findnearmv.h + VP8_COMMON_SRCS-yes += common/header.h + VP8_COMMON_SRCS-yes += common/invtrans.h + VP8_COMMON_SRCS-yes += common/loopfilter.h + VP8_COMMON_SRCS-yes += common/modecont.h + VP8_COMMON_SRCS-yes += common/mv.h + VP8_COMMON_SRCS-yes += common/onyxc_int.h + VP8_COMMON_SRCS-yes += common/quant_common.h + VP8_COMMON_SRCS-yes += common/reconinter.h + VP8_COMMON_SRCS-yes += common/reconintra4x4.h + VP8_COMMON_SRCS-yes += common/rtcd.c + VP8_COMMON_SRCS-yes += common/rtcd_defs.sh + VP8_COMMON_SRCS-yes += common/setupintrarecon.h + VP8_COMMON_SRCS-yes += common/swapyv12buffer.h + VP8_COMMON_SRCS-yes += common/systemdependent.h + VP8_COMMON_SRCS-yes += common/threading.h + VP8_COMMON_SRCS-yes += common/treecoder.h + VP8_COMMON_SRCS-yes += common/loopfilter.c + VP8_COMMON_SRCS-yes += common/loopfilter_filters.c + VP8_COMMON_SRCS-yes += common/mbpitch.c + VP8_COMMON_SRCS-yes += common/modecont.c + VP8_COMMON_SRCS-yes += common/quant_common.c + VP8_COMMON_SRCS-yes += common/reconinter.c + VP8_COMMON_SRCS-yes += common/reconintra.c + VP8_COMMON_SRCS-yes += common/reconintra4x4.c + VP8_COMMON_SRCS-yes += common/sad_c.c + VP8_COMMON_SRCS-yes += common/setupintrarecon.c + VP8_COMMON_SRCS-yes += common/swapyv12buffer.c + VP8_COMMON_SRCS-yes += common/variance_c.c + VP8_COMMON_SRCS-yes += common/variance.h + VP8_COMMON_SRCS-yes += common/vp8_entropymodedata.h + + + + VP8_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/textblit.c + VP8_COMMON_SRCS-yes += common/treecoder.c + + VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.c + VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.h + VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp8_asm_stubs.c + VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/loopfilter_x86.c + VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/mfqe.c + VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.h + VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.c + VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/dequantize_mmx.asm + VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idct_blk_mmx.c + VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idctllm_mmx.asm + VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/iwalsh_mmx.asm + VP8_COMMON_SRCS-$(HAVE_MMX) += 
common/x86/loopfilter_mmx.asm + VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm + VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/sad_mmx.asm + VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm + VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/variance_mmx.c + VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/variance_impl_mmx.asm + VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_blk_sse2.c + VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm + VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm + VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_wrapper_sse2.c + VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/sad_sse2.asm + VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm + VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm + VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm + VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/variance_sse2.c + VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/variance_impl_sse2.asm + VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/sad_sse3.asm + VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/sad_ssse3.asm + VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm + VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/variance_ssse3.c + VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/variance_impl_ssse3.asm + VP8_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/sad_sse4.asm + + ifeq ($(CONFIG_POSTPROC),yes) + VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/postproc_x86.c + VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm + VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/mfqe_sse2.asm + VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/postproc_sse2.asm + endif + + ifeq ($(ARCH_X86_64),yes) + VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_block_sse2.asm + endif + + # common (c) + VP8_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/idctllm_dspr2.c + VP8_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/filter_dspr2.c + VP8_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/loopfilter_filters_dspr2.c + VP8_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/reconinter_dspr2.c + VP8_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/idct_blk_dspr2.c + VP8_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/dequantize_dspr2.c + + # common (c) + VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/filter_arm.c + VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/loopfilter_arm.c + VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/reconintra_arm.c + VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/dequantize_arm.c + VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/variance_arm.c + + # common (media) + VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/bilinearfilter_arm.c + VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/bilinearfilter_arm.h + VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/bilinearfilter_v6$(ASM) + VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/copymem8x4_v6$(ASM) + VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/copymem8x8_v6$(ASM) + VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/copymem16x16_v6$(ASM) + VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dc_only_idct_add_v6$(ASM) + VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/iwalsh_v6$(ASM) + VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/filter_v6$(ASM) + VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/idct_v6$(ASM) + VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/loopfilter_v6$(ASM) + VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/simpleloopfilter_v6$(ASM) + VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/sixtappredict8x4_v6$(ASM) + VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/intra4x4_predict_v6$(ASM) + 
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequant_idct_v6$(ASM) + VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequantize_v6$(ASM) + VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/idct_blk_v6.c + VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_sad16x16_armv6$(ASM) + VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance8x8_armv6$(ASM) + VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance16x16_armv6$(ASM) + VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM) + VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM) + VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM) + + # common (neon) + VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict4x4_neon$(ASM) + VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict8x4_neon$(ASM) + VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict8x8_neon$(ASM) + VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict16x16_neon$(ASM) + VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/copymem8x4_neon$(ASM) + VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/copymem8x8_neon$(ASM) + VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/copymem16x16_neon$(ASM) + VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dc_only_idct_add_neon$(ASM) + VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iwalsh_neon$(ASM) + VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfilter_neon$(ASM) + VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimplehorizontaledge_neon$(ASM) + VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM) + VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/mbloopfilter_neon$(ASM) + VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon$(ASM) + VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sad8_neon$(ASM) + VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sad16_neon$(ASM) + VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict4x4_neon$(ASM) + VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict8x4_neon$(ASM) + VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict8x8_neon$(ASM) + VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict16x16_neon$(ASM) + VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/buildintrapredictorsmby_neon$(ASM) + VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/save_reg_neon$(ASM) + VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequant_idct_neon$(ASM) + VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_dequant_full_2x_neon$(ASM) + VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_dequant_0_2x_neon$(ASM) + VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequantizeb_neon$(ASM) + VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_blk_neon.c + VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/variance_neon$(ASM) + VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance8x8_neon$(ASM) + VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance16x16_neon$(ASM) + VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance16x16s_neon$(ASM) ++ ++$(eval $(call asm_offsets_template,\ ++ vp8_asm_com_offsets.asm, $(VP8_PREFIX)common/asm_com_offsets.c)) diff --cc vp8/vp8cx.mk index 000000000,5976297bc..4ff3ef2c0 mode 000000,100644..100644 --- a/vp8/vp8cx.mk +++ b/vp8/vp8cx.mk @@@ -1,0 -1,124 +1,125 @@@ + ## + ## Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
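The asm_offsets_template call just added generates vp8_asm_com_offsets.asm from common/asm_com_offsets.c at build time, and vp8cx.mk and vp8dx.mk below add the matching vp8_asm_enc_offsets.asm and vp8_asm_dec_offsets.asm rules. Giving the generated files per-codec names is what the INCLUDE changes in the assembly sources above are for: with vp8_ and vp9_ prefixes, both codecs' offset files can coexist in one build tree. The offsets sources follow the usual extract-struct-offsets-from-compiled-C idiom, roughly (a sketch with toy stand-in types; the real files define the constants against the actual BLOCK/BLOCKD headers and add compile-time asserts):

    #include <stddef.h>

    /* toy stand-ins for the real encoder structs */
    typedef struct { short *coeff; } BLOCK;
    typedef struct { short *qcoeff; } BLOCKD;

    #define DEFINE(sym, val) const int sym = val
    DEFINE(vp8_block_coeff,   offsetof(BLOCK,  coeff));
    DEFINE(vp8_blockd_qcoeff, offsetof(BLOCKD, qcoeff));
    /* the build compiles this to assembly and scrapes the constants
       into vp8_asm_enc_offsets.asm for the .asm files to INCLUDE */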
+ ## + ## Use of this source code is governed by a BSD-style license + ## that can be found in the LICENSE file in the root of the source + ## tree. An additional intellectual property rights grant can be found + ## in the file PATENTS. All contributing project authors may + ## be found in the AUTHORS file in the root of the source tree. + ## + + -include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8_common.mk - + VP8_CX_EXPORTS += exports_enc + + VP8_CX_SRCS-yes += $(VP8_COMMON_SRCS-yes) + VP8_CX_SRCS-no += $(VP8_COMMON_SRCS-no) + VP8_CX_SRCS_REMOVE-yes += $(VP8_COMMON_SRCS_REMOVE-yes) + VP8_CX_SRCS_REMOVE-no += $(VP8_COMMON_SRCS_REMOVE-no) + + ifeq ($(ARCH_ARM),yes) + include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8cx_arm.mk + endif + + VP8_CX_SRCS-yes += vp8_cx_iface.c + + # encoder + #INCLUDES += algo/vpx_common/vpx_mem/include + #INCLUDES += common + #INCLUDES += common + #INCLUDES += common + #INCLUDES += algo/vpx_ref/cpu_id/include + #INCLUDES += common + #INCLUDES += encoder + + VP8_CX_SRCS-yes += encoder/asm_enc_offsets.c + VP8_CX_SRCS-yes += encoder/defaultcoefcounts.h + VP8_CX_SRCS-yes += encoder/bitstream.c + VP8_CX_SRCS-yes += encoder/boolhuff.c + VP8_CX_SRCS-yes += encoder/dct.c + VP8_CX_SRCS-yes += encoder/encodeframe.c + VP8_CX_SRCS-yes += encoder/encodeframe.h + VP8_CX_SRCS-yes += encoder/encodeintra.c + VP8_CX_SRCS-yes += encoder/encodemb.c + VP8_CX_SRCS-yes += encoder/encodemv.c + VP8_CX_SRCS-$(CONFIG_MULTITHREAD) += encoder/ethreading.c + VP8_CX_SRCS-yes += encoder/firstpass.c + VP8_CX_SRCS-yes += encoder/block.h + VP8_CX_SRCS-yes += encoder/boolhuff.h + VP8_CX_SRCS-yes += encoder/bitstream.h + VP8_CX_SRCS-$(CONFIG_TEMPORAL_DENOISING) += encoder/denoising.h + VP8_CX_SRCS-$(CONFIG_TEMPORAL_DENOISING) += encoder/denoising.c + VP8_CX_SRCS-yes += encoder/encodeintra.h + VP8_CX_SRCS-yes += encoder/encodemb.h + VP8_CX_SRCS-yes += encoder/encodemv.h + VP8_CX_SRCS-yes += encoder/firstpass.h + VP8_CX_SRCS-yes += encoder/lookahead.c + VP8_CX_SRCS-yes += encoder/lookahead.h + VP8_CX_SRCS-yes += encoder/mcomp.h + VP8_CX_SRCS-yes += encoder/modecosts.h + VP8_CX_SRCS-yes += encoder/onyx_int.h + VP8_CX_SRCS-yes += encoder/pickinter.h + VP8_CX_SRCS-yes += encoder/psnr.h + VP8_CX_SRCS-yes += encoder/quantize.h + VP8_CX_SRCS-yes += encoder/ratectrl.h + VP8_CX_SRCS-yes += encoder/rdopt.h + VP8_CX_SRCS-yes += encoder/tokenize.h + VP8_CX_SRCS-yes += encoder/treewriter.h + VP8_CX_SRCS-yes += encoder/mcomp.c + VP8_CX_SRCS-yes += encoder/modecosts.c + VP8_CX_SRCS-yes += encoder/onyx_if.c + VP8_CX_SRCS-yes += encoder/pickinter.c + VP8_CX_SRCS-yes += encoder/picklpf.c + VP8_CX_SRCS-yes += encoder/psnr.c + VP8_CX_SRCS-yes += encoder/quantize.c + VP8_CX_SRCS-yes += encoder/ratectrl.c + VP8_CX_SRCS-yes += encoder/rdopt.c + VP8_CX_SRCS-yes += encoder/segmentation.c + VP8_CX_SRCS-yes += encoder/segmentation.h + VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/ssim.c + VP8_CX_SRCS-yes += encoder/tokenize.c + VP8_CX_SRCS-yes += encoder/dct_value_cost.h + VP8_CX_SRCS-yes += encoder/dct_value_tokens.h + VP8_CX_SRCS-yes += encoder/treewriter.c + VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.h + VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.c + VP8_CX_SRCS-yes += encoder/temporal_filter.c + VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.c + VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.h + + ifeq ($(CONFIG_REALTIME_ONLY),yes) + VP8_CX_SRCS_REMOVE-yes += encoder/firstpass.c + VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.c + endif + + VP8_CX_SRCS-$(HAVE_MMX) += 
encoder/x86/dct_mmx.asm + VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/subtract_mmx.asm + VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c + VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm + VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm + VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm + + ifeq ($(CONFIG_TEMPORAL_DENOISING),yes) + VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoising_sse2.c + ifeq ($(HAVE_SSE2),yes) + vp8/encoder/x86/denoising_sse2.c.o: CFLAGS += -msse2 + endif + endif + + VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm + VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm + VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_enc_stubs_sse2.c + VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm + VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.asm + VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm + VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm + VP8_CX_SRCS-$(ARCH_X86_64) += encoder/x86/ssim_opt.asm + + ifeq ($(CONFIG_REALTIME_ONLY),yes) + VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm + endif + + + VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes)) ++ ++$(eval $(call asm_offsets_template,\ ++ vp8_asm_enc_offsets.asm, $(VP8_PREFIX)encoder/asm_enc_offsets.c)) diff --cc vp8/vp8dx.mk index 000000000,2cfd280cb..5753e04e5 mode 000000,100644..100644 --- a/vp8/vp8dx.mk +++ b/vp8/vp8dx.mk @@@ -1,0 -1,64 +1,65 @@@ + ## + ## Copyright (c) 2010 The WebM project authors. All Rights Reserved. + ## + ## Use of this source code is governed by a BSD-style license + ## that can be found in the LICENSE file in the root of the source + ## tree. An additional intellectual property rights grant can be found + ## in the file PATENTS. All contributing project authors may + ## be found in the AUTHORS file in the root of the source tree. 
+ ## + + -include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8_common.mk - + VP8_DX_EXPORTS += exports_dec + + VP8_DX_SRCS-yes += $(VP8_COMMON_SRCS-yes) + VP8_DX_SRCS-no += $(VP8_COMMON_SRCS-no) + VP8_DX_SRCS_REMOVE-yes += $(VP8_COMMON_SRCS_REMOVE-yes) + VP8_DX_SRCS_REMOVE-no += $(VP8_COMMON_SRCS_REMOVE-no) + + VP8_DX_SRCS-yes += vp8_dx_iface.c + + # common + #define ARM + #define DISABLE_THREAD + + #INCLUDES += algo/vpx_common/vpx_mem/include + #INCLUDES += common + #INCLUDES += common + #INCLUDES += common + #INCLUDES += common + #INCLUDES += decoder + + + + # decoder + #define ARM + #define DISABLE_THREAD + + #INCLUDES += algo/vpx_common/vpx_mem/include + #INCLUDES += common + #INCLUDES += common + #INCLUDES += common + #INCLUDES += common + #INCLUDES += decoder + + VP8_DX_SRCS-yes += decoder/asm_dec_offsets.c + VP8_DX_SRCS-yes += decoder/dboolhuff.c + VP8_DX_SRCS-yes += decoder/decodemv.c + VP8_DX_SRCS-yes += decoder/decodframe.c + VP8_DX_SRCS-yes += decoder/detokenize.c + VP8_DX_SRCS-$(CONFIG_ERROR_CONCEALMENT) += decoder/ec_types.h + VP8_DX_SRCS-$(CONFIG_ERROR_CONCEALMENT) += decoder/error_concealment.h + VP8_DX_SRCS-$(CONFIG_ERROR_CONCEALMENT) += decoder/error_concealment.c + VP8_DX_SRCS-yes += decoder/dboolhuff.h + VP8_DX_SRCS-yes += decoder/decodemv.h + VP8_DX_SRCS-yes += decoder/decoderthreading.h + VP8_DX_SRCS-yes += decoder/detokenize.h + VP8_DX_SRCS-yes += decoder/onyxd_int.h + VP8_DX_SRCS-yes += decoder/treereader.h + VP8_DX_SRCS-yes += decoder/onyxd_if.c + VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/threading.c + + VP8_DX_SRCS-yes := $(filter-out $(VP8_DX_SRCS_REMOVE-yes),$(VP8_DX_SRCS-yes)) ++ ++$(eval $(call asm_offsets_template,\ ++ vp8_asm_dec_offsets.asm, $(VP8_PREFIX)decoder/asm_dec_offsets.c)) diff --cc vp9/common/rtcd_defs.sh index f909c1898,000000000..fc9195427 mode 100644,000000..100644 --- a/vp9/common/rtcd_defs.sh +++ b/vp9/common/rtcd_defs.sh @@@ -1,518 -1,0 +1,518 @@@ +common_forward_decls() { +cat <> 8) + + addne r2, r2, r4 ; if (bit) lowvalue += split + subne r4, r5, r4 ; if (bit) range = range-split + + ; Counting the leading zeros is used to normalize range. + clz r6, r4 + sub r6, r6, #24 ; shift + + ; Flag is set on the sum of count. 
This flag is used later + ; to determine if count >= 0 + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi token_count_lt_zero ; if(count >= 0) + + sub r6, r6, r3 ; offset = shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl token_high_bit_not_set + + ldr r4, [r0, #vp9_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos-1 + b token_zero_while_start +token_zero_while_loop + mov r9, #0 + strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +token_zero_while_start + cmp r4, #0 + ldrge r7, [r0, #vp9_writer_buffer] + ldrb r1, [r7, r4] + cmpge r1, #0xff + beq token_zero_while_loop + + ldr r7, [r0, #vp9_writer_buffer] + ldrb r9, [r7, r4] ; w->buffer[x] + add r9, r9, #1 + strb r9, [r7, r4] ; w->buffer[x] + 1 +token_high_bit_not_set + rsb r4, r6, #24 ; 24-offset + ldr r9, [r0, #vp9_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp9_writer_pos] ; w->pos + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r1, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r1, [r0, #vp9_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r9, r4] ; w->buffer[w->pos++] + +token_count_lt_zero + lsl r2, r2, r6 ; lowvalue <<= shift + + str r2, [r0, #vp9_writer_lowvalue] + str r5, [r0, #vp9_writer_range] + str r3, [r0, #vp9_writer_count] + pop {r4-r9, pc} + ENDP + +; r0 BOOL_CODER *br +|vp8_stop_encode| PROC + push {r4-r10, lr} + + ldr r2, [r0, #vp9_writer_lowvalue] + ldr r5, [r0, #vp9_writer_range] + ldr r3, [r0, #vp9_writer_count] + + mov r10, #32 + +stop_encode_loop + sub r7, r5, #1 ; range-1 + + mov r4, r7, lsl #7 ; ((range-1) * 128) + + mov r7, #1 + add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8) + + ; Counting the leading zeros is used to normalize range. + clz r6, r4 + sub r6, r6, #24 ; shift + + ; Flag is set on the sum of count. 
This flag is used later + ; to determine if count >= 0 + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi token_count_lt_zero_se ; if(count >= 0) + + sub r6, r6, r3 ; offset = shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl token_high_bit_not_set_se + + ldr r4, [r0, #vp9_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos-1 + b token_zero_while_start_se +token_zero_while_loop_se + mov r9, #0 + strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +token_zero_while_start_se + cmp r4, #0 + ldrge r7, [r0, #vp9_writer_buffer] + ldrb r1, [r7, r4] + cmpge r1, #0xff + beq token_zero_while_loop_se + + ldr r7, [r0, #vp9_writer_buffer] + ldrb r9, [r7, r4] ; w->buffer[x] + add r9, r9, #1 + strb r9, [r7, r4] ; w->buffer[x] + 1 +token_high_bit_not_set_se + rsb r4, r6, #24 ; 24-offset + ldr r9, [r0, #vp9_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp9_writer_pos] ; w->pos + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r1, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r1, [r0, #vp9_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r9, r4] ; w->buffer[w->pos++] + +token_count_lt_zero_se + lsl r2, r2, r6 ; lowvalue <<= shift + + subs r10, r10, #1 + bne stop_encode_loop + + str r2, [r0, #vp9_writer_lowvalue] + str r5, [r0, #vp9_writer_range] + str r3, [r0, #vp9_writer_count] + pop {r4-r10, pc} + + ENDP + +; r0 BOOL_CODER *br +; r1 int data +; r2 int bits +|vp8_encode_value| PROC + push {r4-r11, lr} + + mov r10, r2 + + ldr r2, [r0, #vp9_writer_lowvalue] + ldr r5, [r0, #vp9_writer_range] + ldr r3, [r0, #vp9_writer_count] + + rsb r4, r10, #32 ; 32-n + + ; v is kept in r1 during the token pack loop + lsl r1, r1, r4 ; r1 = v << 32 - n + +encode_value_loop + sub r7, r5, #1 ; range-1 + + ; Decisions are made based on the bit value shifted + ; off of v, so set a flag here based on this. + ; This value is refered to as "bb" + lsls r1, r1, #1 ; bit = v >> n + mov r4, r7, lsl #7 ; ((range-1) * 128) + + mov r7, #1 + add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8) + + addcs r2, r2, r4 ; if (bit) lowvalue += split + subcs r4, r5, r4 ; if (bit) range = range-split + + ; Counting the leading zeros is used to normalize range. + clz r6, r4 + sub r6, r6, #24 ; shift + + ; Flag is set on the sum of count. 
This flag is used later + ; to determine if count >= 0 + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi token_count_lt_zero_ev ; if(count >= 0) + + sub r6, r6, r3 ; offset = shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl token_high_bit_not_set_ev + + ldr r4, [r0, #vp9_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos-1 + b token_zero_while_start_ev +token_zero_while_loop_ev + mov r9, #0 + strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +token_zero_while_start_ev + cmp r4, #0 + ldrge r7, [r0, #vp9_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq token_zero_while_loop_ev + + ldr r7, [r0, #vp9_writer_buffer] + ldrb r9, [r7, r4] ; w->buffer[x] + add r9, r9, #1 + strb r9, [r7, r4] ; w->buffer[x] + 1 +token_high_bit_not_set_ev + rsb r4, r6, #24 ; 24-offset + ldr r9, [r0, #vp9_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp9_writer_pos] ; w->pos + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp9_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r9, r4] ; w->buffer[w->pos++] + +token_count_lt_zero_ev + lsl r2, r2, r6 ; lowvalue <<= shift + + subs r10, r10, #1 + bne encode_value_loop + + str r2, [r0, #vp9_writer_lowvalue] + str r5, [r0, #vp9_writer_range] + str r3, [r0, #vp9_writer_count] + pop {r4-r11, pc} + ENDP + + END diff --cc vp9/encoder/arm/armv5te/vp8_packtokens_armv5.asm index bf299770b,000000000..9ccbaa6c1 mode 100644,000000..100644 --- a/vp9/encoder/arm/armv5te/vp8_packtokens_armv5.asm +++ b/vp9/encoder/arm/armv5te/vp8_packtokens_armv5.asm @@@ -1,291 -1,0 +1,291 @@@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
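The ARMv5 routines above translate the VP8 boolean encoder line for line, with clz doing the range normalization. One encode step, written out as the C that the asm comments describe (a sketch; vp9_writer_sketch and the clz intrinsic are illustrative, and prob is 128 for the flush bits written by vp8_stop_encode):

    typedef struct {
        unsigned lowvalue, range;
        int count, pos;
        unsigned char *buffer;
    } vp9_writer_sketch;

    static void encode_bool(vp9_writer_sketch *w, int bit, int prob) {
        unsigned split = 1 + (((w->range - 1) * prob) >> 8);
        int shift, offset, x;

        if (bit) { w->lowvalue += split; w->range -= split; }
        else     { w->range = split; }

        shift = __builtin_clz(w->range) - 24;   /* renormalize */
        w->range <<= shift;
        w->count += shift;
        if (w->count >= 0) {                    /* a whole byte is ready */
            offset = shift - w->count;
            if ((w->lowvalue << (offset - 1)) & 0x80000000) {
                x = w->pos - 1;                 /* carry ripples back    */
                while (x >= 0 && w->buffer[x] == 0xff)
                    w->buffer[x--] = 0;
                w->buffer[x] += 1;              /* through any 0xff run  */
            }
            w->buffer[w->pos++] = (w->lowvalue >> (24 - offset)) & 0xff;
            w->lowvalue = (w->lowvalue << offset) & 0xffffff;
            shift = w->count;
            w->count -= 8;
        }
        w->lowvalue <<= shift;
    }

vp8_encode_value simply feeds the bits of a fixed-width value through this step with prob = 128, most significant bit first.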
+; + + + EXPORT |vp8cx_pack_tokens_armv5| + - INCLUDE asm_enc_offsets.asm ++ INCLUDE vp9_asm_enc_offsets.asm + + ARM + REQUIRE8 + PRESERVE8 + + AREA |.text|, CODE, READONLY + +; r0 vp9_writer *w +; r1 const TOKENEXTRA *p +; r2 int xcount +; r3 vp8_coef_encodings +; s0 vp8_extra_bits +; s1 vp8_coef_tree +|vp8cx_pack_tokens_armv5| PROC + push {r4-r11, lr} + + ; Add size of xcount * sizeof (TOKENEXTRA) to get stop + ; sizeof (TOKENEXTRA) is 8 + sub sp, sp, #12 + add r2, r1, r2, lsl #3 ; stop = p + xcount*sizeof(TOKENEXTRA) + str r2, [sp, #0] + str r3, [sp, #8] ; save vp8_coef_encodings + ldr r2, [r0, #vp9_writer_lowvalue] + ldr r5, [r0, #vp9_writer_range] + ldr r3, [r0, #vp9_writer_count] + b check_p_lt_stop + +while_p_lt_stop + ldrb r6, [r1, #tokenextra_token] ; t + ldr r4, [sp, #8] ; vp8_coef_encodings + mov lr, #0 + add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t + ldr r9, [r1, #tokenextra_context_tree] ; pp + + ldrb r7, [r1, #tokenextra_skip_eob_node] + + ldr r6, [r4, #vp9_token_value] ; v + ldr r8, [r4, #vp9_token_len] ; n + + ; vp8 specific skip_eob_node + cmp r7, #0 + movne lr, #2 ; i = 2 + subne r8, r8, #1 ; --n + + rsb r4, r8, #32 ; 32-n + ldr r10, [sp, #52] ; vp8_coef_tree + + ; v is kept in r12 during the token pack loop + lsl r12, r6, r4 ; r12 = v << 32 - n + +; loop start +token_loop + ldrb r4, [r9, lr, asr #1] ; pp [i>>1] + sub r7, r5, #1 ; range-1 + + ; Decisions are made based on the bit value shifted + ; off of v, so set a flag here based on this. + ; This value is refered to as "bb" + lsls r12, r12, #1 ; bb = v >> n + mul r6, r4, r7 ; ((range-1) * pp[i>>1])) + + ; bb can only be 0 or 1. So only execute this statement + ; if bb == 1, otherwise it will act like i + 0 + addcs lr, lr, #1 ; i + bb + + mov r7, #1 + ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb] + add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8) + + addcs r2, r2, r4 ; if (bb) lowvalue += split + subcs r4, r5, r4 ; if (bb) range = range-split + + ; Counting the leading zeros is used to normalize range. + clz r6, r4 + sub r6, r6, #24 ; shift + + ; Flag is set on the sum of count. This flag is used later + ; to determine if count >= 0 + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi token_count_lt_zero ; if(count >= 0) + + sub r6, r6, r3 ; offset = shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl token_high_bit_not_set + + ldr r4, [r0, #vp9_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos-1 + b token_zero_while_start +token_zero_while_loop + mov r10, #0 + strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +token_zero_while_start + cmp r4, #0 + ldrge r7, [r0, #vp9_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq token_zero_while_loop + + ldr r7, [r0, #vp9_writer_buffer] + ldrb r10, [r7, r4] ; w->buffer[x] + add r10, r10, #1 + strb r10, [r7, r4] ; w->buffer[x] + 1 +token_high_bit_not_set + rsb r4, r6, #24 ; 24-offset + ldr r10, [r0, #vp9_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp9_writer_pos] ; w->pos + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp9_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r10, r4] ; w->buffer[w->pos++] + + ; r10 is used earlier in the loop, but r10 is used as + ; temp variable here. 
So after r10 is used, reload + ; vp8_coef_tree_dcd into r10 + ldr r10, [sp, #52] ; vp8_coef_tree + +token_count_lt_zero + lsl r2, r2, r6 ; lowvalue <<= shift + + subs r8, r8, #1 ; --n + bne token_loop + + ldrb r6, [r1, #tokenextra_token] ; t + ldr r7, [sp, #48] ; vp8_extra_bits + ; Add t * sizeof (vp9_extra_bit_struct) to get the desired + ; element. Here vp9_extra_bit_struct == 16 + add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t + + ldr r4, [r12, #vp9_extra_bit_struct_base_val] + cmp r4, #0 + beq skip_extra_bits + +; if( b->base_val) + ldr r8, [r12, #vp9_extra_bit_struct_len] ; L + ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra + cmp r8, #0 ; if( L) + beq no_extra_bits + + ldr r9, [r12, #vp9_extra_bit_struct_prob] + asr r7, lr, #1 ; v=e>>1 + + ldr r10, [r12, #vp9_extra_bit_struct_tree] + str r10, [sp, #4] ; b->tree + + rsb r4, r8, #32 + lsl r12, r7, r4 + + mov lr, #0 ; i = 0 + +extra_bits_loop + ldrb r4, [r9, lr, asr #1] ; pp[i>>1] + sub r7, r5, #1 ; range-1 + lsls r12, r12, #1 ; v >> n + mul r6, r4, r7 ; (range-1) * pp[i>>1] + addcs lr, lr, #1 ; i + bb + + mov r7, #1 + ldrsb lr, [r10, lr] ; i = b->tree[i+bb] + add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8) + + addcs r2, r2, r4 ; if (bb) lowvalue += split + subcs r4, r5, r4 ; if (bb) range = range-split + + clz r6, r4 + sub r6, r6, #24 + + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi extra_count_lt_zero ; if(count >= 0) + + sub r6, r6, r3 ; offset= shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl extra_high_bit_not_set + + ldr r4, [r0, #vp9_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos - 1 + b extra_zero_while_start +extra_zero_while_loop + mov r10, #0 + strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +extra_zero_while_start + cmp r4, #0 + ldrge r7, [r0, #vp9_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq extra_zero_while_loop + + ldr r7, [r0, #vp9_writer_buffer] + ldrb r10, [r7, r4] + add r10, r10, #1 + strb r10, [r7, r4] +extra_high_bit_not_set + rsb r4, r6, #24 ; 24-offset + ldr r10, [r0, #vp9_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp9_writer_pos] + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp9_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset)) + ldr r10, [sp, #4] ; b->tree +extra_count_lt_zero + lsl r2, r2, r6 + + subs r8, r8, #1 ; --n + bne extra_bits_loop ; while (n) + +no_extra_bits + ldr lr, [r1, #4] ; e = p->Extra + add r4, r5, #1 ; range + 1 + tst lr, #1 + lsr r4, r4, #1 ; split = (range + 1) >> 1 + addne r2, r2, r4 ; lowvalue += split + subne r4, r5, r4 ; range = range-split + tst r2, #0x80000000 ; lowvalue & 0x80000000 + lsl r5, r4, #1 ; range <<= 1 + beq end_high_bit_not_set + + ldr r4, [r0, #vp9_writer_pos] + mov r7, #0 + sub r4, r4, #1 + b end_zero_while_start +end_zero_while_loop + strb r7, [r6, r4] + sub r4, r4, #1 ; x-- +end_zero_while_start + cmp r4, #0 + ldrge r6, [r0, #vp9_writer_buffer] + ldrb r12, [r6, r4] + cmpge r12, #0xff + beq end_zero_while_loop + + ldr r6, [r0, #vp9_writer_buffer] + ldrb r7, [r6, r4] + add r7, r7, #1 + strb r7, [r6, r4] +end_high_bit_not_set + adds r3, r3, #1 ; ++count + lsl r2, r2, #1 ; lowvalue <<= 1 + bne end_count_zero + + ldr r4, [r0, #vp9_writer_pos] + mvn r3, #7 + ldr r7, [r0, #vp9_writer_buffer] + lsr r6, r2, #24 ; lowvalue 
>> 24
+    add r12, r4, #1             ; w->pos++
+    bic r2, r2, #0xff000000     ; lowvalue &= 0xffffff
+    str r12, [r0, #0x10]
+    strb r6, [r7, r4]
+end_count_zero
+skip_extra_bits
+    add r1, r1, #TOKENEXTRA_SZ  ; ++p
+check_p_lt_stop
+    ldr r4, [sp, #0]            ; stop
+    cmp r1, r4                  ; while( p < stop)
+    bcc while_p_lt_stop
+
+    str r2, [r0, #vp9_writer_lowvalue]
+    str r5, [r0, #vp9_writer_range]
+    str r3, [r0, #vp9_writer_count]
+    add sp, sp, #12
+    pop {r4-r11, pc}
+    ENDP
+
+    END
diff --cc vp9/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
index a1c647d6c,000000000..0938ce1a3
mode 100644,000000..100644
--- a/vp9/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
+++ b/vp9/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
@@@ -1,327 -1,0 +1,327 @@@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT |vp8cx_pack_mb_row_tokens_armv5|
+
- INCLUDE asm_enc_offsets.asm
++ INCLUDE vp9_asm_enc_offsets.asm
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA |.text|, CODE, READONLY
+
+; r0 VP8_COMP *cpi
+; r1 vp9_writer *w
+; r2 vp8_coef_encodings
+; r3 vp8_extra_bits
+; s0 vp8_coef_tree
+
+|vp8cx_pack_mb_row_tokens_armv5| PROC
+    push {r4-r11, lr}
+    sub sp, sp, #24
+
+    ; Compute address of cpi->common.mb_rows
+    ldr r4, _VP8_COMP_common_
+    ldr r6, _VP8_COMMON_MBrows_
+    add r4, r0, r4
+
+    ldr r5, [r4, r6]            ; load up mb_rows
+
+    str r2, [sp, #20]           ; save vp8_coef_encodings
+    str r5, [sp, #12]           ; save mb_rows
+    str r3, [sp, #8]            ; save vp8_extra_bits
+
+    ldr r4, _VP8_COMP_tplist_
+    add r4, r0, r4
+    ldr r7, [r4, #0]            ; dereference cpi->tp_list
+
+    mov r0, r1                  ; keep same as other loops
+
+    ldr r2, [r0, #vp9_writer_lowvalue]
+    ldr r5, [r0, #vp9_writer_range]
+    ldr r3, [r0, #vp9_writer_count]
+
+mb_row_loop
+
+    ldr r1, [r7, #tokenlist_start]
+    ldr r9, [r7, #tokenlist_stop]
+    str r9, [sp, #0]            ; save stop for later comparison
+    str r7, [sp, #16]           ; tokenlist address for next time
+
+    b check_p_lt_stop
+
+    ; actual work gets done here!
+
+while_p_lt_stop
+    ldrb r6, [r1, #tokenextra_token]        ; t
+    ldr r4, [sp, #20]           ; vp8_coef_encodings
+    mov lr, #0
+    add r4, r4, r6, lsl #3      ; a = vp8_coef_encodings + t
+    ldr r9, [r1, #tokenextra_context_tree]  ; pp
+
+    ldrb r7, [r1, #tokenextra_skip_eob_node]
+
+    ldr r6, [r4, #vp9_token_value]  ; v
+    ldr r8, [r4, #vp9_token_len]    ; n
+
+    ; vp8 specific skip_eob_node
+    cmp r7, #0
+    movne lr, #2                ; i = 2
+    subne r8, r8, #1            ; --n
+
+    rsb r4, r8, #32             ; 32-n
+    ldr r10, [sp, #60]          ; vp8_coef_tree
+
+    ; v is kept in r12 during the token pack loop
+    lsl r12, r6, r4             ; r12 = v << 32 - n
+
+; loop start
+token_loop
+    ldrb r4, [r9, lr, asr #1]   ; pp [i>>1]
+    sub r7, r5, #1              ; range-1
+
+    ; Decisions are made based on the bit value shifted
+    ; off of v, so set a flag here based on this.
+    ; This value is referred to as "bb"
+    lsls r12, r12, #1           ; bb = v >> n
+    mul r6, r4, r7              ; ((range-1) * pp[i>>1])
+
+    ; bb can only be 0 or 1.
So only execute this statement + ; if bb == 1, otherwise it will act like i + 0 + addcs lr, lr, #1 ; i + bb + + mov r7, #1 + ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb] + add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8) + + addcs r2, r2, r4 ; if (bb) lowvalue += split + subcs r4, r5, r4 ; if (bb) range = range-split + + ; Counting the leading zeros is used to normalize range. + clz r6, r4 + sub r6, r6, #24 ; shift + + ; Flag is set on the sum of count. This flag is used later + ; to determine if count >= 0 + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi token_count_lt_zero ; if(count >= 0) + + sub r6, r6, r3 ; offset = shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl token_high_bit_not_set + + ldr r4, [r0, #vp9_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos-1 + b token_zero_while_start +token_zero_while_loop + mov r10, #0 + strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +token_zero_while_start + cmp r4, #0 + ldrge r7, [r0, #vp9_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq token_zero_while_loop + + ldr r7, [r0, #vp9_writer_buffer] + ldrb r10, [r7, r4] ; w->buffer[x] + add r10, r10, #1 + strb r10, [r7, r4] ; w->buffer[x] + 1 +token_high_bit_not_set + rsb r4, r6, #24 ; 24-offset + ldr r10, [r0, #vp9_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp9_writer_pos] ; w->pos + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp9_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r10, r4] ; w->buffer[w->pos++] + + ; r10 is used earlier in the loop, but r10 is used as + ; temp variable here. So after r10 is used, reload + ; vp8_coef_tree_dcd into r10 + ldr r10, [sp, #60] ; vp8_coef_tree + +token_count_lt_zero + lsl r2, r2, r6 ; lowvalue <<= shift + + subs r8, r8, #1 ; --n + bne token_loop + + ldrb r6, [r1, #tokenextra_token] ; t + ldr r7, [sp, #8] ; vp8_extra_bits + ; Add t * sizeof (vp9_extra_bit_struct) to get the desired + ; element. 
Here vp9_extra_bit_struct == 16 + add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t + + ldr r4, [r12, #vp9_extra_bit_struct_base_val] + cmp r4, #0 + beq skip_extra_bits + +; if( b->base_val) + ldr r8, [r12, #vp9_extra_bit_struct_len] ; L + ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra + cmp r8, #0 ; if( L) + beq no_extra_bits + + ldr r9, [r12, #vp9_extra_bit_struct_prob] + asr r7, lr, #1 ; v=e>>1 + + ldr r10, [r12, #vp9_extra_bit_struct_tree] + str r10, [sp, #4] ; b->tree + + rsb r4, r8, #32 + lsl r12, r7, r4 + + mov lr, #0 ; i = 0 + +extra_bits_loop + ldrb r4, [r9, lr, asr #1] ; pp[i>>1] + sub r7, r5, #1 ; range-1 + lsls r12, r12, #1 ; v >> n + mul r6, r4, r7 ; (range-1) * pp[i>>1] + addcs lr, lr, #1 ; i + bb + + mov r7, #1 + ldrsb lr, [r10, lr] ; i = b->tree[i+bb] + add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8) + + addcs r2, r2, r4 ; if (bb) lowvalue += split + subcs r4, r5, r4 ; if (bb) range = range-split + + clz r6, r4 + sub r6, r6, #24 + + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi extra_count_lt_zero ; if(count >= 0) + + sub r6, r6, r3 ; offset= shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl extra_high_bit_not_set + + ldr r4, [r0, #vp9_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos - 1 + b extra_zero_while_start +extra_zero_while_loop + mov r10, #0 + strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +extra_zero_while_start + cmp r4, #0 + ldrge r7, [r0, #vp9_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq extra_zero_while_loop + + ldr r7, [r0, #vp9_writer_buffer] + ldrb r10, [r7, r4] + add r10, r10, #1 + strb r10, [r7, r4] +extra_high_bit_not_set + rsb r4, r6, #24 ; 24-offset + ldr r10, [r0, #vp9_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp9_writer_pos] + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp9_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset)) + ldr r10, [sp, #4] ; b->tree +extra_count_lt_zero + lsl r2, r2, r6 + + subs r8, r8, #1 ; --n + bne extra_bits_loop ; while (n) + +no_extra_bits + ldr lr, [r1, #4] ; e = p->Extra + add r4, r5, #1 ; range + 1 + tst lr, #1 + lsr r4, r4, #1 ; split = (range + 1) >> 1 + addne r2, r2, r4 ; lowvalue += split + subne r4, r5, r4 ; range = range-split + tst r2, #0x80000000 ; lowvalue & 0x80000000 + lsl r5, r4, #1 ; range <<= 1 + beq end_high_bit_not_set + + ldr r4, [r0, #vp9_writer_pos] + mov r7, #0 + sub r4, r4, #1 + b end_zero_while_start +end_zero_while_loop + strb r7, [r6, r4] + sub r4, r4, #1 ; x-- +end_zero_while_start + cmp r4, #0 + ldrge r6, [r0, #vp9_writer_buffer] + ldrb r12, [r6, r4] + cmpge r12, #0xff + beq end_zero_while_loop + + ldr r6, [r0, #vp9_writer_buffer] + ldrb r7, [r6, r4] + add r7, r7, #1 + strb r7, [r6, r4] +end_high_bit_not_set + adds r3, r3, #1 ; ++count + lsl r2, r2, #1 ; lowvalue <<= 1 + bne end_count_zero + + ldr r4, [r0, #vp9_writer_pos] + mvn r3, #7 + ldr r7, [r0, #vp9_writer_buffer] + lsr r6, r2, #24 ; lowvalue >> 24 + add r12, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r12, [r0, #0x10] + strb r6, [r7, r4] +end_count_zero +skip_extra_bits + add r1, r1, #TOKENEXTRA_SZ ; ++p +check_p_lt_stop + ldr r4, [sp, #0] ; stop + cmp r1, r4 ; while( p < stop) + bcc while_p_lt_stop + + ldr r6, [sp, #12] ; mb_rows + ldr r7, [sp, #16] ; 
tokenlist address + subs r6, r6, #1 + add r7, r7, #TOKENLIST_SZ ; next element in the array + str r6, [sp, #12] + bne mb_row_loop + + str r2, [r0, #vp9_writer_lowvalue] + str r5, [r0, #vp9_writer_range] + str r3, [r0, #vp9_writer_count] + add sp, sp, #24 + pop {r4-r11, pc} + ENDP + +_VP8_COMP_common_ + DCD vp8_comp_common +_VP8_COMMON_MBrows_ + DCD vp8_common_mb_rows +_VP8_COMP_tplist_ + DCD vp8_comp_tplist + + END diff --cc vp9/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm index 86c2feb4a,000000000..4611b407d mode 100644,000000..100644 --- a/vp9/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm +++ b/vp9/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm @@@ -1,465 -1,0 +1,465 @@@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8cx_pack_tokens_into_partitions_armv5| + - INCLUDE asm_enc_offsets.asm ++ INCLUDE vp9_asm_enc_offsets.asm + + ARM + REQUIRE8 + PRESERVE8 + + AREA |.text|, CODE, READONLY + +; r0 VP8_COMP *cpi +; r1 unsigned char *cx_data +; r2 int num_part +; r3 *size +; s0 vp8_coef_encodings +; s1 vp8_extra_bits, +; s2 const vp9_tree_index *, + +|vp8cx_pack_tokens_into_partitions_armv5| PROC + push {r4-r11, lr} + sub sp, sp, #44 + + ; Compute address of cpi->common.mb_rows + ldr r4, _VP8_COMP_common_ + ldr r6, _VP8_COMMON_MBrows_ + add r4, r0, r4 + + ldr r5, [r4, r6] ; load up mb_rows + + str r5, [sp, #36] ; save mb_rows + str r1, [sp, #24] ; save cx_data + str r2, [sp, #20] ; save num_part + str r3, [sp, #8] ; save *size + + ; *size = 3*(num_part -1 ); + sub r2, r2, #1 ; num_part - 1 + add r2, r2, r2, lsl #1 ; 3*(num_part - 1) + str r2, [r3] + + add r2, r2, r1 ; cx_data + *size + str r2, [sp, #40] ; ptr + + ldr r4, _VP8_COMP_tplist_ + add r4, r0, r4 + ldr r7, [r4, #0] ; dereference cpi->tp_list + str r7, [sp, #32] ; store start of cpi->tp_list + + ldr r11, _VP8_COMP_bc2_ ; load up vp9_writer out of cpi + add r0, r0, r11 + + mov r11, #0 + str r11, [sp, #28] ; i + +numparts_loop + ldr r10, [sp, #40] ; ptr + ldr r5, [sp, #36] ; move mb_rows to the counting section + sub r5, r5, r11 ; move start point with each partition + ; mb_rows starts at i + str r5, [sp, #12] + + ; Reset all of the VP8 Writer data for each partition that + ; is processed. + ; start_encode + mov r2, #0 ; vp9_writer_lowvalue + mov r5, #255 ; vp9_writer_range + mvn r3, #23 ; vp9_writer_count + + str r2, [r0, #vp9_writer_value] + str r2, [r0, #vp9_writer_pos] + str r10, [r0, #vp9_writer_buffer] + +mb_row_loop + + ldr r1, [r7, #tokenlist_start] + ldr r9, [r7, #tokenlist_stop] + str r9, [sp, #0] ; save stop for later comparison + str r7, [sp, #16] ; tokenlist address for next time + + b check_p_lt_stop + + ; actual work gets done here! 
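+ ; For orientation before the loop: this is a register-level
+ ; rendering of one VP8 boolean-encoder step per tree bit.
+ ; Roughly, each iteration of token_loop computes the
+ ; following (a hedged C sketch of the flow, not code copied
+ ; from the tree; "bb" is the tree bit being written):
+ ;
+ ;   split = 1 + (((range - 1) * pp[i >> 1]) >> 8);
+ ;   if (bb) { lowvalue += split; range -= split; }
+ ;   else    { range = split; }
+ ;   shift = clz(range) - 24;           /* renormalize */
+ ;   range <<= shift; count += shift;
+ ;   if (count >= 0) {                  /* flush one byte */
+ ;     offset = shift - count;
+ ;     /* propagate carry through any 0xff bytes first */
+ ;     w->buffer[w->pos++] = lowvalue >> (24 - offset);
+ ;     lowvalue = (lowvalue << offset) & 0xffffff;
+ ;     shift = count; count -= 8;
+ ;   }
+ ;   lowvalue <<= shift;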
+ +while_p_lt_stop + ldrb r6, [r1, #tokenextra_token] ; t + ldr r4, [sp, #80] ; vp8_coef_encodings + mov lr, #0 + add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t + ldr r9, [r1, #tokenextra_context_tree] ; pp + + ldrb r7, [r1, #tokenextra_skip_eob_node] + + ldr r6, [r4, #vp9_token_value] ; v + ldr r8, [r4, #vp9_token_len] ; n + + ; vp8 specific skip_eob_node + cmp r7, #0 + movne lr, #2 ; i = 2 + subne r8, r8, #1 ; --n + + rsb r4, r8, #32 ; 32-n + ldr r10, [sp, #88] ; vp8_coef_tree + + ; v is kept in r12 during the token pack loop + lsl r12, r6, r4 ; r12 = v << 32 - n + +; loop start +token_loop + ldrb r4, [r9, lr, asr #1] ; pp[i>>1] + sub r7, r5, #1 ; range-1 + + ; Decisions are made based on the bit value shifted + ; off of v, so set a flag here based on this. + ; This value is referred to as "bb" + lsls r12, r12, #1 ; bb = v >> n + mul r6, r4, r7 ; (range-1) * pp[i>>1] + + ; bb can only be 0 or 1. So only execute this statement + ; if bb == 1, otherwise it will act like i + 0 + addcs lr, lr, #1 ; i + bb + + mov r7, #1 + ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb] + add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8) + + addcs r2, r2, r4 ; if (bb) lowvalue += split + subcs r4, r5, r4 ; if (bb) range = range-split + + ; Counting the leading zeros is used to normalize range. + clz r6, r4 + sub r6, r6, #24 ; shift + + ; Flag is set on the sum of count. This flag is used later + ; to determine if count >= 0 + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi token_count_lt_zero ; if(count >= 0) + + sub r6, r6, r3 ; offset = shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl token_high_bit_not_set + + ldr r4, [r0, #vp9_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos-1 + b token_zero_while_start +token_zero_while_loop + mov r10, #0 + strb r10, [r7, r4] ; w->buffer[x] = (unsigned char)0 + sub r4, r4, #1 ; x-- +token_zero_while_start + cmp r4, #0 + ldrge r7, [r0, #vp9_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq token_zero_while_loop + + ldr r7, [r0, #vp9_writer_buffer] + ldrb r10, [r7, r4] ; w->buffer[x] + add r10, r10, #1 + strb r10, [r7, r4] ; w->buffer[x] + 1 +token_high_bit_not_set + rsb r4, r6, #24 ; 24-offset + ldr r10, [r0, #vp9_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp9_writer_pos] ; w->pos + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp9_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r10, r4] ; w->buffer[w->pos++] + + ; r10 is used earlier in the loop, but r10 is used as + ; temp variable here. So after r10 is used, reload + ; vp8_coef_tree_dcd into r10 + ldr r10, [sp, #88] ; vp8_coef_tree + +token_count_lt_zero + lsl r2, r2, r6 ; lowvalue <<= shift + + subs r8, r8, #1 ; --n + bne token_loop + + ldrb r6, [r1, #tokenextra_token] ; t + ldr r7, [sp, #84] ; vp8_extra_bits + ; Add t * sizeof (vp9_extra_bit_struct) to get the desired + ; element.
Here vp9_extra_bit_struct == 16 + add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t + + ldr r4, [r12, #vp9_extra_bit_struct_base_val] + cmp r4, #0 + beq skip_extra_bits + +; if( b->base_val) + ldr r8, [r12, #vp9_extra_bit_struct_len] ; L + ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra + cmp r8, #0 ; if( L) + beq no_extra_bits + + ldr r9, [r12, #vp9_extra_bit_struct_prob] + asr r7, lr, #1 ; v=e>>1 + + ldr r10, [r12, #vp9_extra_bit_struct_tree] + str r10, [sp, #4] ; b->tree + + rsb r4, r8, #32 + lsl r12, r7, r4 + + mov lr, #0 ; i = 0 + +extra_bits_loop + ldrb r4, [r9, lr, asr #1] ; pp[i>>1] + sub r7, r5, #1 ; range-1 + lsls r12, r12, #1 ; v >> n + mul r6, r4, r7 ; (range-1) * pp[i>>1] + addcs lr, lr, #1 ; i + bb + + mov r7, #1 + ldrsb lr, [r10, lr] ; i = b->tree[i+bb] + add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8) + + addcs r2, r2, r4 ; if (bb) lowvalue += split + subcs r4, r5, r4 ; if (bb) range = range-split + + clz r6, r4 + sub r6, r6, #24 + + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi extra_count_lt_zero ; if(count >= 0) + + sub r6, r6, r3 ; offset= shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl extra_high_bit_not_set + + ldr r4, [r0, #vp9_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos - 1 + b extra_zero_while_start +extra_zero_while_loop + mov r10, #0 + strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +extra_zero_while_start + cmp r4, #0 + ldrge r7, [r0, #vp9_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq extra_zero_while_loop + + ldr r7, [r0, #vp9_writer_buffer] + ldrb r10, [r7, r4] + add r10, r10, #1 + strb r10, [r7, r4] +extra_high_bit_not_set + rsb r4, r6, #24 ; 24-offset + ldr r10, [r0, #vp9_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp9_writer_pos] + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp9_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset)) + ldr r10, [sp, #4] ; b->tree +extra_count_lt_zero + lsl r2, r2, r6 + + subs r8, r8, #1 ; --n + bne extra_bits_loop ; while (n) + +no_extra_bits + ldr lr, [r1, #4] ; e = p->Extra + add r4, r5, #1 ; range + 1 + tst lr, #1 + lsr r4, r4, #1 ; split = (range + 1) >> 1 + addne r2, r2, r4 ; lowvalue += split + subne r4, r5, r4 ; range = range-split + tst r2, #0x80000000 ; lowvalue & 0x80000000 + lsl r5, r4, #1 ; range <<= 1 + beq end_high_bit_not_set + + ldr r4, [r0, #vp9_writer_pos] + mov r7, #0 + sub r4, r4, #1 + b end_zero_while_start +end_zero_while_loop + strb r7, [r6, r4] + sub r4, r4, #1 ; x-- +end_zero_while_start + cmp r4, #0 + ldrge r6, [r0, #vp9_writer_buffer] + ldrb r12, [r6, r4] + cmpge r12, #0xff + beq end_zero_while_loop + + ldr r6, [r0, #vp9_writer_buffer] + ldrb r7, [r6, r4] + add r7, r7, #1 + strb r7, [r6, r4] +end_high_bit_not_set + adds r3, r3, #1 ; ++count + lsl r2, r2, #1 ; lowvalue <<= 1 + bne end_count_zero + + ldr r4, [r0, #vp9_writer_pos] + mvn r3, #7 + ldr r7, [r0, #vp9_writer_buffer] + lsr r6, r2, #24 ; lowvalue >> 24 + add r12, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r12, [r0, #0x10] + strb r6, [r7, r4] +end_count_zero +skip_extra_bits + add r1, r1, #TOKENEXTRA_SZ ; ++p +check_p_lt_stop + ldr r4, [sp, #0] ; stop + cmp r1, r4 ; while( p < stop) + bcc while_p_lt_stop + + ldr r10, [sp, #20] ; num_parts + mov r1, #TOKENLIST_SZ + 
mul r1, r10, r1 + + ldr r6, [sp, #12] ; mb_rows + ldr r7, [sp, #16] ; tokenlist address + subs r6, r6, r10 + add r7, r7, r1 ; next element in the array + str r6, [sp, #12] + bgt mb_row_loop + + mov r12, #32 + +stop_encode_loop + sub r7, r5, #1 ; range-1 + + mov r4, r7, lsl #7 ; ((range-1) * 128) + + mov r7, #1 + add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8) + + ; Counting the leading zeros is used to normalize range. + clz r6, r4 + sub r6, r6, #24 ; shift + + ; Flag is set on the sum of count. This flag is used later + ; to determine if count >= 0 + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi token_count_lt_zero_se ; if(count >= 0) + + sub r6, r6, r3 ; offset = shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl token_high_bit_not_set_se + + ldr r4, [r0, #vp9_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos-1 + b token_zero_while_start_se +token_zero_while_loop_se + mov r10, #0 + strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +token_zero_while_start_se + cmp r4, #0 + ldrge r7, [r0, #vp9_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq token_zero_while_loop_se + + ldr r7, [r0, #vp9_writer_buffer] + ldrb r10, [r7, r4] ; w->buffer[x] + add r10, r10, #1 + strb r10, [r7, r4] ; w->buffer[x] + 1 +token_high_bit_not_set_se + rsb r4, r6, #24 ; 24-offset + ldr r10, [r0, #vp9_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp9_writer_pos] ; w->pos + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp9_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r10, r4] ; w->buffer[w->pos++] + +token_count_lt_zero_se + lsl r2, r2, r6 ; lowvalue <<= shift + + subs r12, r12, #1 + bne stop_encode_loop + + ldr r10, [sp, #8] ; *size + ldr r11, [r10] + ldr r4, [r0, #vp9_writer_pos] ; w->pos + add r11, r11, r4 ; *size += w->pos + str r11, [r10] + + ldr r9, [sp, #20] ; num_parts + sub r9, r9, #1 + ldr r10, [sp, #28] ; i + cmp r10, r9 ; if(i<(num_part - 1)) + bge skip_write_partition + + ldr r12, [sp, #40] ; ptr + add r12, r12, r4 ; ptr += w->pos + str r12, [sp, #40] + + ldr r9, [sp, #24] ; cx_data + mov r8, r4, asr #8 + strb r4, [r9, #0] + strb r8, [r9, #1] + mov r4, r4, asr #16 + strb r4, [r9, #2] + + add r9, r9, #3 ; cx_data += 3 + str r9, [sp, #24] + +skip_write_partition + + ldr r11, [sp, #28] ; i + ldr r10, [sp, #20] ; num_parts + + add r11, r11, #1 ; i++ + str r11, [sp, #28] + + ldr r7, [sp, #32] ; cpi->tp_list[i] + mov r1, #TOKENLIST_SZ + add r7, r7, r1 ; next element in cpi->tp_list + str r7, [sp, #32] ; cpi->tp_list[i+1] + + cmp r10, r11 + bgt numparts_loop + + + add sp, sp, #44 + pop {r4-r11, pc} + ENDP + +_VP8_COMP_common_ + DCD vp8_comp_common +_VP8_COMMON_MBrows_ + DCD vp8_common_mb_rows +_VP8_COMP_tplist_ + DCD vp8_comp_tplist +_VP8_COMP_bc2_ + DCD vp8_comp_bc2 + + END diff --cc vp9/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm index ae2f6030d,000000000..bca74391a mode 100644,000000..100644 --- a/vp9/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm +++ b/vp9/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm @@@ -1,224 -1,0 +1,224 @@@ +; +; Copyright (c) 2011 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. 
All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_fast_quantize_b_armv6| + - INCLUDE asm_enc_offsets.asm ++ INCLUDE vp9_asm_enc_offsets.asm + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 BLOCK *b +; r1 BLOCKD *d +|vp8_fast_quantize_b_armv6| PROC + stmfd sp!, {r1, r4-r11, lr} + + ldr r3, [r0, #vp8_block_coeff] ; coeff + ldr r4, [r0, #vp8_block_quant_fast] ; quant_fast + ldr r5, [r0, #vp8_block_round] ; round + ldr r6, [r1, #vp8_blockd_qcoeff] ; qcoeff + ldr r7, [r1, #vp8_blockd_dqcoeff] ; dqcoeff + ldr r8, [r1, #vp8_blockd_dequant] ; dequant + + ldr r2, loop_count ; loop_count=0x1000000. 'lsls' instruction + ; is used to update the counter so that + ; it can be used to mark nonzero + ; quantized coefficient pairs. + + mov r1, #0 ; flags for quantized coeffs + + ; PART 1: quantization and dequantization loop +loop + ldr r9, [r3], #4 ; [z1 | z0] + ldr r10, [r5], #4 ; [r1 | r0] + ldr r11, [r4], #4 ; [q1 | q0] + + ssat16 lr, #1, r9 ; [sz1 | sz0] + eor r9, r9, lr ; [z1 ^ sz1 | z0 ^ sz0] + ssub16 r9, r9, lr ; x = (z ^ sz) - sz + sadd16 r9, r9, r10 ; [x1+r1 | x0+r0] + + ldr r12, [r3], #4 ; [z3 | z2] + + smulbb r0, r9, r11 ; [(x0+r0)*q0] + smultt r9, r9, r11 ; [(x1+r1)*q1] + + ldr r10, [r5], #4 ; [r3 | r2] + + ssat16 r11, #1, r12 ; [sz3 | sz2] + eor r12, r12, r11 ; [z3 ^ sz3 | z2 ^ sz2] + pkhtb r0, r9, r0, asr #16 ; [y1 | y0] + ldr r9, [r4], #4 ; [q3 | q2] + ssub16 r12, r12, r11 ; x = (z ^ sz) - sz + + sadd16 r12, r12, r10 ; [x3+r3 | x2+r2] + + eor r0, r0, lr ; [(y1 ^ sz1) | (y0 ^ sz0)] + + smulbb r10, r12, r9 ; [(x2+r2)*q2] + smultt r12, r12, r9 ; [(x3+r3)*q3] + + ssub16 r0, r0, lr ; x = (y ^ sz) - sz + + cmp r0, #0 ; check if zero + orrne r1, r1, r2, lsr #24 ; add flag for nonzero coeffs + + str r0, [r6], #4 ; *qcoeff++ = x + ldr r9, [r8], #4 ; [dq1 | dq0] + + pkhtb r10, r12, r10, asr #16 ; [y3 | y2] + eor r10, r10, r11 ; [(y3 ^ sz3) | (y2 ^ sz2)] + ssub16 r10, r10, r11 ; x = (y ^ sz) - sz + + cmp r10, #0 ; check if zero + orrne r1, r1, r2, lsr #23 ; add flag for nonzero coeffs + + str r10, [r6], #4 ; *qcoeff++ = x + ldr r11, [r8], #4 ; [dq3 | dq2] + + smulbb r12, r0, r9 ; [x0*dq0] + smultt r0, r0, r9 ; [x1*dq1] + + smulbb r9, r10, r11 ; [x2*dq2] + smultt r10, r10, r11 ; [x3*dq3] + + lsls r2, r2, #2 ; update loop counter + strh r12, [r7, #0] ; dqcoeff[0] = [x0*dq0] + strh r0, [r7, #2] ; dqcoeff[1] = [x1*dq1] + strh r9, [r7, #4] ; dqcoeff[2] = [x2*dq2] + strh r10, [r7, #6] ; dqcoeff[3] = [x3*dq3] + add r7, r7, #8 ; dqcoeff += 8 + bne loop + + ; PART 2: check position for eob... + mov lr, #0 ; init eob + cmp r1, #0 ; coeffs after quantization? 
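+ ; r1 now carries one flag bit per pair of quantized
+ ; coefficients that came out nonzero in PART 1 (set from the
+ ; shifting loop counter). The tst/bne ladder below uses the
+ ; flags to probe qcoeff in reverse zig-zag order, so eob is
+ ; found without rescanning all 16 values; conceptually (a
+ ; hedged C sketch, scan table implied by the rc/i notes on
+ ; the loads below):
+ ;
+ ;   eob = 0;
+ ;   for (i = 15; i >= 0; i--) {
+ ;     rc = zig_zag1d[i];
+ ;     if (qcoeff[rc]) { eob = i + 1; break; }
+ ;   }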
+ ldr r11, [sp, #0] ; restore BLOCKD pointer + beq end ; skip eob calculations if all zero + + ldr r0, [r11, #vp8_blockd_qcoeff] + + ; check shortcut for nonzero qcoeffs + tst r1, #0x80 + bne quant_coeff_15_14 + tst r1, #0x20 + bne quant_coeff_13_11 + tst r1, #0x8 + bne quant_coeff_12_7 + tst r1, #0x40 + bne quant_coeff_10_9 + tst r1, #0x10 + bne quant_coeff_8_3 + tst r1, #0x2 + bne quant_coeff_6_5 + tst r1, #0x4 + bne quant_coeff_4_2 + b quant_coeff_1_0 + +quant_coeff_15_14 + ldrh r2, [r0, #30] ; rc=15, i=15 + mov lr, #16 + cmp r2, #0 + bne end + + ldrh r3, [r0, #28] ; rc=14, i=14 + mov lr, #15 + cmp r3, #0 + bne end + +quant_coeff_13_11 + ldrh r2, [r0, #22] ; rc=11, i=13 + mov lr, #14 + cmp r2, #0 + bne end + +quant_coeff_12_7 + ldrh r3, [r0, #14] ; rc=7, i=12 + mov lr, #13 + cmp r3, #0 + bne end + + ldrh r2, [r0, #20] ; rc=10, i=11 + mov lr, #12 + cmp r2, #0 + bne end + +quant_coeff_10_9 + ldrh r3, [r0, #26] ; rc=13, i=10 + mov lr, #11 + cmp r3, #0 + bne end + + ldrh r2, [r0, #24] ; rc=12, i=9 + mov lr, #10 + cmp r2, #0 + bne end + +quant_coeff_8_3 + ldrh r3, [r0, #18] ; rc=9, i=8 + mov lr, #9 + cmp r3, #0 + bne end + + ldrh r2, [r0, #12] ; rc=6, i=7 + mov lr, #8 + cmp r2, #0 + bne end + +quant_coeff_6_5 + ldrh r3, [r0, #6] ; rc=3, i=6 + mov lr, #7 + cmp r3, #0 + bne end + + ldrh r2, [r0, #4] ; rc=2, i=5 + mov lr, #6 + cmp r2, #0 + bne end + +quant_coeff_4_2 + ldrh r3, [r0, #10] ; rc=5, i=4 + mov lr, #5 + cmp r3, #0 + bne end + + ldrh r2, [r0, #16] ; rc=8, i=3 + mov lr, #4 + cmp r2, #0 + bne end + + ldrh r3, [r0, #8] ; rc=4, i=2 + mov lr, #3 + cmp r3, #0 + bne end + +quant_coeff_1_0 + ldrh r2, [r0, #2] ; rc=1, i=1 + mov lr, #2 + cmp r2, #0 + bne end + + mov lr, #1 ; rc=0, i=0 + +end + str lr, [r11, #vp8_blockd_eob] + ldmfd sp!, {r1, r4-r11, pc} + + ENDP + +loop_count + DCD 0x1000000 + + END + diff --cc vp9/encoder/arm/armv6/vp8_subtract_armv6.asm index 0ca74387b,000000000..bb466c4e9 mode 100644,000000..100644 --- a/vp9/encoder/arm/armv6/vp8_subtract_armv6.asm +++ b/vp9/encoder/arm/armv6/vp8_subtract_armv6.asm @@@ -1,265 -1,0 +1,265 @@@ +; +; Copyright (c) 2011 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + EXPORT |vp8_subtract_mby_armv6| + EXPORT |vp8_subtract_mbuv_armv6| + EXPORT |vp8_subtract_b_armv6| + - INCLUDE asm_enc_offsets.asm ++ INCLUDE vp9_asm_enc_offsets.asm + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 BLOCK *be +; r1 BLOCKD *bd +; r2 int pitch +|vp8_subtract_b_armv6| PROC + + stmfd sp!, {r4-r9} + + ldr r4, [r0, #vp8_block_base_src] + ldr r5, [r0, #vp8_block_src] + ldr r6, [r0, #vp8_block_src_diff] + + ldr r3, [r4] + ldr r7, [r0, #vp8_block_src_stride] + add r3, r3, r5 ; src = *base_src + src + ldr r8, [r1, #vp8_blockd_predictor] + + mov r9, #4 ; loop count + +loop_block + + ldr r0, [r3], r7 ; src + ldr r1, [r8], r2 ; pred + + uxtb16 r4, r0 ; [s2 | s0] + uxtb16 r5, r1 ; [p2 | p0] + uxtb16 r0, r0, ror #8 ; [s3 | s1] + uxtb16 r1, r1, ror #8 ; [p3 | p1] + + usub16 r4, r4, r5 ; [d2 | d0] + usub16 r5, r0, r1 ; [d3 | d1] + + subs r9, r9, #1 ; decrement loop counter + + pkhbt r0, r4, r5, lsl #16 ; [d1 | d0] + pkhtb r1, r5, r4, asr #16 ; [d3 | d2] + + str r0, [r6, #0] ; diff + str r1, [r6, #4] ; diff + + add r6, r6, r2, lsl #1 ; update diff pointer + bne loop_block + + ldmfd sp!, {r4-r9} + mov pc, lr + + ENDP + + +; r0 short *diff +; r1 unsigned char *usrc +; r2 unsigned char *vsrc +; r3 unsigned char *pred +; stack int stride +|vp8_subtract_mbuv_armv6| PROC + + stmfd sp!, {r4-r12, lr} + + add r0, r0, #512 ; set *diff point to Cb + add r3, r3, #256 ; set *pred point to Cb + + mov r4, #8 ; loop count + ldr r5, [sp, #40] ; stride + + ; Subtract U block +loop_u + ldr r6, [r1] ; src (A) + ldr r7, [r3], #4 ; pred (A) + + uxtb16 r8, r6 ; [s2 | s0] (A) + uxtb16 r9, r7 ; [p2 | p0] (A) + uxtb16 r10, r6, ror #8 ; [s3 | s1] (A) + uxtb16 r11, r7, ror #8 ; [p3 | p1] (A) + + usub16 r6, r8, r9 ; [d2 | d0] (A) + usub16 r7, r10, r11 ; [d3 | d1] (A) + + ldr r10, [r1, #4] ; src (B) + ldr r11, [r3], #4 ; pred (B) + + pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A) + pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A) + + str r8, [r0], #4 ; diff (A) + uxtb16 r8, r10 ; [s2 | s0] (B) + str r9, [r0], #4 ; diff (A) + + uxtb16 r9, r11 ; [p2 | p0] (B) + uxtb16 r10, r10, ror #8 ; [s3 | s1] (B) + uxtb16 r11, r11, ror #8 ; [p3 | p1] (B) + + usub16 r6, r8, r9 ; [d2 | d0] (B) + usub16 r7, r10, r11 ; [d3 | d1] (B) + + add r1, r1, r5 ; update usrc pointer + + pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B) + pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B) + + str r8, [r0], #4 ; diff (B) + subs r4, r4, #1 ; update loop counter + str r9, [r0], #4 ; diff (B) + + bne loop_u + + mov r4, #8 ; loop count + + ; Subtract V block +loop_v + ldr r6, [r2] ; src (A) + ldr r7, [r3], #4 ; pred (A) + + uxtb16 r8, r6 ; [s2 | s0] (A) + uxtb16 r9, r7 ; [p2 | p0] (A) + uxtb16 r10, r6, ror #8 ; [s3 | s1] (A) + uxtb16 r11, r7, ror #8 ; [p3 | p1] (A) + + usub16 r6, r8, r9 ; [d2 | d0] (A) + usub16 r7, r10, r11 ; [d3 | d1] (A) + + ldr r10, [r2, #4] ; src (B) + ldr r11, [r3], #4 ; pred (B) + + pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A) + pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A) + + str r8, [r0], #4 ; diff (A) + uxtb16 r8, r10 ; [s2 | s0] (B) + str r9, [r0], #4 ; diff (A) + + uxtb16 r9, r11 ; [p2 | p0] (B) + uxtb16 r10, r10, ror #8 ; [s3 | s1] (B) + uxtb16 r11, r11, ror #8 ; [p3 | p1] (B) + + usub16 r6, r8, r9 ; [d2 | d0] (B) + usub16 r7, r10, r11 ; [d3 | d1] (B) + + add r2, r2, r5 ; update vsrc pointer + + pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B) + pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B) + + str r8, [r0], #4 ; diff (B) + subs r4, r4, #1 ; update loop counter + str r9, [r0], #4 ; diff (B) + + bne loop_v + + ldmfd sp!, {r4-r12, 
pc} + + ENDP + + +; r0 short *diff +; r1 unsigned char *src +; r2 unsigned char *pred +; r3 int stride +|vp8_subtract_mby_armv6| PROC + + stmfd sp!, {r4-r11} + + mov r4, #16 +loop + ldr r6, [r1] ; src (A) + ldr r7, [r2], #4 ; pred (A) + + uxtb16 r8, r6 ; [s2 | s0] (A) + uxtb16 r9, r7 ; [p2 | p0] (A) + uxtb16 r10, r6, ror #8 ; [s3 | s1] (A) + uxtb16 r11, r7, ror #8 ; [p3 | p1] (A) + + usub16 r6, r8, r9 ; [d2 | d0] (A) + usub16 r7, r10, r11 ; [d3 | d1] (A) + + ldr r10, [r1, #4] ; src (B) + ldr r11, [r2], #4 ; pred (B) + + pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A) + pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A) + + str r8, [r0], #4 ; diff (A) + uxtb16 r8, r10 ; [s2 | s0] (B) + str r9, [r0], #4 ; diff (A) + + uxtb16 r9, r11 ; [p2 | p0] (B) + uxtb16 r10, r10, ror #8 ; [s3 | s1] (B) + uxtb16 r11, r11, ror #8 ; [p3 | p1] (B) + + usub16 r6, r8, r9 ; [d2 | d0] (B) + usub16 r7, r10, r11 ; [d3 | d1] (B) + + ldr r10, [r1, #8] ; src (C) + ldr r11, [r2], #4 ; pred (C) + + pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B) + pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B) + + str r8, [r0], #4 ; diff (B) + uxtb16 r8, r10 ; [s2 | s0] (C) + str r9, [r0], #4 ; diff (B) + + uxtb16 r9, r11 ; [p2 | p0] (C) + uxtb16 r10, r10, ror #8 ; [s3 | s1] (C) + uxtb16 r11, r11, ror #8 ; [p3 | p1] (C) + + usub16 r6, r8, r9 ; [d2 | d0] (C) + usub16 r7, r10, r11 ; [d3 | d1] (C) + + ldr r10, [r1, #12] ; src (D) + ldr r11, [r2], #4 ; pred (D) + + pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (C) + pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (C) + + str r8, [r0], #4 ; diff (C) + uxtb16 r8, r10 ; [s2 | s0] (D) + str r9, [r0], #4 ; diff (C) + + uxtb16 r9, r11 ; [p2 | p0] (D) + uxtb16 r10, r10, ror #8 ; [s3 | s1] (D) + uxtb16 r11, r11, ror #8 ; [p3 | p1] (D) + + usub16 r6, r8, r9 ; [d2 | d0] (D) + usub16 r7, r10, r11 ; [d3 | d1] (D) + + add r1, r1, r3 ; update src pointer + + pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (D) + pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (D) + + str r8, [r0], #4 ; diff (D) + subs r4, r4, #1 ; update loop counter + str r9, [r0], #4 ; diff (D) + + bne loop + + ldmfd sp!, {r4-r11} + mov pc, lr + + ENDP + + END + diff --cc vp9/encoder/arm/neon/fastquantizeb_neon.asm index 259707658,000000000..fa3aff8ac mode 100644,000000..100644 --- a/vp9/encoder/arm/neon/fastquantizeb_neon.asm +++ b/vp9/encoder/arm/neon/fastquantizeb_neon.asm @@@ -1,261 -1,0 +1,261 @@@ +; +; Copyright (c) 2011 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + EXPORT |vp8_fast_quantize_b_neon| + EXPORT |vp8_fast_quantize_b_pair_neon| + - INCLUDE asm_enc_offsets.asm ++ INCLUDE vp9_asm_enc_offsets.asm + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=4 + +;vp8_fast_quantize_b_pair_neon(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2); +|vp8_fast_quantize_b_pair_neon| PROC + + stmfd sp!, {r4-r9} + vstmdb sp!, {q4-q7} + + ldr r4, [r0, #vp8_block_coeff] + ldr r5, [r0, #vp8_block_quant_fast] + ldr r6, [r0, #vp8_block_round] + + vld1.16 {q0, q1}, [r4@128] ; load z + + ldr r7, [r2, #vp8_blockd_qcoeff] + + vabs.s16 q4, q0 ; calculate x = abs(z) + vabs.s16 q5, q1 + + ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative + vshr.s16 q2, q0, #15 ; sz + vshr.s16 q3, q1, #15 + + vld1.s16 {q6, q7}, [r6@128] ; load round_ptr [0-15] + vld1.s16 {q8, q9}, [r5@128] ; load quant_ptr [0-15] + + ldr r4, [r1, #vp8_block_coeff] + + vadd.s16 q4, q6 ; x + Round + vadd.s16 q5, q7 + + vld1.16 {q0, q1}, [r4@128] ; load z2 + + vqdmulh.s16 q4, q8 ; y = ((Round+abs(z)) * Quant) >> 16 + vqdmulh.s16 q5, q9 + + vabs.s16 q10, q0 ; calculate x2 = abs(z_2) + vabs.s16 q11, q1 + vshr.s16 q12, q0, #15 ; sz2 + vshr.s16 q13, q1, #15 + + ;modify data to have its original sign + veor.s16 q4, q2 ; y^sz + veor.s16 q5, q3 + + vadd.s16 q10, q6 ; x2 + Round + vadd.s16 q11, q7 + + ldr r8, [r2, #vp8_blockd_dequant] + + vqdmulh.s16 q10, q8 ; y2 = ((Round+abs(z)) * Quant) >> 16 + vqdmulh.s16 q11, q9 + + vshr.s16 q4, #1 ; right shift 1 after vqdmulh + vshr.s16 q5, #1 + + vld1.s16 {q6, q7}, [r8@128] ;load dequant_ptr[i] + + vsub.s16 q4, q2 ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement) + vsub.s16 q5, q3 + + vshr.s16 q10, #1 ; right shift 1 after vqdmulh + vshr.s16 q11, #1 + + ldr r9, [r2, #vp8_blockd_dqcoeff] + + veor.s16 q10, q12 ; y2^sz2 + veor.s16 q11, q13 + + vst1.s16 {q4, q5}, [r7] ; store: qcoeff = x1 + + + vsub.s16 q10, q12 ; x2=(y^sz)-sz = (y^sz)-(-1) (2's complement) + vsub.s16 q11, q13 + + ldr r6, [r3, #vp8_blockd_qcoeff] + + vmul.s16 q2, q6, q4 ; x * Dequant + vmul.s16 q3, q7, q5 + + ldr r0, _inv_zig_zag_ ; load ptr of inverse zigzag table + + vceq.s16 q8, q8 ; set q8 to all 1 + + vst1.s16 {q10, q11}, [r6] ; store: qcoeff = x2 + + vmul.s16 q12, q6, q10 ; x2 * Dequant + vmul.s16 q13, q7, q11 + + vld1.16 {q6, q7}, [r0@128] ; load inverse scan order + + vtst.16 q14, q4, q8 ; now find eob + vtst.16 q15, q5, q8 ; non-zero element is set to all 1 + + vst1.s16 {q2, q3}, [r9] ; store dqcoeff = x * Dequant + + ldr r7, [r3, #vp8_blockd_dqcoeff] + + vand q0, q6, q14 ; get all valid numbers from scan array + vand q1, q7, q15 + + vst1.s16 {q12, q13}, [r7] ; store dqcoeff = x * Dequant + + vtst.16 q2, q10, q8 ; now find eob + vtst.16 q3, q11, q8 ; non-zero element is set to all 1 + + vmax.u16 q0, q0, q1 ; find maximum value in q0, q1 + + vand q10, q6, q2 ; get all valid numbers from scan array + vand q11, q7, q3 + vmax.u16 q10, q10, q11 ; find maximum value in q10, q11 + + vmax.u16 d0, d0, d1 + vmax.u16 d20, d20, d21 + vmovl.u16 q0, d0 + vmovl.u16 q10, d20 + + + vmax.u32 d0, d0, d1 + vmax.u32 d20, d20, d21 + vpmax.u32 d0, d0, d0 + vpmax.u32 d20, d20, d20 + + add r4, r2, #vp8_blockd_eob + add r5, r3, #vp8_blockd_eob + + vst1.32 {d0[0]}, [r4@32] + vst1.32 {d20[0]}, [r5@32] + + vldmia sp!, {q4-q7} + ldmfd sp!, {r4-r9} + bx lr + + ENDP + +;void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d) +|vp8_fast_quantize_b_neon| PROC + + stmfd sp!, {r4-r7} + + ldr r3, [r0, #vp8_block_coeff] + ldr r4, [r0, #vp8_block_quant_fast] + ldr r5, [r0, #vp8_block_round] + + vld1.16 
{q0, q1}, [r3@128] ; load z + vorr.s16 q14, q0, q1 ; check if all zero (step 1) + ldr r6, [r1, #vp8_blockd_qcoeff] + ldr r7, [r1, #vp8_blockd_dqcoeff] + vorr.s16 d28, d28, d29 ; check if all zero (step 2) + + vabs.s16 q12, q0 ; calculate x = abs(z) + vabs.s16 q13, q1 + + ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative + vshr.s16 q2, q0, #15 ; sz + vmov r2, r3, d28 ; check if all zero (step 3) + vshr.s16 q3, q1, #15 + + vld1.s16 {q14, q15}, [r5@128]; load round_ptr [0-15] + vld1.s16 {q8, q9}, [r4@128] ; load quant_ptr [0-15] + + vadd.s16 q12, q14 ; x + Round + vadd.s16 q13, q15 + + ldr r0, _inv_zig_zag_ ; load ptr of inverse zigzag table + + vqdmulh.s16 q12, q8 ; y = ((Round+abs(z)) * Quant) >> 16 + vqdmulh.s16 q13, q9 + + vld1.16 {q10, q11}, [r0@128]; load inverse scan order + + vceq.s16 q8, q8 ; set q8 to all 1 + + ldr r4, [r1, #vp8_blockd_dequant] + + vshr.s16 q12, #1 ; right shift 1 after vqdmulh + vshr.s16 q13, #1 + + orr r2, r2, r3 ; check if all zero (step 4) + cmp r2, #0 ; check if all zero (step 5) + beq zero_output ; check if all zero (step 6) + + ;modify data to have its original sign + veor.s16 q12, q2 ; y^sz + veor.s16 q13, q3 + + vsub.s16 q12, q2 ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement) + vsub.s16 q13, q3 + + vld1.s16 {q2, q3}, [r4@128] ; load dequant_ptr[i] + + vtst.16 q14, q12, q8 ; now find eob + vtst.16 q15, q13, q8 ; non-zero element is set to all 1 + + vst1.s16 {q12, q13}, [r6@128]; store: qcoeff = x1 + + vand q10, q10, q14 ; get all valid numbers from scan array + vand q11, q11, q15 + + + vmax.u16 q0, q10, q11 ; find maximum value in q0, q1 + vmax.u16 d0, d0, d1 + vmovl.u16 q0, d0 + + vmul.s16 q2, q12 ; x * Dequant + vmul.s16 q3, q13 + + vmax.u32 d0, d0, d1 + vpmax.u32 d0, d0, d0 + + vst1.s16 {q2, q3}, [r7@128] ; store dqcoeff = x * Dequant + + add r4, r1, #vp8_blockd_eob + vst1.32 {d0[0]}, [r4@32] + + ldmfd sp!, {r4-r7} + bx lr + +zero_output + str r2, [r1, #vp8_blockd_eob] + vst1.s16 {q0, q1}, [r6@128] ; qcoeff = 0 + vst1.s16 {q0, q1}, [r7@128] ; dqcoeff = 0 + + ldmfd sp!, {r4-r7} + bx lr + + ENDP + +; default inverse zigzag table is defined in vp9/common/entropy.c +_inv_zig_zag_ + DCD inv_zig_zag + + ALIGN 16 ; enable use of @128 bit aligned loads +inv_zig_zag + DCW 0x0001, 0x0002, 0x0006, 0x0007 + DCW 0x0003, 0x0005, 0x0008, 0x000d + DCW 0x0004, 0x0009, 0x000c, 0x000e + DCW 0x000a, 0x000b, 0x000f, 0x0010 + + END + diff --cc vp9/encoder/arm/neon/subtract_neon.asm index 68c295062,000000000..eab14868e mode 100644,000000..100644 --- a/vp9/encoder/arm/neon/subtract_neon.asm +++ b/vp9/encoder/arm/neon/subtract_neon.asm @@@ -1,185 -1,0 +1,185 @@@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + EXPORT |vp8_subtract_b_neon| + EXPORT |vp8_subtract_mby_neon| + EXPORT |vp8_subtract_mbuv_neon| + - INCLUDE asm_enc_offsets.asm ++ INCLUDE vp9_asm_enc_offsets.asm + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch) +|vp8_subtract_b_neon| PROC + + stmfd sp!, {r4-r7} + + ldr r3, [r0, #vp8_block_base_src] + ldr r4, [r0, #vp8_block_src] + ldr r5, [r0, #vp8_block_src_diff] + ldr r3, [r3] + ldr r6, [r0, #vp8_block_src_stride] + add r3, r3, r4 ; src = *base_src + src + ldr r7, [r1, #vp8_blockd_predictor] + + vld1.8 {d0}, [r3], r6 ;load src + vld1.8 {d1}, [r7], r2 ;load pred + vld1.8 {d2}, [r3], r6 + vld1.8 {d3}, [r7], r2 + vld1.8 {d4}, [r3], r6 + vld1.8 {d5}, [r7], r2 + vld1.8 {d6}, [r3], r6 + vld1.8 {d7}, [r7], r2 + + vsubl.u8 q10, d0, d1 + vsubl.u8 q11, d2, d3 + vsubl.u8 q12, d4, d5 + vsubl.u8 q13, d6, d7 + + mov r2, r2, lsl #1 + + vst1.16 {d20}, [r5], r2 ;store diff + vst1.16 {d22}, [r5], r2 + vst1.16 {d24}, [r5], r2 + vst1.16 {d26}, [r5], r2 + + ldmfd sp!, {r4-r7} + bx lr + + ENDP + + +;========================================== +;void vp8_subtract_mby_neon(short *diff, unsigned char *src, unsigned char *pred, int stride) +|vp8_subtract_mby_neon| PROC + mov r12, #4 + +subtract_mby_loop + vld1.8 {q0}, [r1], r3 ;load src + vld1.8 {q1}, [r2]! ;load pred + vld1.8 {q2}, [r1], r3 + vld1.8 {q3}, [r2]! + vld1.8 {q4}, [r1], r3 + vld1.8 {q5}, [r2]! + vld1.8 {q6}, [r1], r3 + vld1.8 {q7}, [r2]! + + vsubl.u8 q8, d0, d2 + vsubl.u8 q9, d1, d3 + vsubl.u8 q10, d4, d6 + vsubl.u8 q11, d5, d7 + vsubl.u8 q12, d8, d10 + vsubl.u8 q13, d9, d11 + vsubl.u8 q14, d12, d14 + vsubl.u8 q15, d13, d15 + + vst1.16 {q8}, [r0]! ;store diff + vst1.16 {q9}, [r0]! + vst1.16 {q10}, [r0]! + vst1.16 {q11}, [r0]! + vst1.16 {q12}, [r0]! + vst1.16 {q13}, [r0]! + vst1.16 {q14}, [r0]! + vst1.16 {q15}, [r0]! + + subs r12, r12, #1 + bne subtract_mby_loop + + bx lr + ENDP + +;================================= +;void vp8_subtract_mbuv_neon(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) +|vp8_subtract_mbuv_neon| PROC + ldr r12, [sp] + +;u + add r0, r0, #512 ; short *udiff = diff + 256; + add r3, r3, #256 ; unsigned char *upred = pred + 256; + + vld1.8 {d0}, [r1], r12 ;load src + vld1.8 {d1}, [r3]! ;load pred + vld1.8 {d2}, [r1], r12 + vld1.8 {d3}, [r3]! + vld1.8 {d4}, [r1], r12 + vld1.8 {d5}, [r3]! + vld1.8 {d6}, [r1], r12 + vld1.8 {d7}, [r3]! + vld1.8 {d8}, [r1], r12 + vld1.8 {d9}, [r3]! + vld1.8 {d10}, [r1], r12 + vld1.8 {d11}, [r3]! + vld1.8 {d12}, [r1], r12 + vld1.8 {d13}, [r3]! + vld1.8 {d14}, [r1], r12 + vld1.8 {d15}, [r3]! + + vsubl.u8 q8, d0, d1 + vsubl.u8 q9, d2, d3 + vsubl.u8 q10, d4, d5 + vsubl.u8 q11, d6, d7 + vsubl.u8 q12, d8, d9 + vsubl.u8 q13, d10, d11 + vsubl.u8 q14, d12, d13 + vsubl.u8 q15, d14, d15 + + vst1.16 {q8}, [r0]! ;store diff + vst1.16 {q9}, [r0]! + vst1.16 {q10}, [r0]! + vst1.16 {q11}, [r0]! + vst1.16 {q12}, [r0]! + vst1.16 {q13}, [r0]! + vst1.16 {q14}, [r0]! + vst1.16 {q15}, [r0]! + +;v + vld1.8 {d0}, [r2], r12 ;load src + vld1.8 {d1}, [r3]! ;load pred + vld1.8 {d2}, [r2], r12 + vld1.8 {d3}, [r3]! + vld1.8 {d4}, [r2], r12 + vld1.8 {d5}, [r3]! + vld1.8 {d6}, [r2], r12 + vld1.8 {d7}, [r3]! + vld1.8 {d8}, [r2], r12 + vld1.8 {d9}, [r3]! + vld1.8 {d10}, [r2], r12 + vld1.8 {d11}, [r3]! + vld1.8 {d12}, [r2], r12 + vld1.8 {d13}, [r3]! + vld1.8 {d14}, [r2], r12 + vld1.8 {d15}, [r3]! 
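+ ; As for the U block above, each vsubl.u8/vst1.16 pair below
+ ; amounts to, per row of eight pixels (a hedged C sketch):
+ ;   for (j = 0; j < 8; j++)
+ ;     diff[j] = (short)vsrc[j] - (short)pred[j];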
+ + vsubl.u8 q8, d0, d1 + vsubl.u8 q9, d2, d3 + vsubl.u8 q10, d4, d5 + vsubl.u8 q11, d6, d7 + vsubl.u8 q12, d8, d9 + vsubl.u8 q13, d10, d11 + vsubl.u8 q14, d12, d13 + vsubl.u8 q15, d14, d15 + + vst1.16 {q8}, [r0]! ;store diff + vst1.16 {q9}, [r0]! + vst1.16 {q10}, [r0]! + vst1.16 {q11}, [r0]! + vst1.16 {q12}, [r0]! + vst1.16 {q13}, [r0]! + vst1.16 {q14}, [r0]! + vst1.16 {q15}, [r0]! + + bx lr + ENDP + + END diff --cc vp9/encoder/x86/quantize_sse2.asm index 9b563c514,000000000..af6aa6b3b mode 100644,000000..100644 --- a/vp9/encoder/x86/quantize_sse2.asm +++ b/vp9/encoder/x86/quantize_sse2.asm @@@ -1,380 -1,0 +1,380 @@@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" - %include "asm_enc_offsets.asm" ++%include "vp9_asm_enc_offsets.asm" + + +; void vp9_regular_quantize_b_sse2 | arg +; (BLOCK *b, | 0 +; BLOCKD *d) | 1 + +global sym(vp9_regular_quantize_b_sse2) +sym(vp9_regular_quantize_b_sse2): + push rbp + mov rbp, rsp + SAVE_XMM 7 + GET_GOT rbx + +%if ABI_IS_32BIT + push rdi + push rsi +%else + %ifidn __OUTPUT_FORMAT__,x64 + push rdi + push rsi + %endif +%endif + + ALIGN_STACK 16, rax + %define zrun_zbin_boost 0 ; 8 + %define abs_minus_zbin 8 ; 32 + %define temp_qcoeff 40 ; 32 + %define qcoeff 72 ; 32 + %define stack_size 104 + sub rsp, stack_size + ; end prolog + +%if ABI_IS_32BIT + mov rdi, arg(0) ; BLOCK *b + mov rsi, arg(1) ; BLOCKD *d +%else + %ifidn __OUTPUT_FORMAT__,x64 + mov rdi, rcx ; BLOCK *b + mov rsi, rdx ; BLOCKD *d + %else + ;mov rdi, rdi ; BLOCK *b + ;mov rsi, rsi ; BLOCKD *d + %endif +%endif + + mov rdx, [rdi + vp9_block_coeff] ; coeff_ptr + mov rcx, [rdi + vp9_block_zbin] ; zbin_ptr + movd xmm7, [rdi + vp9_block_zbin_extra] ; zbin_oq_value + + ; z + movdqa xmm0, [rdx] + movdqa xmm4, [rdx + 16] + mov rdx, [rdi + vp9_block_round] ; round_ptr + + pshuflw xmm7, xmm7, 0 + punpcklwd xmm7, xmm7 ; duplicated zbin_oq_value + + movdqa xmm1, xmm0 + movdqa xmm5, xmm4 + + ; sz + psraw xmm0, 15 + psraw xmm4, 15 + + ; (z ^ sz) + pxor xmm1, xmm0 + pxor xmm5, xmm4 + + ; x = abs(z) + psubw xmm1, xmm0 + psubw xmm5, xmm4 + + movdqa xmm2, [rcx] + movdqa xmm3, [rcx + 16] + mov rcx, [rdi + vp9_block_quant] ; quant_ptr + + ; *zbin_ptr + zbin_oq_value + paddw xmm2, xmm7 + paddw xmm3, xmm7 + + ; x - (*zbin_ptr + zbin_oq_value) + psubw xmm1, xmm2 + psubw xmm5, xmm3 + movdqa [rsp + abs_minus_zbin], xmm1 + movdqa [rsp + abs_minus_zbin + 16], xmm5 + + ; add (zbin_ptr + zbin_oq_value) back + paddw xmm1, xmm2 + paddw xmm5, xmm3 + + movdqa xmm2, [rdx] + movdqa xmm6, [rdx + 16] + + movdqa xmm3, [rcx] + movdqa xmm7, [rcx + 16] + + ; x + round + paddw xmm1, xmm2 + paddw xmm5, xmm6 + + ; y = x * quant_ptr >> 16 + pmulhw xmm3, xmm1 + pmulhw xmm7, xmm5 + + ; y += x + paddw xmm1, xmm3 + paddw xmm5, xmm7 + + movdqa [rsp + temp_qcoeff], xmm1 + movdqa [rsp + temp_qcoeff + 16], xmm5 + + pxor xmm6, xmm6 + ; zero qcoeff + movdqa [rsp + qcoeff], xmm6 + movdqa [rsp + qcoeff + 16], xmm6 + + mov rdx, [rdi + vp9_block_zrun_zbin_boost] ; zbin_boost_ptr + mov rax, [rdi + vp9_block_quant_shift] ; quant_shift_ptr + mov [rsp + zrun_zbin_boost], rdx + +%macro ZIGZAG_LOOP 1 + ; x + movsx ecx, WORD PTR[rsp + abs_minus_zbin + %1 * 2] + + ; if (x >= zbin) + sub cx, WORD PTR[rdx] ; x - zbin + lea rdx, [rdx + 
2] ; zbin_boost_ptr++ + jl .rq_zigzag_loop_%1 ; x < zbin + + movsx edi, WORD PTR[rsp + temp_qcoeff + %1 * 2] + + ; downshift by quant_shift[rc] + movsx cx, BYTE PTR[rax + %1] ; quant_shift_ptr[rc] + sar edi, cl ; also sets Z bit + je .rq_zigzag_loop_%1 ; !y + mov WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc] + mov rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost +.rq_zigzag_loop_%1: +%endmacro +; in vp9_default_zig_zag1d order: see vp9/common/entropy.c +ZIGZAG_LOOP 0 +ZIGZAG_LOOP 1 +ZIGZAG_LOOP 4 +ZIGZAG_LOOP 8 +ZIGZAG_LOOP 5 +ZIGZAG_LOOP 2 +ZIGZAG_LOOP 3 +ZIGZAG_LOOP 6 +ZIGZAG_LOOP 9 +ZIGZAG_LOOP 12 +ZIGZAG_LOOP 13 +ZIGZAG_LOOP 10 +ZIGZAG_LOOP 7 +ZIGZAG_LOOP 11 +ZIGZAG_LOOP 14 +ZIGZAG_LOOP 15 + + movdqa xmm2, [rsp + qcoeff] + movdqa xmm3, [rsp + qcoeff + 16] + + mov rcx, [rsi + vp9_blockd_dequant] ; dequant_ptr + mov rdi, [rsi + vp9_blockd_dqcoeff] ; dqcoeff_ptr + + ; y ^ sz + pxor xmm2, xmm0 + pxor xmm3, xmm4 + ; x = (y ^ sz) - sz + psubw xmm2, xmm0 + psubw xmm3, xmm4 + + ; dequant + movdqa xmm0, [rcx] + movdqa xmm1, [rcx + 16] + + mov rcx, [rsi + vp9_blockd_qcoeff] ; qcoeff_ptr + + pmullw xmm0, xmm2 + pmullw xmm1, xmm3 + + movdqa [rcx], xmm2 ; store qcoeff + movdqa [rcx + 16], xmm3 + movdqa [rdi], xmm0 ; store dqcoeff + movdqa [rdi + 16], xmm1 + + ; select the last value (in zig_zag order) for EOB + pcmpeqw xmm2, xmm6 + pcmpeqw xmm3, xmm6 + ; ! + pcmpeqw xmm6, xmm6 + pxor xmm2, xmm6 + pxor xmm3, xmm6 + ; mask inv_zig_zag + pand xmm2, [GLOBAL(inv_zig_zag)] + pand xmm3, [GLOBAL(inv_zig_zag + 16)] + ; select the max value + pmaxsw xmm2, xmm3 + pshufd xmm3, xmm2, 00001110b + pmaxsw xmm2, xmm3 + pshuflw xmm3, xmm2, 00001110b + pmaxsw xmm2, xmm3 + pshuflw xmm3, xmm2, 00000001b + pmaxsw xmm2, xmm3 + movd eax, xmm2 + and eax, 0xff + mov [rsi + vp9_blockd_eob], eax + + ; begin epilog + add rsp, stack_size + pop rsp +%if ABI_IS_32BIT + pop rsi + pop rdi +%else + %ifidn __OUTPUT_FORMAT__,x64 + pop rsi + pop rdi + %endif +%endif + RESTORE_GOT + RESTORE_XMM + pop rbp + ret + +; void vp9_fast_quantize_b_sse2 | arg +; (BLOCK *b, | 0 +; BLOCKD *d) | 1 + +global sym(vp9_fast_quantize_b_sse2) +sym(vp9_fast_quantize_b_sse2): + push rbp + mov rbp, rsp + GET_GOT rbx + +%if ABI_IS_32BIT + push rdi + push rsi +%else + %ifidn __OUTPUT_FORMAT__,x64 + push rdi + push rsi + %else + ; these registers are used for passing arguments + %endif +%endif + + ; end prolog + +%if ABI_IS_32BIT + mov rdi, arg(0) ; BLOCK *b + mov rsi, arg(1) ; BLOCKD *d +%else + %ifidn __OUTPUT_FORMAT__,x64 + mov rdi, rcx ; BLOCK *b + mov rsi, rdx ; BLOCKD *d + %else + ;mov rdi, rdi ; BLOCK *b + ;mov rsi, rsi ; BLOCKD *d + %endif +%endif + + mov rax, [rdi + vp9_block_coeff] + mov rcx, [rdi + vp9_block_round] + mov rdx, [rdi + vp9_block_quant_fast] + + ; z = coeff + movdqa xmm0, [rax] + movdqa xmm4, [rax + 16] + + ; dup z so we can save sz + movdqa xmm1, xmm0 + movdqa xmm5, xmm4 + + ; sz = z >> 15 + psraw xmm0, 15 + psraw xmm4, 15 + + ; x = abs(z) = (z ^ sz) - sz + pxor xmm1, xmm0 + pxor xmm5, xmm4 + psubw xmm1, xmm0 + psubw xmm5, xmm4 + + ; x += round + paddw xmm1, [rcx] + paddw xmm5, [rcx + 16] + + mov rax, [rsi + vp9_blockd_qcoeff] + mov rcx, [rsi + vp9_blockd_dequant] + mov rdi, [rsi + vp9_blockd_dqcoeff] + + ; y = x * quant >> 16 + pmulhw xmm1, [rdx] + pmulhw xmm5, [rdx + 16] + + ; x = (y ^ sz) - sz + pxor xmm1, xmm0 + pxor xmm5, xmm4 + psubw xmm1, xmm0 + psubw xmm5, xmm4 + + ; qcoeff = x + movdqa [rax], xmm1 + movdqa [rax + 16], xmm5 + + ; x * dequant + movdqa xmm2, xmm1 + movdqa xmm3, xmm5 + pmullw 
xmm2, [rcx] + pmullw xmm3, [rcx + 16] + + ; dqcoeff = x * dequant + movdqa [rdi], xmm2 + movdqa [rdi + 16], xmm3 + + pxor xmm4, xmm4 ;clear all bits + pcmpeqw xmm1, xmm4 + pcmpeqw xmm5, xmm4 + + pcmpeqw xmm4, xmm4 ;set all bits + pxor xmm1, xmm4 + pxor xmm5, xmm4 + + pand xmm1, [GLOBAL(inv_zig_zag)] + pand xmm5, [GLOBAL(inv_zig_zag + 16)] + + pmaxsw xmm1, xmm5 + + ; now down to 8 + pshufd xmm5, xmm1, 00001110b + + pmaxsw xmm1, xmm5 + + ; only 4 left + pshuflw xmm5, xmm1, 00001110b + + pmaxsw xmm1, xmm5 + + ; okay, just 2! + pshuflw xmm5, xmm1, 00000001b + + pmaxsw xmm1, xmm5 + + movd eax, xmm1 + and eax, 0xff + mov [rsi + vp9_blockd_eob], eax + + ; begin epilog +%if ABI_IS_32BIT + pop rsi + pop rdi +%else + %ifidn __OUTPUT_FORMAT__,x64 + pop rsi + pop rdi + %endif +%endif + + RESTORE_GOT + pop rbp + ret + +SECTION_RODATA +align 16 +inv_zig_zag: + dw 0x0001, 0x0002, 0x0006, 0x0007 + dw 0x0003, 0x0005, 0x0008, 0x000d + dw 0x0004, 0x0009, 0x000c, 0x000e + dw 0x000a, 0x000b, 0x000f, 0x0010 diff --cc vp9/encoder/x86/quantize_sse4.asm index c7429ecd3,000000000..4245b947c mode 100644,000000..100644 --- a/vp9/encoder/x86/quantize_sse4.asm +++ b/vp9/encoder/x86/quantize_sse4.asm @@@ -1,254 -1,0 +1,254 @@@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" - %include "asm_enc_offsets.asm" ++%include "vp9_asm_enc_offsets.asm" + + +; void vp9_regular_quantize_b_sse4 | arg +; (BLOCK *b, | 0 +; BLOCKD *d) | 1 + +global sym(vp9_regular_quantize_b_sse4) +sym(vp9_regular_quantize_b_sse4): + +%if ABI_IS_32BIT + push rbp + mov rbp, rsp + GET_GOT rbx + push rdi + push rsi + + ALIGN_STACK 16, rax + %define qcoeff 0 ; 32 + %define stack_size 32 + sub rsp, stack_size +%else + %ifidn __OUTPUT_FORMAT__,x64 + SAVE_XMM 8, u + push rdi + push rsi + %endif +%endif + ; end prolog + +%if ABI_IS_32BIT + mov rdi, arg(0) ; BLOCK *b + mov rsi, arg(1) ; BLOCKD *d +%else + %ifidn __OUTPUT_FORMAT__,x64 + mov rdi, rcx ; BLOCK *b + mov rsi, rdx ; BLOCKD *d + %else + ;mov rdi, rdi ; BLOCK *b + ;mov rsi, rsi ; BLOCKD *d + %endif +%endif + + mov rax, [rdi + vp9_block_coeff] + mov rcx, [rdi + vp9_block_zbin] + mov rdx, [rdi + vp9_block_round] + movd xmm7, [rdi + vp9_block_zbin_extra] + + ; z + movdqa xmm0, [rax] + movdqa xmm1, [rax + 16] + + ; duplicate zbin_oq_value + pshuflw xmm7, xmm7, 0 + punpcklwd xmm7, xmm7 + + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + ; sz + psraw xmm0, 15 + psraw xmm1, 15 + + ; (z ^ sz) + pxor xmm2, xmm0 + pxor xmm3, xmm1 + + ; x = abs(z) + psubw xmm2, xmm0 + psubw xmm3, xmm1 + + ; zbin + movdqa xmm4, [rcx] + movdqa xmm5, [rcx + 16] + + ; *zbin_ptr + zbin_oq_value + paddw xmm4, xmm7 + paddw xmm5, xmm7 + + movdqa xmm6, xmm2 + movdqa xmm7, xmm3 + + ; x - (*zbin_ptr + zbin_oq_value) + psubw xmm6, xmm4 + psubw xmm7, xmm5 + + ; round + movdqa xmm4, [rdx] + movdqa xmm5, [rdx + 16] + + mov rax, [rdi + vp9_block_quant_shift] + mov rcx, [rdi + vp9_block_quant] + mov rdx, [rdi + vp9_block_zrun_zbin_boost] + + ; x + round + paddw xmm2, xmm4 + paddw xmm3, xmm5 + + ; quant + movdqa xmm4, [rcx] + movdqa xmm5, [rcx + 16] + + ; y = x * quant_ptr >> 16 + pmulhw xmm4, xmm2 + pmulhw xmm5, xmm3 + + ; y += x + paddw xmm2, xmm4 + paddw xmm3, xmm5 + + pxor xmm4, xmm4 +%if ABI_IS_32BIT + movdqa [rsp + 
qcoeff], xmm4 + movdqa [rsp + qcoeff + 16], xmm4 +%else + pxor xmm8, xmm8 +%endif + + ; quant_shift + movdqa xmm5, [rax] + + ; zrun_zbin_boost + mov rax, rdx + +%macro ZIGZAG_LOOP 5 + ; x + pextrw ecx, %4, %2 + + ; if (x >= zbin) + sub cx, WORD PTR[rdx] ; x - zbin + lea rdx, [rdx + 2] ; zbin_boost_ptr++ + jl .rq_zigzag_loop_%1 ; x < zbin + + pextrw edi, %3, %2 ; y + + ; downshift by quant_shift[rc] + pextrb ecx, xmm5, %1 ; quant_shift[rc] + sar edi, cl ; also sets Z bit + je .rq_zigzag_loop_%1 ; !y +%if ABI_IS_32BIT + mov WORD PTR[rsp + qcoeff + %1 *2], di +%else + pinsrw %5, edi, %2 ; qcoeff[rc] +%endif + mov rdx, rax ; reset to b->zrun_zbin_boost +.rq_zigzag_loop_%1: +%endmacro +; in vp9_default_zig_zag1d order: see vp9/common/entropy.c +ZIGZAG_LOOP 0, 0, xmm2, xmm6, xmm4 +ZIGZAG_LOOP 1, 1, xmm2, xmm6, xmm4 +ZIGZAG_LOOP 4, 4, xmm2, xmm6, xmm4 +ZIGZAG_LOOP 8, 0, xmm3, xmm7, xmm8 +ZIGZAG_LOOP 5, 5, xmm2, xmm6, xmm4 +ZIGZAG_LOOP 2, 2, xmm2, xmm6, xmm4 +ZIGZAG_LOOP 3, 3, xmm2, xmm6, xmm4 +ZIGZAG_LOOP 6, 6, xmm2, xmm6, xmm4 +ZIGZAG_LOOP 9, 1, xmm3, xmm7, xmm8 +ZIGZAG_LOOP 12, 4, xmm3, xmm7, xmm8 +ZIGZAG_LOOP 13, 5, xmm3, xmm7, xmm8 +ZIGZAG_LOOP 10, 2, xmm3, xmm7, xmm8 +ZIGZAG_LOOP 7, 7, xmm2, xmm6, xmm4 +ZIGZAG_LOOP 11, 3, xmm3, xmm7, xmm8 +ZIGZAG_LOOP 14, 6, xmm3, xmm7, xmm8 +ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8 + + mov rcx, [rsi + vp9_blockd_dequant] + mov rdi, [rsi + vp9_blockd_dqcoeff] + +%if ABI_IS_32BIT + movdqa xmm4, [rsp + qcoeff] + movdqa xmm5, [rsp + qcoeff + 16] +%else + %define xmm5 xmm8 +%endif + + ; y ^ sz + pxor xmm4, xmm0 + pxor xmm5, xmm1 + ; x = (y ^ sz) - sz + psubw xmm4, xmm0 + psubw xmm5, xmm1 + + ; dequant + movdqa xmm0, [rcx] + movdqa xmm1, [rcx + 16] + + mov rcx, [rsi + vp9_blockd_qcoeff] + + pmullw xmm0, xmm4 + pmullw xmm1, xmm5 + + ; store qcoeff + movdqa [rcx], xmm4 + movdqa [rcx + 16], xmm5 + + ; store dqcoeff + movdqa [rdi], xmm0 + movdqa [rdi + 16], xmm1 + + ; select the last value (in zig_zag order) for EOB + pxor xmm6, xmm6 + pcmpeqw xmm4, xmm6 + pcmpeqw xmm5, xmm6 + + packsswb xmm4, xmm5 + pshufb xmm4, [GLOBAL(zig_zag1d)] + pmovmskb edx, xmm4 + xor rdi, rdi + mov eax, -1 + xor dx, ax + bsr eax, edx + sub edi, edx + sar edi, 31 + add eax, 1 + and eax, edi + + mov [rsi + vp9_blockd_eob], eax + + ; begin epilog +%if ABI_IS_32BIT + add rsp, stack_size + pop rsp + + pop rsi + pop rdi + RESTORE_GOT + pop rbp +%else + %undef xmm5 + %ifidn __OUTPUT_FORMAT__,x64 + pop rsi + pop rdi + RESTORE_XMM + %endif +%endif + + ret + +SECTION_RODATA +align 16 +; vp9/common/entropy.c: vp9_default_zig_zag1d +zig_zag1d: + db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 diff --cc vp9/encoder/x86/quantize_ssse3.asm index 14a9912d2,000000000..8c464287a mode 100644,000000..100644 --- a/vp9/encoder/x86/quantize_ssse3.asm +++ b/vp9/encoder/x86/quantize_ssse3.asm @@@ -1,138 -1,0 +1,138 @@@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + +%include "vpx_ports/x86_abi_support.asm" - %include "asm_enc_offsets.asm" ++%include "vp9_asm_enc_offsets.asm" + + +; void vp9_fast_quantize_b_ssse3 | arg +; (BLOCK *b, | 0 +; BLOCKD *d) | 1 +; + +global sym(vp9_fast_quantize_b_ssse3) +sym(vp9_fast_quantize_b_ssse3): + push rbp + mov rbp, rsp + GET_GOT rbx + +%if ABI_IS_32BIT + push rdi + push rsi +%else + %ifidn __OUTPUT_FORMAT__,x64 + push rdi + push rsi + %endif +%endif + ; end prolog + +%if ABI_IS_32BIT + mov rdi, arg(0) ; BLOCK *b + mov rsi, arg(1) ; BLOCKD *d +%else + %ifidn __OUTPUT_FORMAT__,x64 + mov rdi, rcx ; BLOCK *b + mov rsi, rdx ; BLOCKD *d + %else + ;mov rdi, rdi ; BLOCK *b + ;mov rsi, rsi ; BLOCKD *d + %endif +%endif + + mov rax, [rdi + vp9_block_coeff] + mov rcx, [rdi + vp9_block_round] + mov rdx, [rdi + vp9_block_quant_fast] + + ; coeff + movdqa xmm0, [rax] + movdqa xmm4, [rax + 16] + + ; round + movdqa xmm2, [rcx] + movdqa xmm3, [rcx + 16] + + movdqa xmm1, xmm0 + movdqa xmm5, xmm4 + + ; sz = z >> 15 + psraw xmm0, 15 + psraw xmm4, 15 + + pabsw xmm1, xmm1 + pabsw xmm5, xmm5 + + paddw xmm1, xmm2 + paddw xmm5, xmm3 + + ; quant_fast + pmulhw xmm1, [rdx] + pmulhw xmm5, [rdx + 16] + + mov rax, [rsi + vp9_blockd_qcoeff] + mov rdi, [rsi + vp9_blockd_dequant] + mov rcx, [rsi + vp9_blockd_dqcoeff] + + pxor xmm1, xmm0 + pxor xmm5, xmm4 + psubw xmm1, xmm0 + psubw xmm5, xmm4 + + movdqa [rax], xmm1 + movdqa [rax + 16], xmm5 + + movdqa xmm2, [rdi] + movdqa xmm3, [rdi + 16] + + pxor xmm4, xmm4 + pmullw xmm2, xmm1 + pmullw xmm3, xmm5 + + pcmpeqw xmm1, xmm4 ;non zero mask + pcmpeqw xmm5, xmm4 ;non zero mask + packsswb xmm1, xmm5 + pshufb xmm1, [GLOBAL(zz_shuf)] + + pmovmskb edx, xmm1 + + xor rdi, rdi + mov eax, -1 + xor dx, ax ;flip the bits for bsr + bsr eax, edx + + movdqa [rcx], xmm2 ;store dqcoeff + movdqa [rcx + 16], xmm3 ;store dqcoeff + + sub edi, edx ;check for all zeros in bit mask + sar edi, 31 ;0 or -1 + add eax, 1 + and eax, edi ;if the bit mask was all zero, + ;then eob = 0 + mov [rsi + vp9_blockd_eob], eax + + ; begin epilog +%if ABI_IS_32BIT + pop rsi + pop rdi +%else + %ifidn __OUTPUT_FORMAT__,x64 + pop rsi + pop rdi + %endif +%endif + + RESTORE_GOT + pop rbp + ret + +SECTION_RODATA +align 16 +zz_shuf: + db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 diff --cc vp9/vp9_common.mk index c5237d846,000000000..82e1e1053 mode 100644,000000..100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@@ -1,179 -1,0 +1,183 @@@ +## +## Copyright (c) 2010 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. 
+## + +VP9_COMMON_SRCS-yes += vp9_common.mk +VP9_COMMON_SRCS-yes += common/type_aliases.h +VP9_COMMON_SRCS-yes += common/pragmas.h +VP9_COMMON_SRCS-yes += common/ppflags.h +VP9_COMMON_SRCS-yes += common/onyx.h +VP9_COMMON_SRCS-yes += common/onyxd.h +VP9_COMMON_SRCS-yes += common/alloccommon.c +VP9_COMMON_SRCS-yes += common/asm_com_offsets.c +VP9_COMMON_SRCS-yes += common/blockd.c +VP9_COMMON_SRCS-yes += common/coefupdateprobs.h +VP9_COMMON_SRCS-yes += common/debugmodes.c +VP9_COMMON_SRCS-yes += common/entropy.c +VP9_COMMON_SRCS-yes += common/entropymode.c +VP9_COMMON_SRCS-yes += common/entropymv.c +VP9_COMMON_SRCS-yes += common/extend.c +VP9_COMMON_SRCS-yes += common/filter.c +VP9_COMMON_SRCS-yes += common/filter.h +VP9_COMMON_SRCS-yes += common/findnearmv.c +VP9_COMMON_SRCS-yes += common/generic/systemdependent.c +VP9_COMMON_SRCS-yes += common/idctllm.c +VP9_COMMON_SRCS-yes += common/alloccommon.h +VP9_COMMON_SRCS-yes += common/blockd.h +VP9_COMMON_SRCS-yes += common/common.h +VP9_COMMON_SRCS-yes += common/common_types.h +VP9_COMMON_SRCS-yes += common/entropy.h +VP9_COMMON_SRCS-yes += common/entropymode.h +VP9_COMMON_SRCS-yes += common/entropymv.h +VP9_COMMON_SRCS-yes += common/extend.h +VP9_COMMON_SRCS-yes += common/findnearmv.h +VP9_COMMON_SRCS-yes += common/header.h +VP9_COMMON_SRCS-yes += common/idct.h +VP9_COMMON_SRCS-yes += common/invtrans.h +VP9_COMMON_SRCS-yes += common/loopfilter.h +VP9_COMMON_SRCS-yes += common/modecont.h +VP9_COMMON_SRCS-yes += common/mv.h +VP9_COMMON_SRCS-yes += common/onyxc_int.h +VP9_COMMON_SRCS-yes += common/pred_common.h +VP9_COMMON_SRCS-yes += common/pred_common.c +VP9_COMMON_SRCS-yes += common/quant_common.h +VP9_COMMON_SRCS-yes += common/reconinter.h +VP9_COMMON_SRCS-yes += common/reconintra.h +VP9_COMMON_SRCS-yes += common/reconintra4x4.h +VP9_COMMON_SRCS-yes += common/rtcd.c +VP9_COMMON_SRCS-yes += common/rtcd_defs.sh +VP9_COMMON_SRCS-yes += common/sadmxn.h +VP9_COMMON_SRCS-yes += common/seg_common.h +VP9_COMMON_SRCS-yes += common/seg_common.c +VP9_COMMON_SRCS-yes += common/setupintrarecon.h +VP9_COMMON_SRCS-yes += common/subpixel.h +VP9_COMMON_SRCS-yes += common/swapyv12buffer.h +VP9_COMMON_SRCS-yes += common/systemdependent.h +VP9_COMMON_SRCS-yes += common/treecoder.h +VP9_COMMON_SRCS-yes += common/invtrans.c +VP9_COMMON_SRCS-yes += common/loopfilter.c +VP9_COMMON_SRCS-yes += common/loopfilter_filters.c +VP9_COMMON_SRCS-yes += common/mbpitch.c +VP9_COMMON_SRCS-yes += common/modecont.c +VP9_COMMON_SRCS-yes += common/modecontext.c +VP9_COMMON_SRCS-yes += common/mvref_common.c +VP9_COMMON_SRCS-yes += common/mvref_common.h +VP9_COMMON_SRCS-yes += common/quant_common.c +VP9_COMMON_SRCS-yes += common/recon.c +VP9_COMMON_SRCS-yes += common/reconinter.c +VP9_COMMON_SRCS-yes += common/reconintra.c +VP9_COMMON_SRCS-yes += common/reconintra4x4.c +VP9_COMMON_SRCS-yes += common/setupintrarecon.c +VP9_COMMON_SRCS-yes += common/swapyv12buffer.c +VP9_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/textblit.c +VP9_COMMON_SRCS-yes += common/treecoder.c +VP9_COMMON_SRCS-$(CONFIG_IMPLICIT_SEGMENTATION) += common/implicit_segmentation.c + +VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/idct_x86.h +VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/subpixel_x86.h +VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/loopfilter_x86.h +VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/postproc_x86.h +VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/x86_systemdependent.c +VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += 
common/x86/vp8_asm_stubs.c +VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/loopfilter_x86.c +VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.h +VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.c +VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/idctllm_mmx.asm +VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/iwalsh_mmx.asm +VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm +VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm +VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/loopfilter_mmx.asm +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_wrapper_sse2.c +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm +VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_8t_ssse3.asm +VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm +ifeq ($(CONFIG_POSTPROC),yes) +VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/postproc_sse2.asm +endif + +# common (c) +ifeq ($(CONFIG_CSM),yes) +VP9_COMMON_SRCS-yes += common/maskingmv.c +VP9_COMMON_SRCS-$(HAVE_SSE3) += common/x86/mask_sse3.asm +endif + +VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/filter_sse4.c +ifeq ($(HAVE_SSE4_1),yes) +vp9/common/x86/filter_sse4.c.o: CFLAGS += -msse4 +endif + +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/filter_sse2.c +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/sadmxn_x86.c +ifeq ($(HAVE_SSE2),yes) +vp9/common/x86/filter_sse2.c.o: CFLAGS += -msse2 +vp9/common/x86/loopfilter_x86.c.o: CFLAGS += -msse2 +vp9/common/x86/sadmxn_x86.c.o: CFLAGS += -msse2 +endif + +VP9_COMMON_SRCS-$(ARCH_ARM) += common/arm/arm_systemdependent.c +VP9_COMMON_SRCS-$(ARCH_ARM) += common/arm/bilinearfilter_arm.c +VP9_COMMON_SRCS-$(ARCH_ARM) += common/arm/bilinearfilter_arm.h +VP9_COMMON_SRCS-$(ARCH_ARM) += common/arm/filter_arm.c +VP9_COMMON_SRCS-$(ARCH_ARM) += common/arm/idct_arm.h +VP9_COMMON_SRCS-$(ARCH_ARM) += common/arm/loopfilter_arm.c +VP9_COMMON_SRCS-$(ARCH_ARM) += common/arm/loopfilter_arm.h +VP9_COMMON_SRCS-$(ARCH_ARM) += common/arm/recon_arm.h +VP9_COMMON_SRCS-$(ARCH_ARM) += common/arm/reconintra_arm.c +VP9_COMMON_SRCS-$(ARCH_ARM) += common/arm/subpixel_arm.h + +# common (armv6) +VP9_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/bilinearfilter_v6$(ASM) +VP9_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/copymem8x4_v6$(ASM) +VP9_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/copymem8x8_v6$(ASM) +VP9_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/copymem16x16_v6$(ASM) +VP9_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/dc_only_idct_add_v6$(ASM) +VP9_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/iwalsh_v6$(ASM) +VP9_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/filter_v6$(ASM) +VP9_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/idct_v6$(ASM) +VP9_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/loopfilter_v6$(ASM) +VP9_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/recon_v6$(ASM) +VP9_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/simpleloopfilter_v6$(ASM) +VP9_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/sixtappredict8x4_v6$(ASM) + +# common (neon) +VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/bilinearpredict4x4_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/bilinearpredict8x4_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/bilinearpredict8x8_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_ARMV7) += 
common/arm/neon/bilinearpredict16x16_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/copymem8x4_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/copymem8x8_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/copymem16x16_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/dc_only_idct_add_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/iwalsh_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfilter_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfiltersimplehorizontaledge_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/mbloopfilter_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon2b_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon4b_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/reconb_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/shortidct4x4llm_1_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/shortidct4x4llm_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict4x4_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict8x4_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict8x8_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict16x16_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon16x16mb_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/buildintrapredictorsmby_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/save_neon_reg$(ASM) +VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon_neon.c ++ ++ ++$(eval $(call asm_offsets_template,\ ++ vp9_asm_com_offsets.asm, $(VP9_PREFIX)common/asm_com_offsets.c)) diff --cc vp9/vp9cx.mk index ebb7a575d,000000000..6e157b0c4 mode 100644,000000..100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@@ -1,120 -1,0 +1,120 @@@ +## +## Copyright (c) 2010 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. 
+## + - - include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9_common.mk - +VP9_CX_EXPORTS += exports_enc + +VP9_CX_SRCS-yes += $(VP9_COMMON_SRCS-yes) +VP9_CX_SRCS-no += $(VP9_COMMON_SRCS-no) +VP9_CX_SRCS_REMOVE-yes += $(VP9_COMMON_SRCS_REMOVE-yes) +VP9_CX_SRCS_REMOVE-no += $(VP9_COMMON_SRCS_REMOVE-no) + +ifeq ($(ARCH_ARM),yes) + include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9cx_arm.mk +endif + +VP9_CX_SRCS-yes += vp9_cx_iface.c + +# encoder +#INCLUDES += algo/vpx_common/vpx_mem/include +#INCLUDES += common +#INCLUDES += common +#INCLUDES += common +#INCLUDES += algo/vpx_ref/cpu_id/include +#INCLUDES += common +#INCLUDES += encoder + +VP9_CX_SRCS-yes += encoder/asm_enc_offsets.c +VP9_CX_SRCS-yes += encoder/bitstream.c +VP9_CX_SRCS-yes += encoder/boolhuff.c +VP9_CX_SRCS-yes += encoder/dct.c +VP9_CX_SRCS-yes += encoder/encodeframe.c +VP9_CX_SRCS-yes += encoder/encodeintra.c +VP9_CX_SRCS-yes += encoder/encodemb.c +VP9_CX_SRCS-yes += encoder/encodemv.c +VP9_CX_SRCS-yes += encoder/firstpass.c +VP9_CX_SRCS-yes += encoder/generic/csystemdependent.c +VP9_CX_SRCS-yes += encoder/block.h +VP9_CX_SRCS-yes += encoder/boolhuff.h +VP9_CX_SRCS-yes += encoder/bitstream.h +VP9_CX_SRCS-yes += encoder/encodeintra.h +VP9_CX_SRCS-yes += encoder/encodemb.h +VP9_CX_SRCS-yes += encoder/encodemv.h +VP9_CX_SRCS-yes += encoder/firstpass.h +VP9_CX_SRCS-yes += encoder/lookahead.c +VP9_CX_SRCS-yes += encoder/lookahead.h +VP9_CX_SRCS-yes += encoder/mcomp.h +VP9_CX_SRCS-yes += encoder/modecosts.h +VP9_CX_SRCS-yes += encoder/onyx_int.h +VP9_CX_SRCS-yes += encoder/psnr.h +VP9_CX_SRCS-yes += encoder/quantize.h +VP9_CX_SRCS-yes += encoder/ratectrl.h +VP9_CX_SRCS-yes += encoder/rdopt.h +VP9_CX_SRCS-yes += encoder/tokenize.h +VP9_CX_SRCS-yes += encoder/treewriter.h +VP9_CX_SRCS-yes += encoder/variance.h +VP9_CX_SRCS-yes += encoder/mcomp.c +VP9_CX_SRCS-yes += encoder/modecosts.c +VP9_CX_SRCS-yes += encoder/onyx_if.c +VP9_CX_SRCS-yes += encoder/picklpf.c +VP9_CX_SRCS-yes += encoder/psnr.c +VP9_CX_SRCS-yes += encoder/quantize.c +VP9_CX_SRCS-yes += encoder/ratectrl.c +VP9_CX_SRCS-yes += encoder/rdopt.c +VP9_CX_SRCS-yes += encoder/sad_c.c +VP9_CX_SRCS-yes += encoder/satd_c.c +VP9_CX_SRCS-yes += encoder/segmentation.c +VP9_CX_SRCS-yes += encoder/segmentation.h +VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/ssim.c +VP9_CX_SRCS-yes += encoder/tokenize.c +VP9_CX_SRCS-yes += encoder/treewriter.c +VP9_CX_SRCS-yes += encoder/variance_c.c +ifeq ($(CONFIG_POSTPROC),yes) +VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.h +VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.c +endif +VP9_CX_SRCS-yes += encoder/temporal_filter.c +VP9_CX_SRCS-yes += encoder/temporal_filter.h +VP9_CX_SRCS-yes += encoder/mbgraph.c +VP9_CX_SRCS-yes += encoder/mbgraph.h + + +VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/mcomp_x86.h +VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_x86.h +VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/temporal_filter_x86.h +VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/x86_csystemdependent.c +VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/variance_mmx.c +VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/variance_impl_mmx.asm +VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/sad_mmx.asm +VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/dct_mmx.asm +VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/subtract_mmx.asm +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/variance_sse2.c +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/variance_impl_sse2.asm +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/sad_sse2.asm 
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm +VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm +VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/sad_ssse3.asm +VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_ssse3.c +VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_impl_ssse3.asm +VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm +VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/sad_sse4.asm +VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.asm +VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm +VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm +VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/ssim_opt.asm + + +VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes)) ++ ++$(eval $(call asm_offsets_template,\ ++ vp9_asm_enc_offsets.asm, $(VP9_PREFIX)encoder/asm_enc_offsets.c)) diff --cc vp9/vp9dx.mk index dc9f2d390,000000000..75f6bfd6e mode 100644,000000..100644 --- a/vp9/vp9dx.mk +++ b/vp9/vp9dx.mk @@@ -1,71 -1,0 +1,71 @@@ +## +## Copyright (c) 2010 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + - - include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9_common.mk - +VP9_DX_EXPORTS += exports_dec + +VP9_DX_SRCS-yes += $(VP9_COMMON_SRCS-yes) +VP9_DX_SRCS-no += $(VP9_COMMON_SRCS-no) +VP9_DX_SRCS_REMOVE-yes += $(VP9_COMMON_SRCS_REMOVE-yes) +VP9_DX_SRCS_REMOVE-no += $(VP9_COMMON_SRCS_REMOVE-no) + +ifeq ($(ARCH_ARM),yes) + include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9dx_arm.mk +endif + +VP9_DX_SRCS-yes += vp9_dx_iface.c + +# common +#define ARM +#define DISABLE_THREAD + +#INCLUDES += algo/vpx_common/vpx_mem/include +#INCLUDES += common +#INCLUDES += common +#INCLUDES += common +#INCLUDES += common +#INCLUDES += decoder + + + +# decoder +#define ARM +#define DISABLE_THREAD + +#INCLUDES += algo/vpx_common/vpx_mem/include +#INCLUDES += common +#INCLUDES += common +#INCLUDES += common +#INCLUDES += common +#INCLUDES += decoder + +VP9_DX_SRCS-yes += decoder/asm_dec_offsets.c +VP9_DX_SRCS-yes += decoder/dboolhuff.c +VP9_DX_SRCS-yes += decoder/decodemv.c +VP9_DX_SRCS-yes += decoder/decodframe.c +VP9_DX_SRCS-yes += decoder/dequantize.c +VP9_DX_SRCS-yes += decoder/detokenize.c +VP9_DX_SRCS-yes += decoder/dboolhuff.h +VP9_DX_SRCS-yes += decoder/decodemv.h +VP9_DX_SRCS-yes += decoder/dequantize.h +VP9_DX_SRCS-yes += decoder/detokenize.h +VP9_DX_SRCS-yes += decoder/onyxd_int.h +VP9_DX_SRCS-yes += decoder/treereader.h +VP9_DX_SRCS-yes += decoder/onyxd_if.c +VP9_DX_SRCS-yes += decoder/idct_blk.c + +VP9_DX_SRCS-yes := $(filter-out $(VP9_DX_SRCS_REMOVE-yes),$(VP9_DX_SRCS-yes)) + +VP9_DX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += decoder/x86/x86_dsystemdependent.c +VP9_DX_SRCS-$(HAVE_MMX) += decoder/x86/dequantize_mmx.asm +VP9_DX_SRCS-$(HAVE_MMX) += decoder/x86/idct_blk_mmx.c +VP9_DX_SRCS-$(HAVE_SSE2) += decoder/x86/idct_blk_sse2.c ++ ++$(eval $(call asm_offsets_template,\ ++ vp9_asm_dec_offsets.asm, $(VP9_PREFIX)decoder/asm_dec_offsets.c)) diff --cc vpx_ports/arm_cpudetect.c index f36d46d51,8ff95a110..b23344858 --- a/vpx_ports/arm_cpudetect.c +++ 
b/vpx_ports/arm_cpudetect.c @@@ -35,24 -39,25 +35,26 @@@ int arm_cpu_caps(void) /* This function should actually be a no-op. There is no way to adjust any of * these because the RTCD tables do not exist: the functions are called * statically */ - int flags; - int mask; - if (!arm_cpu_env_flags(&flags)) - { - return flags; - } - mask = arm_cpu_env_mask(); + int flags; + int mask; + if (!arm_cpu_env_flags(&flags)) { + return flags; + } + mask = arm_cpu_env_mask(); #if HAVE_EDSP - flags |= HAS_EDSP; + flags |= HAS_EDSP; #endif /* HAVE_EDSP */ #if HAVE_MEDIA - flags |= HAS_MEDIA; + flags |= HAS_MEDIA; #endif /* HAVE_MEDIA */ #if HAVE_NEON - flags |= HAS_NEON; + flags |= HAS_NEON; #endif /* HAVE_NEON */ - return flags & mask; + return flags & mask; } #elif defined(_MSC_VER) /* end !CONFIG_RUNTIME_CPU_DETECT */ /*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/ #define WIN32_LEAN_AND_MEAN @@@ -109,29 -127,31 +111,31 @@@ return flags & mask #elif defined(__ANDROID__) /* end _MSC_VER */ #include <cpu-features.h> -int arm_cpu_caps(void) -{ - int flags; - int mask; - uint64_t features; - if (!arm_cpu_env_flags(&flags)) - { - return flags; - } - mask = arm_cpu_env_mask(); - features = android_getCpuFeatures(); +int arm_cpu_caps(void) { + int flags; + int mask; + uint64_t features; + if (!arm_cpu_env_flags(&flags)) { + return flags; + } + mask = arm_cpu_env_mask(); + features = android_getCpuFeatures(); #if HAVE_EDSP - flags |= HAS_EDSP; + flags |= HAS_EDSP; #endif /* HAVE_EDSP */ #if HAVE_MEDIA - flags |= HAS_MEDIA; + flags |= HAS_MEDIA; #endif /* HAVE_MEDIA */ #if HAVE_NEON - if (features & ANDROID_CPU_ARM_FEATURE_NEON) - flags |= HAS_NEON; + if (features & ANDROID_CPU_ARM_FEATURE_NEON) + flags |= HAS_NEON; #endif /* HAVE_NEON */ - return flags & mask; + return flags & mask; } #elif defined(__linux__) /* end __ANDROID__ */ #include <stdio.h> diff --cc vpx_ports/x86.h index f88377290,9dd8c4b59..f1cf6265e --- a/vpx_ports/x86.h +++ b/vpx_ports/x86.h @@@ -185,25 -188,29 +185,38 @@@ x86_readtsc(void) #if defined(__GNUC__) && __GNUC__ static void -x87_set_control_word(unsigned short mode) -{ - __asm__ __volatile__("fldcw %0" : : "m"(*&mode)); +x87_set_control_word(unsigned short mode) { + __asm__ __volatile__("fldcw %0" : : "m"( *&mode)); } static unsigned short -x87_get_control_word(void) -{ - unsigned short mode; - __asm__ __volatile__("fstcw %0\n\t":"=m"(*&mode):); +x87_get_control_word(void) { + unsigned short mode; + __asm__ __volatile__("fstcw %0\n\t":"=m"( *&mode):); + return mode; + } + #elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) + static void + x87_set_control_word(unsigned short mode) + { + asm volatile("fldcw %0" : : "m"(*&mode)); + } + static unsigned short + x87_get_control_word(void) + { + unsigned short mode; + asm volatile("fstcw %0\n\t":"=m"(*&mode):); - return mode; + return mode; +} +#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) +static void +x87_set_control_word(unsigned short mode) { + asm volatile("fldcw %0" : : "m"( *&mode)); +} +static unsigned short +x87_get_control_word(void) { + unsigned short mode; + asm volatile("fstcw %0\n\t":"=m"( *&mode):); + return mode; } #elif ARCH_X86_64 /* No fldcw intrinsics on Windows x64, punt to external asm */ diff --cc vpxdec.c index 4e598298f,9b728bf82..44a80e3df --- a/vpxdec.c +++ b/vpxdec.c @@@ -22,7 -22,7 +22,7 @@@ #include "vpx_config.h" #include "vpx/vpx_decoder.h" #include "vpx_ports/vpx_timer.h" - #if CONFIG_VP9_DECODER -#if 
CONFIG_VP8_DECODER ++#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER #include "vpx/vp8dx.h" #endif #if CONFIG_MD5 @@@ -49,14 -49,16 +49,17 @@@ static const char *exec_name; #define VP8_FOURCC (0x00385056) -static const struct -{ - char const *name; - vpx_codec_iface_t *iface; - unsigned int fourcc; - unsigned int fourcc_mask; -} ifaces[] = -{ +static const struct { + char const *name; + const vpx_codec_iface_t *(*iface)(void); + unsigned int fourcc; + unsigned int fourcc_mask; +} ifaces[] = { + #if CONFIG_VP8_DECODER - {"vp8", &vpx_codec_vp8_dx_algo, VP8_FOURCC, 0x00FFFFFF}, ++ {"vp8", vpx_codec_vp8_dx, VP8_FOURCC, 0x00FFFFFF}, ++#endif +#if CONFIG_VP9_DECODER + {"vp9", vpx_codec_vp8_dx, VP8_FOURCC, 0x00FFFFFF}, #endif }; @@@ -95,121 -95,129 +98,121 @@@ static const arg_def_t error_concealmen static const arg_def_t md5arg = ARG_DEF(NULL, "md5", 0, "Compute the MD5 sum of the decoded frame"); #endif -static const arg_def_t *all_args[] = -{ - &codecarg, &use_yv12, &use_i420, &flipuvarg, &noblitarg, - &progressarg, &limitarg, &postprocarg, &summaryarg, &outputfile, - &threadsarg, &verbosearg, +static const arg_def_t *all_args[] = { + &codecarg, &use_yv12, &use_i420, &flipuvarg, &noblitarg, + &progressarg, &limitarg, &skiparg, &postprocarg, &summaryarg, &outputfile, + &threadsarg, &verbosearg, #if CONFIG_MD5 - &md5arg, + &md5arg, #endif - &error_concealment, - NULL + &error_concealment, + NULL }; - #if CONFIG_VP9_DECODER -#if CONFIG_VP8_DECODER ++#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER static const arg_def_t addnoise_level = ARG_DEF(NULL, "noise-level", 1, - "Enable VP8 postproc add noise"); + "Enable VP8 postproc add noise"); static const arg_def_t deblock = ARG_DEF(NULL, "deblock", 0, - "Enable VP8 deblocking"); + "Enable VP8 deblocking"); static const arg_def_t demacroblock_level = ARG_DEF(NULL, "demacroblock-level", 1, - "Enable VP8 demacroblocking, w/ level"); + "Enable VP8 demacroblocking, w/ level"); static const arg_def_t pp_debug_info = ARG_DEF(NULL, "pp-debug-info", 1, - "Enable VP8 visible debug info"); + "Enable VP8 visible debug info"); static const arg_def_t pp_disp_ref_frame = ARG_DEF(NULL, "pp-dbg-ref-frame", 1, - "Display only selected reference frame per macro block"); + "Display only selected reference frame per macro block"); static const arg_def_t pp_disp_mb_modes = ARG_DEF(NULL, "pp-dbg-mb-modes", 1, - "Display only selected macro block modes"); + "Display only selected macro block modes"); static const arg_def_t pp_disp_b_modes = ARG_DEF(NULL, "pp-dbg-b-modes", 1, - "Display only selected block modes"); + "Display only selected block modes"); static const arg_def_t pp_disp_mvs = ARG_DEF(NULL, "pp-dbg-mvs", 1, - "Draw only selected motion vectors"); + "Draw only selected motion vectors"); static const arg_def_t mfqe = ARG_DEF(NULL, "mfqe", 0, - "Enable multiframe quality enhancement"); + "Enable multiframe quality enhancement"); -static const arg_def_t *vp8_pp_args[] = -{ - &addnoise_level, &deblock, &demacroblock_level, &pp_debug_info, - &pp_disp_ref_frame, &pp_disp_mb_modes, &pp_disp_b_modes, &pp_disp_mvs, &mfqe, - NULL +static const arg_def_t *vp8_pp_args[] = { + &addnoise_level, &deblock, &demacroblock_level, &pp_debug_info, + &pp_disp_ref_frame, &pp_disp_mb_modes, &pp_disp_b_modes, &pp_disp_mvs, &mfqe, + NULL }; #endif -static void usage_exit() -{ - int i; +static void usage_exit() { + int i; - fprintf(stderr, "Usage: %s filename\n\n" - "Options:\n", exec_name); - arg_show_usage(stderr, all_args); -#if CONFIG_VP8_DECODER - fprintf(stderr, "\nVP8 Postprocessing 
Options:\n"); - arg_show_usage(stderr, vp8_pp_args); + fprintf(stderr, "Usage: %s filename\n\n" + "Options:\n", exec_name); + arg_show_usage(stderr, all_args); - #if CONFIG_VP9_DECODER ++#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER + fprintf(stderr, "\nVP8 Postprocessing Options:\n"); + arg_show_usage(stderr, vp8_pp_args); #endif - fprintf(stderr, - "\nOutput File Patterns:\n\n" - " The -o argument specifies the name of the file(s) to " - "write to. If the\n argument does not include any escape " - "characters, the output will be\n written to a single file. " - "Otherwise, the filename will be calculated by\n expanding " - "the following escape characters:\n"); - fprintf(stderr, - "\n\t%%w - Frame width" - "\n\t%%h - Frame height" - "\n\t%% - Frame number, zero padded to places (1..9)" - "\n\n Pattern arguments are only supported in conjunction " - "with the --yv12 and\n --i420 options. If the -o option is " - "not specified, the output will be\n directed to stdout.\n" - ); - fprintf(stderr, "\nIncluded decoders:\n\n"); - - for (i = 0; i < sizeof(ifaces) / sizeof(ifaces[0]); i++) - fprintf(stderr, " %-6s - %s\n", - ifaces[i].name, - vpx_codec_iface_name(ifaces[i].iface)); - - exit(EXIT_FAILURE); + fprintf(stderr, + "\nOutput File Patterns:\n\n" + " The -o argument specifies the name of the file(s) to " + "write to. If the\n argument does not include any escape " + "characters, the output will be\n written to a single file. " + "Otherwise, the filename will be calculated by\n expanding " + "the following escape characters:\n"); + fprintf(stderr, + "\n\t%%w - Frame width" + "\n\t%%h - Frame height" + "\n\t%% - Frame number, zero padded to places (1..9)" + "\n\n Pattern arguments are only supported in conjunction " + "with the --yv12 and\n --i420 options. If the -o option is " + "not specified, the output will be\n directed to stdout.\n" + ); + fprintf(stderr, "\nIncluded decoders:\n\n"); + + for (i = 0; i < sizeof(ifaces) / sizeof(ifaces[0]); i++) + fprintf(stderr, " %-6s - %s\n", + ifaces[i].name, + vpx_codec_iface_name(ifaces[i].iface())); + + exit(EXIT_FAILURE); } -void die(const char *fmt, ...) -{ - va_list ap; - va_start(ap, fmt); - vfprintf(stderr, fmt, ap); - fprintf(stderr, "\n"); - usage_exit(); +void die(const char *fmt, ...) 
{ + va_list ap; + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + fprintf(stderr, "\n"); + usage_exit(); } -static unsigned int mem_get_le16(const void *vmem) -{ - unsigned int val; - const unsigned char *mem = (const unsigned char *)vmem; +static unsigned int mem_get_le16(const void *vmem) { + unsigned int val; + const unsigned char *mem = (const unsigned char *)vmem; - val = mem[1] << 8; - val |= mem[0]; - return val; + val = mem[1] << 8; + val |= mem[0]; + return val; } -static unsigned int mem_get_le32(const void *vmem) -{ - unsigned int val; - const unsigned char *mem = (const unsigned char *)vmem; +static unsigned int mem_get_le32(const void *vmem) { + unsigned int val; + const unsigned char *mem = (const unsigned char *)vmem; - val = mem[3] << 24; - val |= mem[2] << 16; - val |= mem[1] << 8; - val |= mem[0]; - return val; + val = mem[3] << 24; + val |= mem[2] << 16; + val |= mem[1] << 8; + val |= mem[0]; + return val; } -enum file_kind -{ - RAW_FILE, - IVF_FILE, - WEBM_FILE +enum file_kind { + RAW_FILE, + IVF_FILE, + WEBM_FILE }; -struct input_ctx -{ - enum file_kind kind; - FILE *infile; - nestegg *nestegg_ctx; - nestegg_packet *pkt; - unsigned int chunk; - unsigned int chunks; - unsigned int video_track; +struct input_ctx { + enum file_kind kind; + FILE *infile; + nestegg *nestegg_ctx; + nestegg_packet *pkt; + unsigned int chunk; + unsigned int chunks; + unsigned int video_track; }; #define IVF_FRAME_HDR_SZ (sizeof(uint32_t) + sizeof(uint64_t)) @@@ -663,401 -697,447 +666,401 @@@ void generate_filename(const char *patt } -int main(int argc, const char **argv_) -{ - vpx_codec_ctx_t decoder; - char *fn = NULL; - int i; - uint8_t *buf = NULL; - size_t buf_sz = 0, buf_alloc_sz = 0; - FILE *infile; - int frame_in = 0, frame_out = 0, flipuv = 0, noblit = 0, do_md5 = 0, progress = 0; - int stop_after = 0, postproc = 0, summary = 0, quiet = 1; - int ec_enabled = 0; - vpx_codec_iface_t *iface = NULL; - unsigned int fourcc; - unsigned long dx_time = 0; - struct arg arg; - char **argv, **argi, **argj; - const char *outfile_pattern = 0; - char outfile[PATH_MAX]; - int single_file; - int use_y4m = 1; - unsigned int width; - unsigned int height; - unsigned int fps_den; - unsigned int fps_num; - void *out = NULL; - vpx_codec_dec_cfg_t cfg = {0}; -#if CONFIG_VP8_DECODER - vp8_postproc_cfg_t vp8_pp_cfg = {0}; - int vp8_dbg_color_ref_frame = 0; - int vp8_dbg_color_mb_modes = 0; - int vp8_dbg_color_b_modes = 0; - int vp8_dbg_display_mv = 0; +int main(int argc, const char **argv_) { + vpx_codec_ctx_t decoder; + char *fn = NULL; + int i; + uint8_t *buf = NULL; + size_t buf_sz = 0, buf_alloc_sz = 0; + FILE *infile; + int frame_in = 0, frame_out = 0, flipuv = 0, noblit = 0, do_md5 = 0, progress = 0; + int stop_after = 0, postproc = 0, summary = 0, quiet = 1; + int arg_skip = 0; + int ec_enabled = 0; + vpx_codec_iface_t *iface = NULL; + unsigned int fourcc; + unsigned long dx_time = 0; + struct arg arg; + char **argv, **argi, **argj; + const char *outfile_pattern = 0; + char outfile[PATH_MAX]; + int single_file; + int use_y4m = 1; + unsigned int width; + unsigned int height; + unsigned int fps_den; + unsigned int fps_num; + void *out = NULL; + vpx_codec_dec_cfg_t cfg = {0}; - #if CONFIG_VP9_DECODER ++#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER + vp8_postproc_cfg_t vp8_pp_cfg = {0}; + int vp8_dbg_color_ref_frame = 0; + int vp8_dbg_color_mb_modes = 0; + int vp8_dbg_color_b_modes = 0; + int vp8_dbg_display_mv = 0; #endif - struct input_ctx input = {0}; - int frames_corrupted = 0; - int dec_flags = 0; - - 
/* Parse command line */ - exec_name = argv_[0]; - argv = argv_dup(argc - 1, argv_ + 1); - - for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) - { - memset(&arg, 0, sizeof(arg)); - arg.argv_step = 1; - - if (arg_match(&arg, &codecarg, argi)) - { - int j, k = -1; - - for (j = 0; j < sizeof(ifaces) / sizeof(ifaces[0]); j++) - if (!strcmp(ifaces[j].name, arg.val)) - k = j; - - if (k >= 0) - iface = ifaces[k].iface; - else - die("Error: Unrecognized argument (%s) to --codec\n", - arg.val); - } - else if (arg_match(&arg, &outputfile, argi)) - outfile_pattern = arg.val; - else if (arg_match(&arg, &use_yv12, argi)) - { - use_y4m = 0; - flipuv = 1; - } - else if (arg_match(&arg, &use_i420, argi)) - { - use_y4m = 0; - flipuv = 0; - } - else if (arg_match(&arg, &flipuvarg, argi)) - flipuv = 1; - else if (arg_match(&arg, &noblitarg, argi)) - noblit = 1; - else if (arg_match(&arg, &progressarg, argi)) - progress = 1; - else if (arg_match(&arg, &limitarg, argi)) - stop_after = arg_parse_uint(&arg); - else if (arg_match(&arg, &postprocarg, argi)) - postproc = 1; - else if (arg_match(&arg, &md5arg, argi)) - do_md5 = 1; - else if (arg_match(&arg, &summaryarg, argi)) - summary = 1; - else if (arg_match(&arg, &threadsarg, argi)) - cfg.threads = arg_parse_uint(&arg); - else if (arg_match(&arg, &verbosearg, argi)) - quiet = 0; - -#if CONFIG_VP8_DECODER - else if (arg_match(&arg, &addnoise_level, argi)) - { - postproc = 1; - vp8_pp_cfg.post_proc_flag |= VP8_ADDNOISE; - vp8_pp_cfg.noise_level = arg_parse_uint(&arg); - } - else if (arg_match(&arg, &demacroblock_level, argi)) - { - postproc = 1; - vp8_pp_cfg.post_proc_flag |= VP8_DEMACROBLOCK; - vp8_pp_cfg.deblocking_level = arg_parse_uint(&arg); - } - else if (arg_match(&arg, &deblock, argi)) - { - postproc = 1; - vp8_pp_cfg.post_proc_flag |= VP8_DEBLOCK; - } - else if (arg_match(&arg, &mfqe, argi)) - { - postproc = 1; - vp8_pp_cfg.post_proc_flag |= VP8_MFQE; - } - else if (arg_match(&arg, &pp_debug_info, argi)) - { - unsigned int level = arg_parse_uint(&arg); - - postproc = 1; - vp8_pp_cfg.post_proc_flag &= ~0x7; - - if (level) - vp8_pp_cfg.post_proc_flag |= level; - } - else if (arg_match(&arg, &pp_disp_ref_frame, argi)) - { - unsigned int flags = arg_parse_int(&arg); - if (flags) - { - postproc = 1; - vp8_dbg_color_ref_frame = flags; - } - } - else if (arg_match(&arg, &pp_disp_mb_modes, argi)) - { - unsigned int flags = arg_parse_int(&arg); - if (flags) - { - postproc = 1; - vp8_dbg_color_mb_modes = flags; - } - } - else if (arg_match(&arg, &pp_disp_b_modes, argi)) - { - unsigned int flags = arg_parse_int(&arg); - if (flags) - { - postproc = 1; - vp8_dbg_color_b_modes = flags; - } - } - else if (arg_match(&arg, &pp_disp_mvs, argi)) - { - unsigned int flags = arg_parse_int(&arg); - if (flags) - { - postproc = 1; - vp8_dbg_display_mv = flags; - } - } - else if (arg_match(&arg, &error_concealment, argi)) - { - ec_enabled = 1; - } + struct input_ctx input = {0}; + int frames_corrupted = 0; + int dec_flags = 0; + + /* Parse command line */ + exec_name = argv_[0]; + argv = argv_dup(argc - 1, argv_ + 1); + + for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { + memset(&arg, 0, sizeof(arg)); + arg.argv_step = 1; + + if (arg_match(&arg, &codecarg, argi)) { + int j, k = -1; + + for (j = 0; j < sizeof(ifaces) / sizeof(ifaces[0]); j++) + if (!strcmp(ifaces[j].name, arg.val)) + k = j; + + if (k >= 0) + iface = ifaces[k].iface(); + else + die("Error: Unrecognized argument (%s) to --codec\n", + arg.val); + } else if (arg_match(&arg, 
&outputfile, argi)) + outfile_pattern = arg.val; + else if (arg_match(&arg, &use_yv12, argi)) { + use_y4m = 0; + flipuv = 1; + } else if (arg_match(&arg, &use_i420, argi)) { + use_y4m = 0; + flipuv = 0; + } else if (arg_match(&arg, &flipuvarg, argi)) + flipuv = 1; + else if (arg_match(&arg, &noblitarg, argi)) + noblit = 1; + else if (arg_match(&arg, &progressarg, argi)) + progress = 1; + else if (arg_match(&arg, &limitarg, argi)) + stop_after = arg_parse_uint(&arg); + else if (arg_match(&arg, &skiparg, argi)) + arg_skip = arg_parse_uint(&arg); + else if (arg_match(&arg, &postprocarg, argi)) + postproc = 1; + else if (arg_match(&arg, &md5arg, argi)) + do_md5 = 1; + else if (arg_match(&arg, &summaryarg, argi)) + summary = 1; + else if (arg_match(&arg, &threadsarg, argi)) + cfg.threads = arg_parse_uint(&arg); + else if (arg_match(&arg, &verbosearg, argi)) + quiet = 0; + - #if CONFIG_VP9_DECODER ++#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER + else if (arg_match(&arg, &addnoise_level, argi)) { + postproc = 1; + vp8_pp_cfg.post_proc_flag |= VP8_ADDNOISE; + vp8_pp_cfg.noise_level = arg_parse_uint(&arg); + } else if (arg_match(&arg, &demacroblock_level, argi)) { + postproc = 1; + vp8_pp_cfg.post_proc_flag |= VP8_DEMACROBLOCK; + vp8_pp_cfg.deblocking_level = arg_parse_uint(&arg); + } else if (arg_match(&arg, &deblock, argi)) { + postproc = 1; + vp8_pp_cfg.post_proc_flag |= VP8_DEBLOCK; + } else if (arg_match(&arg, &mfqe, argi)) { + postproc = 1; + vp8_pp_cfg.post_proc_flag |= VP8_MFQE; + } else if (arg_match(&arg, &pp_debug_info, argi)) { + unsigned int level = arg_parse_uint(&arg); + + postproc = 1; + vp8_pp_cfg.post_proc_flag &= ~0x7; + + if (level) + vp8_pp_cfg.post_proc_flag |= level; + } else if (arg_match(&arg, &pp_disp_ref_frame, argi)) { + unsigned int flags = arg_parse_int(&arg); + if (flags) { + postproc = 1; + vp8_dbg_color_ref_frame = flags; + } + } else if (arg_match(&arg, &pp_disp_mb_modes, argi)) { + unsigned int flags = arg_parse_int(&arg); + if (flags) { + postproc = 1; + vp8_dbg_color_mb_modes = flags; + } + } else if (arg_match(&arg, &pp_disp_b_modes, argi)) { + unsigned int flags = arg_parse_int(&arg); + if (flags) { + postproc = 1; + vp8_dbg_color_b_modes = flags; + } + } else if (arg_match(&arg, &pp_disp_mvs, argi)) { + unsigned int flags = arg_parse_int(&arg); + if (flags) { + postproc = 1; + vp8_dbg_display_mv = flags; + } + } else if (arg_match(&arg, &error_concealment, argi)) { + ec_enabled = 1; + } #endif - else - argj++; - } + else + argj++; + } - /* Check for unrecognized options */ - for (argi = argv; *argi; argi++) - if (argi[0][0] == '-' && strlen(argi[0]) > 1) - die("Error: Unrecognized option %s\n", *argi); + /* Check for unrecognized options */ + for (argi = argv; *argi; argi++) + if (argi[0][0] == '-' && strlen(argi[0]) > 1) + die("Error: Unrecognized option %s\n", *argi); - /* Handle non-option arguments */ - fn = argv[0]; + /* Handle non-option arguments */ + fn = argv[0]; - if (!fn) - usage_exit(); + if (!fn) + usage_exit(); - /* Open file */ - infile = strcmp(fn, "-") ? fopen(fn, "rb") : set_binary_mode(stdin); + /* Open file */ + infile = strcmp(fn, "-") ? fopen(fn, "rb") : set_binary_mode(stdin); - if (!infile) - { - fprintf(stderr, "Failed to open file '%s'", - strcmp(fn, "-") ? fn : "stdin"); - return EXIT_FAILURE; - } + if (!infile) { + fprintf(stderr, "Failed to open file '%s'", + strcmp(fn, "-") ? 
fn : "stdin"); + return EXIT_FAILURE; + } #if CONFIG_OS_SUPPORT - /* Make sure we don't dump to the terminal, unless forced to with -o - */ - if(!outfile_pattern && isatty(fileno(stdout)) && !do_md5 && !noblit) - { - fprintf(stderr, - "Not dumping raw video to your terminal. Use '-o -' to " - "override.\n"); - return EXIT_FAILURE; - } + /* Make sure we don't dump to the terminal, unless forced to with -o - */ + if (!outfile_pattern && isatty(fileno(stdout)) && !do_md5 && !noblit) { + fprintf(stderr, + "Not dumping raw video to your terminal. Use '-o -' to " + "override.\n"); + return EXIT_FAILURE; + } #endif - input.infile = infile; - if(file_is_ivf(infile, &fourcc, &width, &height, &fps_den, - &fps_num)) - input.kind = IVF_FILE; - else if(file_is_webm(&input, &fourcc, &width, &height, &fps_den, &fps_num)) - input.kind = WEBM_FILE; - else if(file_is_raw(infile, &fourcc, &width, &height, &fps_den, &fps_num)) - input.kind = RAW_FILE; - else - { - fprintf(stderr, "Unrecognized input file type.\n"); - return EXIT_FAILURE; + input.infile = infile; + if (file_is_ivf(infile, &fourcc, &width, &height, &fps_den, + &fps_num)) + input.kind = IVF_FILE; + else if (file_is_webm(&input, &fourcc, &width, &height, &fps_den, &fps_num)) + input.kind = WEBM_FILE; + else if (file_is_raw(infile, &fourcc, &width, &height, &fps_den, &fps_num)) + input.kind = RAW_FILE; + else { + fprintf(stderr, "Unrecognized input file type.\n"); + return EXIT_FAILURE; + } + + /* If the output file is not set or doesn't have a sequence number in + * it, then we only open it once. + */ + outfile_pattern = outfile_pattern ? outfile_pattern : "-"; + single_file = 1; + { + const char *p = outfile_pattern; + do { + p = strchr(p, '%'); + if (p && p[1] >= '1' && p[1] <= '9') { + /* pattern contains sequence number, so it's not unique. */ + single_file = 0; + break; + } + if (p) + p++; + } while (p); + } + + if (single_file && !noblit) { + generate_filename(outfile_pattern, outfile, sizeof(outfile) - 1, + width, height, 0); + out = out_open(outfile, do_md5); + } + + if (use_y4m && !noblit) { + char buffer[128]; + if (!single_file) { + fprintf(stderr, "YUV4MPEG2 not supported with output patterns," + " try --i420 or --yv12.\n"); + return EXIT_FAILURE; } - /* If the output file is not set or doesn't have a sequence number in - * it, then we only open it once. - */ - outfile_pattern = outfile_pattern ? outfile_pattern : "-"; - single_file = 1; - { - const char *p = outfile_pattern; - do - { - p = strchr(p, '%'); - if(p && p[1] >= '1' && p[1] <= '9') - { - /* pattern contains sequence number, so it's not unique. */ - single_file = 0; - break; - } - if(p) - p++; - } while(p); + if (input.kind == WEBM_FILE) + if (webm_guess_framerate(&input, &fps_den, &fps_num)) { + fprintf(stderr, "Failed to guess framerate -- error parsing " + "webm file?\n"); + return EXIT_FAILURE; + } + + + /*Note: We can't output an aspect ratio here because IVF doesn't + store one, and neither does VP8. + That will have to wait until these tools support WebM natively.*/ + sprintf(buffer, "YUV4MPEG2 C%s W%u H%u F%u:%u I%c\n", + "420jpeg", width, height, fps_num, fps_den, 'p'); + out_put(out, (unsigned char *)buffer, + (unsigned int)strlen(buffer), do_md5); + } + + /* Try to determine the codec from the fourcc. 
*/ + for (i = 0; i < sizeof(ifaces) / sizeof(ifaces[0]); i++) + if ((fourcc & ifaces[i].fourcc_mask) == ifaces[i].fourcc) { + vpx_codec_iface_t *ivf_iface = ifaces[i].iface(); + + if (iface && iface != ivf_iface) + fprintf(stderr, "Notice -- IVF header indicates codec: %s\n", + ifaces[i].name); + else + iface = ivf_iface; + + break; } + dec_flags = (postproc ? VPX_CODEC_USE_POSTPROC : 0) | + (ec_enabled ? VPX_CODEC_USE_ERROR_CONCEALMENT : 0); + if (vpx_codec_dec_init(&decoder, iface ? iface : ifaces[0].iface(), &cfg, + dec_flags)) { + fprintf(stderr, "Failed to initialize decoder: %s\n", vpx_codec_error(&decoder)); + return EXIT_FAILURE; + } + + if (!quiet) + fprintf(stderr, "%s\n", decoder.name); + - #if CONFIG_VP9_DECODER ++#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER + + if (vp8_pp_cfg.post_proc_flag + && vpx_codec_control(&decoder, VP8_SET_POSTPROC, &vp8_pp_cfg)) { + fprintf(stderr, "Failed to configure postproc: %s\n", vpx_codec_error(&decoder)); + return EXIT_FAILURE; + } + + if (vp8_dbg_color_ref_frame + && vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_REF_FRAME, vp8_dbg_color_ref_frame)) { + fprintf(stderr, "Failed to configure reference block visualizer: %s\n", vpx_codec_error(&decoder)); + return EXIT_FAILURE; + } + + if (vp8_dbg_color_mb_modes + && vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_MB_MODES, vp8_dbg_color_mb_modes)) { + fprintf(stderr, "Failed to configure macro block visualizer: %s\n", vpx_codec_error(&decoder)); + return EXIT_FAILURE; + } + + if (vp8_dbg_color_b_modes + && vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_B_MODES, vp8_dbg_color_b_modes)) { + fprintf(stderr, "Failed to configure block visualizer: %s\n", vpx_codec_error(&decoder)); + return EXIT_FAILURE; + } + + if (vp8_dbg_display_mv + && vpx_codec_control(&decoder, VP8_SET_DBG_DISPLAY_MV, vp8_dbg_display_mv)) { + fprintf(stderr, "Failed to configure motion vector visualizer: %s\n", vpx_codec_error(&decoder)); + return EXIT_FAILURE; + } +#endif - if (use_y4m && !noblit) - { - char buffer[128]; - if (!single_file) - { - fprintf(stderr, "YUV4MPEG2 not supported with output patterns," - " try --i420 or --yv12.\n"); - return EXIT_FAILURE; - } - if(input.kind == WEBM_FILE) - if(webm_guess_framerate(&input, &fps_den, &fps_num)) - { - fprintf(stderr, "Failed to guess framerate -- error parsing " - "webm file?\n"); - return EXIT_FAILURE; - } - - - /*Note: We can't output an aspect ratio here because IVF doesn't - store one, and neither does VP8. - That will have to wait until these tools support WebM natively.*/ - sprintf(buffer, "YUV4MPEG2 C%s W%u H%u F%u:%u I%c\n", - "420jpeg", width, height, fps_num, fps_den, 'p'); - out_put(out, (unsigned char *)buffer, - (unsigned int)strlen(buffer), do_md5); - } + if (arg_skip) + fprintf(stderr, "Skipping first %d frames.\n", arg_skip); + while (arg_skip) { + if (read_frame(&input, &buf, &buf_sz, &buf_alloc_sz)) + break; + arg_skip--; + } - /* Try to determine the codec from the fourcc. 
*/ - for (i = 0; i < sizeof(ifaces) / sizeof(ifaces[0]); i++) - if ((fourcc & ifaces[i].fourcc_mask) == ifaces[i].fourcc) - { - vpx_codec_iface_t *ivf_iface = ifaces[i].iface; + /* Decode file */ + while (!read_frame(&input, &buf, &buf_sz, &buf_alloc_sz)) { + vpx_codec_iter_t iter = NULL; + vpx_image_t *img; + struct vpx_usec_timer timer; + int corrupted; - if (iface && iface != ivf_iface) - fprintf(stderr, "Notice -- IVF header indicates codec: %s\n", - ifaces[i].name); - else - iface = ivf_iface; + vpx_usec_timer_start(&timer); - break; - } + if (vpx_codec_decode(&decoder, buf, (unsigned int)buf_sz, NULL, 0)) { + const char *detail = vpx_codec_error_detail(&decoder); + fprintf(stderr, "Failed to decode frame: %s\n", vpx_codec_error(&decoder)); - dec_flags = (postproc ? VPX_CODEC_USE_POSTPROC : 0) | - (ec_enabled ? VPX_CODEC_USE_ERROR_CONCEALMENT : 0); - if (vpx_codec_dec_init(&decoder, iface ? iface : ifaces[0].iface, &cfg, - dec_flags)) - { - fprintf(stderr, "Failed to initialize decoder: %s\n", vpx_codec_error(&decoder)); - return EXIT_FAILURE; + if (detail) + fprintf(stderr, " Additional information: %s\n", detail); + + goto fail; } - if (!quiet) - fprintf(stderr, "%s\n", decoder.name); + vpx_usec_timer_mark(&timer); + dx_time += (unsigned int)vpx_usec_timer_elapsed(&timer); -#if CONFIG_VP8_DECODER + ++frame_in; - if (vp8_pp_cfg.post_proc_flag - && vpx_codec_control(&decoder, VP8_SET_POSTPROC, &vp8_pp_cfg)) - { - fprintf(stderr, "Failed to configure postproc: %s\n", vpx_codec_error(&decoder)); - return EXIT_FAILURE; + if (vpx_codec_control(&decoder, VP8D_GET_FRAME_CORRUPTED, &corrupted)) { + fprintf(stderr, "Failed VP8_GET_FRAME_CORRUPTED: %s\n", + vpx_codec_error(&decoder)); + goto fail; } + frames_corrupted += corrupted; - if (vp8_dbg_color_ref_frame - && vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_REF_FRAME, vp8_dbg_color_ref_frame)) - { - fprintf(stderr, "Failed to configure reference block visualizer: %s\n", vpx_codec_error(&decoder)); - return EXIT_FAILURE; - } + vpx_usec_timer_start(&timer); - if (vp8_dbg_color_mb_modes - && vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_MB_MODES, vp8_dbg_color_mb_modes)) - { - fprintf(stderr, "Failed to configure macro block visualizer: %s\n", vpx_codec_error(&decoder)); - return EXIT_FAILURE; - } + if ((img = vpx_codec_get_frame(&decoder, &iter))) + ++frame_out; - if (vp8_dbg_color_b_modes - && vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_B_MODES, vp8_dbg_color_b_modes)) - { - fprintf(stderr, "Failed to configure block visualizer: %s\n", vpx_codec_error(&decoder)); - return EXIT_FAILURE; - } + vpx_usec_timer_mark(&timer); + dx_time += (unsigned int)vpx_usec_timer_elapsed(&timer); - if (vp8_dbg_display_mv - && vpx_codec_control(&decoder, VP8_SET_DBG_DISPLAY_MV, vp8_dbg_display_mv)) - { - fprintf(stderr, "Failed to configure motion vector visualizer: %s\n", vpx_codec_error(&decoder)); - return EXIT_FAILURE; - } -#endif + if (progress) + show_progress(frame_in, frame_out, dx_time); - /* Decode file */ - while (!read_frame(&input, &buf, &buf_sz, &buf_alloc_sz)) - { - vpx_codec_iter_t iter = NULL; - vpx_image_t *img; - struct vpx_usec_timer timer; - int corrupted; + if (!noblit) { + if (img) { + unsigned int y; + char out_fn[PATH_MAX]; + uint8_t *buf; - vpx_usec_timer_start(&timer); + if (!single_file) { + size_t len = sizeof(out_fn) - 1; - if (vpx_codec_decode(&decoder, buf, (unsigned int)buf_sz, NULL, 0)) - { - const char *detail = vpx_codec_error_detail(&decoder); - fprintf(stderr, "Failed to decode frame: %s\n", vpx_codec_error(&decoder)); + 
out_fn[len] = '\0'; + generate_filename(outfile_pattern, out_fn, len - 1, + img->d_w, img->d_h, frame_in); + out = out_open(out_fn, do_md5); + } else if (use_y4m) + out_put(out, (unsigned char *)"FRAME\n", 6, do_md5); - if (detail) - fprintf(stderr, " Additional information: %s\n", detail); + buf = img->planes[VPX_PLANE_Y]; - goto fail; + for (y = 0; y < img->d_h; y++) { + out_put(out, buf, img->d_w, do_md5); + buf += img->stride[VPX_PLANE_Y]; } - vpx_usec_timer_mark(&timer); - dx_time += (unsigned int)vpx_usec_timer_elapsed(&timer); + buf = img->planes[flipuv ? VPX_PLANE_V : VPX_PLANE_U]; - ++frame_in; - - if (vpx_codec_control(&decoder, VP8D_GET_FRAME_CORRUPTED, &corrupted)) - { - fprintf(stderr, "Failed VP8_GET_FRAME_CORRUPTED: %s\n", - vpx_codec_error(&decoder)); - goto fail; + for (y = 0; y < (1 + img->d_h) / 2; y++) { + out_put(out, buf, (1 + img->d_w) / 2, do_md5); + buf += img->stride[VPX_PLANE_U]; } - frames_corrupted += corrupted; - - vpx_usec_timer_start(&timer); - - if ((img = vpx_codec_get_frame(&decoder, &iter))) - ++frame_out; - - vpx_usec_timer_mark(&timer); - dx_time += (unsigned int)vpx_usec_timer_elapsed(&timer); - - if (progress) - show_progress(frame_in, frame_out, dx_time); - - if (!noblit) - { - if (img) - { - unsigned int y; - char out_fn[PATH_MAX]; - uint8_t *buf; - - if (!single_file) - { - size_t len = sizeof(out_fn)-1; - - out_fn[len] = '\0'; - generate_filename(outfile_pattern, out_fn, len-1, - img->d_w, img->d_h, frame_in); - out = out_open(out_fn, do_md5); - } - else if(use_y4m) - out_put(out, (unsigned char *)"FRAME\n", 6, do_md5); - - buf = img->planes[VPX_PLANE_Y]; - - for (y = 0; y < img->d_h; y++) - { - out_put(out, buf, img->d_w, do_md5); - buf += img->stride[VPX_PLANE_Y]; - } - - buf = img->planes[flipuv?VPX_PLANE_V:VPX_PLANE_U]; - - for (y = 0; y < (1 + img->d_h) / 2; y++) - { - out_put(out, buf, (1 + img->d_w) / 2, do_md5); - buf += img->stride[VPX_PLANE_U]; - } - - buf = img->planes[flipuv?VPX_PLANE_U:VPX_PLANE_V]; - - for (y = 0; y < (1 + img->d_h) / 2; y++) - { - out_put(out, buf, (1 + img->d_w) / 2, do_md5); - buf += img->stride[VPX_PLANE_V]; - } - - if (!single_file) - out_close(out, out_fn, do_md5); - } + + buf = img->planes[flipuv ? 
VPX_PLANE_U : VPX_PLANE_V]; + + for (y = 0; y < (1 + img->d_h) / 2; y++) { + out_put(out, buf, (1 + img->d_w) / 2, do_md5); + buf += img->stride[VPX_PLANE_V]; } - if (stop_after && frame_in >= stop_after) - break; + if (!single_file) + out_close(out, out_fn, do_md5); + } } - if (summary || progress) - { - show_progress(frame_in, frame_out, dx_time); - fprintf(stderr, "\n"); - } + if (stop_after && frame_in >= stop_after) + break; + } - if (frames_corrupted) - fprintf(stderr, "WARNING: %d frames corrupted.\n",frames_corrupted); + if (summary || progress) { + show_progress(frame_in, frame_out, dx_time); + fprintf(stderr, "\n"); + } + + if (frames_corrupted) + fprintf(stderr, "WARNING: %d frames corrupted.\n", frames_corrupted); fail: diff --cc vpxenc.c index 484eb16c3,7449e6c6c..6688231d7 --- a/vpxenc.c +++ b/vpxenc.c @@@ -31,14 -32,7 +31,14 @@@ #include #include #endif + - #if CONFIG_VP9_ENCODER ++#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER #include "vpx/vp8cx.h" +#endif - #if CONFIG_VP9_DECODER ++#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER +#include "vpx/vp8dx.h" +#endif + #include "vpx_ports/mem_ops.h" #include "vpx_ports/vpx_timer.h" #include "tools_common.h" @@@ -86,17 -82,15 +86,21 @@@ static size_t wrap_fwrite(const void *p static const char *exec_name; -static const struct codec_item -{ - char const *name; - vpx_codec_iface_t *iface; - unsigned int fourcc; -} codecs[] = -{ -#if CONFIG_VP8_ENCODER - {"vp8", &vpx_codec_vp8_cx_algo, 0x30385056}, +static const struct codec_item { + char const *name; + const vpx_codec_iface_t *(*iface)(void); + const vpx_codec_iface_t *(*dx_iface)(void); + unsigned int fourcc; +} codecs[] = { ++#if CONFIG_VP8_ENCODER && CONFIG_VP8_DECODER ++ {"vp8", &vpx_codec_vp8_cx, &vpx_codec_vp8_dx, 0x30385056}, ++#elif CONFIG_VP9_ENCODER && !CONFIG_VP9_DECODER ++ {"vp8", &vpx_codec_vp8_cx, NULL, 0x30385056}, ++#endif +#if CONFIG_VP9_ENCODER && CONFIG_VP9_DECODER + {"vp9", &vpx_codec_vp8_cx, &vpx_codec_vp8_dx, 0x30385056}, - #endif - #if CONFIG_VP9_ENCODER && !CONFIG_VP9_DECODER ++#elif CONFIG_VP9_ENCODER && !CONFIG_VP9_DECODER + {"vp9", &vpx_codec_vp8_cx, NULL, 0x30385056}, #endif }; @@@ -1054,104 -1113,96 +1058,104 @@@ static const arg_def_t *kf_args[] = }; - #if CONFIG_VP9_ENCODER -#if CONFIG_VP8_ENCODER ++#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER static const arg_def_t noise_sens = ARG_DEF(NULL, "noise-sensitivity", 1, - "Noise sensitivity (frames to blur)"); + "Noise sensitivity (frames to blur)"); static const arg_def_t sharpness = ARG_DEF(NULL, "sharpness", 1, - "Filter sharpness (0-7)"); + "Filter sharpness (0-7)"); static const arg_def_t static_thresh = ARG_DEF(NULL, "static-thresh", 1, - "Motion detection threshold"); + "Motion detection threshold"); #endif - #if CONFIG_VP9_ENCODER -#if CONFIG_VP8_ENCODER ++#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER static const arg_def_t cpu_used = ARG_DEF(NULL, "cpu-used", 1, - "CPU Used (-16..16)"); + "CPU Used (-16..16)"); #endif - #if CONFIG_VP9_ENCODER -#if CONFIG_VP8_ENCODER ++#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER static const arg_def_t token_parts = ARG_DEF(NULL, "token-parts", 1, - "Number of token partitions to use, log2"); + "Number of token partitions to use, log2"); static const arg_def_t auto_altref = ARG_DEF(NULL, "auto-alt-ref", 1, - "Enable automatic alt reference frames"); + "Enable automatic alt reference frames"); static const arg_def_t arnr_maxframes = ARG_DEF(NULL, "arnr-maxframes", 1, - "AltRef Max Frames"); + "AltRef Max Frames"); static const arg_def_t arnr_strength = ARG_DEF(NULL, 
"arnr-strength", 1, - "AltRef Strength"); + "AltRef Strength"); static const arg_def_t arnr_type = ARG_DEF(NULL, "arnr-type", 1, - "AltRef Type"); + "AltRef Type"); static const struct arg_enum_list tuning_enum[] = { - {"psnr", VP8_TUNE_PSNR}, - {"ssim", VP8_TUNE_SSIM}, - {NULL, 0} + {"psnr", VP8_TUNE_PSNR}, + {"ssim", VP8_TUNE_SSIM}, + {NULL, 0} }; static const arg_def_t tune_ssim = ARG_DEF_ENUM(NULL, "tune", 1, - "Material to favor", tuning_enum); + "Material to favor", tuning_enum); static const arg_def_t cq_level = ARG_DEF(NULL, "cq-level", 1, - "Constrained Quality Level"); + "Constrained Quality Level"); static const arg_def_t max_intra_rate_pct = ARG_DEF(NULL, "max-intra-rate", 1, - "Max I-frame bitrate (pct)"); + "Max I-frame bitrate (pct)"); +#if CONFIG_LOSSLESS +static const arg_def_t lossless = ARG_DEF(NULL, "lossless", 1, "Lossless mode"); +#endif -static const arg_def_t *vp8_args[] = -{ - &cpu_used, &auto_altref, &noise_sens, &sharpness, &static_thresh, - &token_parts, &arnr_maxframes, &arnr_strength, &arnr_type, - &tune_ssim, &cq_level, &max_intra_rate_pct, NULL +static const arg_def_t *vp8_args[] = { + &cpu_used, &auto_altref, &noise_sens, &sharpness, &static_thresh, + &token_parts, &arnr_maxframes, &arnr_strength, &arnr_type, + &tune_ssim, &cq_level, &max_intra_rate_pct, +#if CONFIG_LOSSLESS + &lossless, +#endif + NULL }; -static const int vp8_arg_ctrl_map[] = -{ - VP8E_SET_CPUUSED, VP8E_SET_ENABLEAUTOALTREF, - VP8E_SET_NOISE_SENSITIVITY, VP8E_SET_SHARPNESS, VP8E_SET_STATIC_THRESHOLD, - VP8E_SET_TOKEN_PARTITIONS, - VP8E_SET_ARNR_MAXFRAMES, VP8E_SET_ARNR_STRENGTH , VP8E_SET_ARNR_TYPE, - VP8E_SET_TUNING, VP8E_SET_CQ_LEVEL, VP8E_SET_MAX_INTRA_BITRATE_PCT, 0 +static const int vp8_arg_ctrl_map[] = { + VP8E_SET_CPUUSED, VP8E_SET_ENABLEAUTOALTREF, + VP8E_SET_NOISE_SENSITIVITY, VP8E_SET_SHARPNESS, VP8E_SET_STATIC_THRESHOLD, + VP8E_SET_TOKEN_PARTITIONS, + VP8E_SET_ARNR_MAXFRAMES, VP8E_SET_ARNR_STRENGTH, VP8E_SET_ARNR_TYPE, + VP8E_SET_TUNING, VP8E_SET_CQ_LEVEL, VP8E_SET_MAX_INTRA_BITRATE_PCT, +#if CONFIG_LOSSLESS + VP9E_SET_LOSSLESS, +#endif + 0 }; #endif static const arg_def_t *no_args[] = { NULL }; -static void usage_exit() -{ - int i; - - fprintf(stderr, "Usage: %s -o dst_filename src_filename \n", - exec_name); - - fprintf(stderr, "\nOptions:\n"); - arg_show_usage(stdout, main_args); - fprintf(stderr, "\nEncoder Global Options:\n"); - arg_show_usage(stdout, global_args); - fprintf(stderr, "\nRate Control Options:\n"); - arg_show_usage(stdout, rc_args); - fprintf(stderr, "\nTwopass Rate Control Options:\n"); - arg_show_usage(stdout, rc_twopass_args); - fprintf(stderr, "\nKeyframe Placement Options:\n"); - arg_show_usage(stdout, kf_args); -#if CONFIG_VP8_ENCODER - fprintf(stderr, "\nVP8 Specific Options:\n"); - arg_show_usage(stdout, vp8_args); +static void usage_exit() { + int i; + + fprintf(stderr, "Usage: %s -o dst_filename src_filename \n", + exec_name); + + fprintf(stderr, "\nOptions:\n"); + arg_show_usage(stdout, main_args); + fprintf(stderr, "\nEncoder Global Options:\n"); + arg_show_usage(stdout, global_args); + fprintf(stderr, "\nRate Control Options:\n"); + arg_show_usage(stdout, rc_args); + fprintf(stderr, "\nTwopass Rate Control Options:\n"); + arg_show_usage(stdout, rc_twopass_args); + fprintf(stderr, "\nKeyframe Placement Options:\n"); + arg_show_usage(stdout, kf_args); - #if CONFIG_VP9_ENCODER ++#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER + fprintf(stderr, "\nVP8 Specific Options:\n"); + arg_show_usage(stdout, vp8_args); #endif - fprintf(stderr, "\nStream timebase 
(--timebase):\n" - " The desired precision of timestamps in the output, expressed\n" - " in fractional seconds. Default is 1/1000.\n"); - fprintf(stderr, "\n" - "Included encoders:\n" - "\n"); - - for (i = 0; i < sizeof(codecs) / sizeof(codecs[0]); i++) - fprintf(stderr, " %-6s - %s\n", - codecs[i].name, - vpx_codec_iface_name(codecs[i].iface)); - - exit(EXIT_FAILURE); + fprintf(stderr, "\nStream timebase (--timebase):\n" + " The desired precision of timestamps in the output, expressed\n" + " in fractional seconds. Default is 1/1000.\n"); + fprintf(stderr, "\n" + "Included encoders:\n" + "\n"); + + for (i = 0; i < sizeof(codecs) / sizeof(codecs[0]); i++) + fprintf(stderr, " %-6s - %s\n", + codecs[i].name, + vpx_codec_iface_name(codecs[i].iface())); + + exit(EXIT_FAILURE); } @@@ -1725,139 -1790,159 +1729,139 @@@ static struct stream_state *new_stream( static int parse_stream_params(struct global_config *global, struct stream_state *stream, - char **argv) -{ - char **argi, **argj; - struct arg arg; - static const arg_def_t **ctrl_args = no_args; - static const int *ctrl_args_map = NULL; - struct stream_config *config = &stream->config; - int eos_mark_found = 0; - - /* Handle codec specific options */ - if (global->codec->iface == &vpx_codec_vp8_cx_algo) - { - ctrl_args = vp8_args; - ctrl_args_map = vp8_arg_ctrl_map; + char **argv) { + char **argi, **argj; + struct arg arg; + static const arg_def_t **ctrl_args = no_args; + static const int *ctrl_args_map = NULL; + struct stream_config *config = &stream->config; + int eos_mark_found = 0; + + /* Handle codec specific options */ - if (global->codec->iface == vpx_codec_vp8x_cx) { ++ if (global->codec->iface == vpx_codec_vp8_cx) { + ctrl_args = vp8_args; + ctrl_args_map = vp8_arg_ctrl_map; + } + + for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { + arg.argv_step = 1; + + /* Once we've found an end-of-stream marker (--) we want to continue + * shifting arguments but not consuming them. + */ + if (eos_mark_found) { + argj++; + continue; + } else if (!strcmp(*argj, "--")) { + eos_mark_found = 1; + continue; } - for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) - { - arg.argv_step = 1; - - /* Once we've found an end-of-stream marker (--) we want to continue - * shifting arguments but not consuming them. 
- */ - if (eos_mark_found) - { - argj++; - continue; - } - else if (!strcmp(*argj, "--")) - { - eos_mark_found = 1; - continue; - } - - if (0); - else if (arg_match(&arg, &outputfile, argi)) - config->out_fn = arg.val; - else if (arg_match(&arg, &fpf_name, argi)) - config->stats_fn = arg.val; - else if (arg_match(&arg, &use_ivf, argi)) - config->write_webm = 0; - else if (arg_match(&arg, &threads, argi)) - config->cfg.g_threads = arg_parse_uint(&arg); - else if (arg_match(&arg, &profile, argi)) - config->cfg.g_profile = arg_parse_uint(&arg); - else if (arg_match(&arg, &width, argi)) - config->cfg.g_w = arg_parse_uint(&arg); - else if (arg_match(&arg, &height, argi)) - config->cfg.g_h = arg_parse_uint(&arg); - else if (arg_match(&arg, &stereo_mode, argi)) - config->stereo_fmt = arg_parse_enum_or_int(&arg); - else if (arg_match(&arg, &timebase, argi)) - { - config->cfg.g_timebase = arg_parse_rational(&arg); - validate_positive_rational(arg.name, &config->cfg.g_timebase); - } - else if (arg_match(&arg, &error_resilient, argi)) - config->cfg.g_error_resilient = arg_parse_uint(&arg); - else if (arg_match(&arg, &lag_in_frames, argi)) - config->cfg.g_lag_in_frames = arg_parse_uint(&arg); - else if (arg_match(&arg, &dropframe_thresh, argi)) - config->cfg.rc_dropframe_thresh = arg_parse_uint(&arg); - else if (arg_match(&arg, &resize_allowed, argi)) - config->cfg.rc_resize_allowed = arg_parse_uint(&arg); - else if (arg_match(&arg, &resize_up_thresh, argi)) - config->cfg.rc_resize_up_thresh = arg_parse_uint(&arg); - else if (arg_match(&arg, &resize_down_thresh, argi)) - config->cfg.rc_resize_down_thresh = arg_parse_uint(&arg); - else if (arg_match(&arg, &end_usage, argi)) - config->cfg.rc_end_usage = arg_parse_enum_or_int(&arg); - else if (arg_match(&arg, &target_bitrate, argi)) - config->cfg.rc_target_bitrate = arg_parse_uint(&arg); - else if (arg_match(&arg, &min_quantizer, argi)) - config->cfg.rc_min_quantizer = arg_parse_uint(&arg); - else if (arg_match(&arg, &max_quantizer, argi)) - config->cfg.rc_max_quantizer = arg_parse_uint(&arg); - else if (arg_match(&arg, &undershoot_pct, argi)) - config->cfg.rc_undershoot_pct = arg_parse_uint(&arg); - else if (arg_match(&arg, &overshoot_pct, argi)) - config->cfg.rc_overshoot_pct = arg_parse_uint(&arg); - else if (arg_match(&arg, &buf_sz, argi)) - config->cfg.rc_buf_sz = arg_parse_uint(&arg); - else if (arg_match(&arg, &buf_initial_sz, argi)) - config->cfg.rc_buf_initial_sz = arg_parse_uint(&arg); - else if (arg_match(&arg, &buf_optimal_sz, argi)) - config->cfg.rc_buf_optimal_sz = arg_parse_uint(&arg); - else if (arg_match(&arg, &bias_pct, argi)) - { - config->cfg.rc_2pass_vbr_bias_pct = arg_parse_uint(&arg); - - if (global->passes < 2) - warn("option %s ignored in one-pass mode.\n", arg.name); - } - else if (arg_match(&arg, &minsection_pct, argi)) - { - config->cfg.rc_2pass_vbr_minsection_pct = arg_parse_uint(&arg); - - if (global->passes < 2) - warn("option %s ignored in one-pass mode.\n", arg.name); - } - else if (arg_match(&arg, &maxsection_pct, argi)) - { - config->cfg.rc_2pass_vbr_maxsection_pct = arg_parse_uint(&arg); + if (0); + else if (arg_match(&arg, &outputfile, argi)) + config->out_fn = arg.val; + else if (arg_match(&arg, &fpf_name, argi)) + config->stats_fn = arg.val; + else if (arg_match(&arg, &use_ivf, argi)) + config->write_webm = 0; + else if (arg_match(&arg, &threads, argi)) + config->cfg.g_threads = arg_parse_uint(&arg); + else if (arg_match(&arg, &profile, argi)) + config->cfg.g_profile = arg_parse_uint(&arg); + else if 
(arg_match(&arg, &width, argi)) + config->cfg.g_w = arg_parse_uint(&arg); + else if (arg_match(&arg, &height, argi)) + config->cfg.g_h = arg_parse_uint(&arg); + else if (arg_match(&arg, &stereo_mode, argi)) + config->stereo_fmt = arg_parse_enum_or_int(&arg); + else if (arg_match(&arg, &timebase, argi)) { + config->cfg.g_timebase = arg_parse_rational(&arg); + validate_positive_rational(arg.name, &config->cfg.g_timebase); + } else if (arg_match(&arg, &error_resilient, argi)) + config->cfg.g_error_resilient = arg_parse_uint(&arg); + else if (arg_match(&arg, &lag_in_frames, argi)) + config->cfg.g_lag_in_frames = arg_parse_uint(&arg); + else if (arg_match(&arg, &dropframe_thresh, argi)) + config->cfg.rc_dropframe_thresh = arg_parse_uint(&arg); + else if (arg_match(&arg, &resize_allowed, argi)) + config->cfg.rc_resize_allowed = arg_parse_uint(&arg); + else if (arg_match(&arg, &resize_up_thresh, argi)) + config->cfg.rc_resize_up_thresh = arg_parse_uint(&arg); + else if (arg_match(&arg, &resize_down_thresh, argi)) + config->cfg.rc_resize_down_thresh = arg_parse_uint(&arg); + else if (arg_match(&arg, &end_usage, argi)) + config->cfg.rc_end_usage = arg_parse_enum_or_int(&arg); + else if (arg_match(&arg, &target_bitrate, argi)) + config->cfg.rc_target_bitrate = arg_parse_uint(&arg); + else if (arg_match(&arg, &min_quantizer, argi)) + config->cfg.rc_min_quantizer = arg_parse_uint(&arg); + else if (arg_match(&arg, &max_quantizer, argi)) + config->cfg.rc_max_quantizer = arg_parse_uint(&arg); + else if (arg_match(&arg, &undershoot_pct, argi)) + config->cfg.rc_undershoot_pct = arg_parse_uint(&arg); + else if (arg_match(&arg, &overshoot_pct, argi)) + config->cfg.rc_overshoot_pct = arg_parse_uint(&arg); + else if (arg_match(&arg, &buf_sz, argi)) + config->cfg.rc_buf_sz = arg_parse_uint(&arg); + else if (arg_match(&arg, &buf_initial_sz, argi)) + config->cfg.rc_buf_initial_sz = arg_parse_uint(&arg); + else if (arg_match(&arg, &buf_optimal_sz, argi)) + config->cfg.rc_buf_optimal_sz = arg_parse_uint(&arg); + else if (arg_match(&arg, &bias_pct, argi)) { + config->cfg.rc_2pass_vbr_bias_pct = arg_parse_uint(&arg); + + if (global->passes < 2) + warn("option %s ignored in one-pass mode.\n", arg.name); + } else if (arg_match(&arg, &minsection_pct, argi)) { + config->cfg.rc_2pass_vbr_minsection_pct = arg_parse_uint(&arg); + + if (global->passes < 2) + warn("option %s ignored in one-pass mode.\n", arg.name); + } else if (arg_match(&arg, &maxsection_pct, argi)) { + config->cfg.rc_2pass_vbr_maxsection_pct = arg_parse_uint(&arg); + + if (global->passes < 2) + warn("option %s ignored in one-pass mode.\n", arg.name); + } else if (arg_match(&arg, &kf_min_dist, argi)) + config->cfg.kf_min_dist = arg_parse_uint(&arg); + else if (arg_match(&arg, &kf_max_dist, argi)) { + config->cfg.kf_max_dist = arg_parse_uint(&arg); + config->have_kf_max_dist = 1; + } else if (arg_match(&arg, &kf_disabled, argi)) + config->cfg.kf_mode = VPX_KF_DISABLED; + else { + int i, match = 0; + + for (i = 0; ctrl_args[i]; i++) { + if (arg_match(&arg, ctrl_args[i], argi)) { + int j; + match = 1; + + /* Point either to the next free element or the first + * instance of this control. 
+ */ + for (j = 0; j < config->arg_ctrl_cnt; j++) + if (config->arg_ctrls[j][0] == ctrl_args_map[i]) + break; + + /* Update/insert */ + assert(j < ARG_CTRL_CNT_MAX); + if (j < ARG_CTRL_CNT_MAX) { + config->arg_ctrls[j][0] = ctrl_args_map[i]; + config->arg_ctrls[j][1] = arg_parse_enum_or_int(&arg); + if (j == config->arg_ctrl_cnt) + config->arg_ctrl_cnt++; + } - if (global->passes < 2) - warn("option %s ignored in one-pass mode.\n", arg.name); - } - else if (arg_match(&arg, &kf_min_dist, argi)) - config->cfg.kf_min_dist = arg_parse_uint(&arg); - else if (arg_match(&arg, &kf_max_dist, argi)) - { - config->cfg.kf_max_dist = arg_parse_uint(&arg); - config->have_kf_max_dist = 1; } - else if (arg_match(&arg, &kf_disabled, argi)) - config->cfg.kf_mode = VPX_KF_DISABLED; - else - { - int i, match = 0; - - for (i = 0; ctrl_args[i]; i++) - { - if (arg_match(&arg, ctrl_args[i], argi)) - { - int j; - match = 1; - - /* Point either to the next free element or the first - * instance of this control. - */ - for(j=0; jarg_ctrl_cnt; j++) - if(config->arg_ctrls[j][0] == ctrl_args_map[i]) - break; - - /* Update/insert */ - assert(j < ARG_CTRL_CNT_MAX); - if (j < ARG_CTRL_CNT_MAX) - { - config->arg_ctrls[j][0] = ctrl_args_map[i]; - config->arg_ctrls[j][1] = arg_parse_enum_or_int(&arg); - if(j == config->arg_ctrl_cnt) - config->arg_ctrl_cnt++; - } - - } - } + } - if (!match) - argj++; - } + if (!match) + argj++; } + } - return eos_mark_found; + return eos_mark_found; }
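
One structural change running through the vpxdec.c and vpxenc.c hunks above is worth spelling out: instead of linking against exported interface objects such as vpx_codec_vp8_dx_algo, both tools now store an interface *getter* function (vpx_codec_vp8_dx / vpx_codec_vp8_cx) in their codec tables and call it at lookup time, which is what lets one binary register VP8, VP9, or both depending on the configure flags. The following stand-alone sketch is illustrative only and is not part of the diff: fake_iface_t, fake_iface_vp8(), and iface_for_fourcc() are hypothetical stand-ins for the opaque vpx_codec_iface_t, its getter, and the probe loop; only the table shape, the VP8 fourcc constants, and the masked-fourcc comparison are taken from the ifaces[] code in vpxdec.c above.

#include <stdio.h>

/* Hypothetical stand-in for the opaque vpx_codec_iface_t. */
typedef struct { const char *desc; } fake_iface_t;

/* Post-merge style: the interface comes from a getter function
 * (cf. vpx_codec_vp8_dx()) rather than from a global ..._algo object. */
static const fake_iface_t *fake_iface_vp8(void) {
  static const fake_iface_t iface = { "VP8 decoder (stand-in)" };
  return &iface;
}

static const struct {
  const char *name;
  const fake_iface_t *(*iface)(void);  /* called lazily: ifaces[i].iface() */
  unsigned int fourcc;
  unsigned int fourcc_mask;
} ifaces[] = {
  { "vp8", fake_iface_vp8, 0x00385056 /* VP8_FOURCC */, 0x00FFFFFF },
};

/* Same probe as in vpxdec.c: mask the stream fourcc and return the first
 * matching interface, or NULL when no compiled-in codec matches. */
static const fake_iface_t *iface_for_fourcc(unsigned int fourcc) {
  size_t i;
  for (i = 0; i < sizeof(ifaces) / sizeof(ifaces[0]); i++)
    if ((fourcc & ifaces[i].fourcc_mask) == ifaces[i].fourcc)
      return ifaces[i].iface();
  return NULL;
}

int main(void) {
  /* 0x30385056 is the "VP80" fourcc written by the encoder; the
   * 0x00FFFFFF mask reduces it to VP8_FOURCC, so the vp8 row matches. */
  const fake_iface_t *iface = iface_for_fourcc(0x30385056);
  printf("%s\n", iface ? iface->desc : "unknown fourcc");
  return 0;
}

Routing every lookup through a function call also explains why this commit can leave VP9 temporarily exporting its symbols under the VP8 names: the table entry only records which getter to call and which fourcc to match, so swapping the underlying codec later does not change any of the selection logic shown here.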